In [0]:
# 1. Define the stages of the Medallion Architecture data pipeline
tiers = ["bronze", "silver", "gold"]

# 2. Build the storage URL for each tier.
# The storage account name lives in one constant so it can be changed in a single place.
# Each path keeps its trailing "/" so code that appends a file name directly keeps working.
STORAGE_ACCOUNT = "earthquakestorageaccount"
adls_paths = {tier: f"abfss://{tier}@{STORAGE_ACCOUNT}.dfs.core.windows.net/" for tier in tiers}

# 3. Pull the specific URLs out of the dictionary into easy-to-use variables.
bronze_adls = adls_paths["bronze"]
silver_adls = adls_paths["silver"]
gold_adls = adls_paths["gold"]

# 4. Use Databricks Utilities (dbutils) to list the files in each container.
# A notebook cell only renders its LAST bare expression, so each listing is displayed
# explicitly; otherwise only the final (gold) listing would be visible.
# dbutils.fs.ls raises if a container cannot be reached, so this also acts as a connectivity check.
for tier, path in adls_paths.items():
    print(f"{tier}: {path}")
    display(dbutils.fs.ls(path))

In [0]:
# 1. 'requests' is used to make HTTP calls. 
# It's essentially the "web browser" for your code so it can talk to the Earthquake API.
import requests
# 2. 'json' converts between Python objects and JSON text.
# In this notebook it is used to serialize the fetched earthquake events (json.dumps) before they are written to storage.
import json
# 3. 'datetime' allows us to work with dates and times.
# 'date' handles specific days, and 'timedelta' allows us to do "date math" (like Today - 1 day).
from datetime import date, timedelta

In [0]:
# Capture "today" exactly once so both ends of the window come from the same instant.
# Calling date.today() twice could straddle midnight and produce a two-day window.
# NOTE(review): date.today() uses the cluster's local timezone, while the USGS API is
# understood to interpret times as UTC — confirm the cluster runs in UTC.
today = date.today()

# Query window: yesterday (start) up to today (end).
start_date = today - timedelta(days=1)
end_date = today
start_date, end_date

In [0]:
# 1. Build the specific "Order Form" for the USGS website.
# We use an f-string to plug our dates and the 'geojson' format into the address.
url = f"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={start_date}&endtime={end_date}"

# Upper bound (in seconds) on how long we wait for the API.
# Without a timeout, requests can block forever and hang the whole job.
REQUEST_TIMEOUT_SECONDS = 30

try:
    # 2. Go to the website and grab the data (the GET request).
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

    # 3. Safety Check: raise an HTTPError for 4xx/5xx responses (e.g., 404 Not Found).
    response.raise_for_status()

    # 4. Extract the 'features'.
    # '.get('features', [])' grabs only the list of earthquake events and falls back
    # to an empty list when the key is missing.
    data = response.json().get('features', [])

    if not data:
        # If there were zero earthquakes in the window, we don't want to save an empty file.
        print("No data returned for the specified date range.")
    else:
        # 5. Define the destination.
        # The file is named by date so each day's run writes its own file.
        # bronze_adls already ends with "/", so strip it before joining to avoid "//" in the path.
        file_path = f"{bronze_adls.rstrip('/')}/{start_date}_earthquake_data.json"

        # 6. Make it "Human Readable".
        # 'indent=4' adds spaces and line breaks so the file isn't one long line.
        json_data = json.dumps(data, indent=4)

        # 7. Write to the Cloud.
        # overwrite=True replaces an existing file for the same date, so re-runs are idempotent.
        dbutils.fs.put(file_path, json_data, overwrite=True)
        print(f"Data successfully saved to {file_path}")

except requests.exceptions.RequestException as e:
    # 8. Error Handling: connection problems, timeouts and HTTP error statuses land here.
    # Log the error, then re-raise so the run fails visibly instead of continuing
    # with 'data' undefined (or stale from an earlier run).
    print(f"Error fetching data from API: {e}")
    raise

In [0]:
# Preview the first earthquake event.
# When the API returned no events the list is empty; show nothing instead of raising IndexError.
data[0] if data else None

In [0]:
# 1. Bundle all our important variables into a single "packet" (a dictionary).
# Dates are converted to ISO strings (YYYY-MM-DD) so every value is a plain string.
output_data = {
    "start_date": start_date.isoformat(),
    "end_date": end_date.isoformat(),
    "bronze_adls": bronze_adls,
    "silver_adls": silver_adls,
    "gold_adls": gold_adls
}

# 2. Use the Databricks "Task Values" utility to broadcast this information.
# This saves 'output_data' under the key "bronze_output" so that
# subsequent tasks in the Databricks Job can read and use these values.
# (An earlier draft handed the payload back with
# dbutils.notebook.exit(json.dumps(output_data)) instead; task values replace that approach.)
dbutils.jobs.taskValues.set(key="bronze_output", value=output_data)