In [0]:
# Expose a notebook text widget for the OpenAQ API key (empty by default)
# and read whatever value is currently set on it.
API_KEY_WIDGET = "api_key"
dbutils.widgets.text(API_KEY_WIDGET, "")
x_api_key = dbutils.widgets.get(API_KEY_WIDGET)

In [0]:
%sql
-- Create the `air_quality` catalog unless it already exists (safe to re-run).
CREATE CATALOG IF NOT EXISTS air_quality

In [0]:
%sql
-- Create the `openaq` schema inside the `air_quality` catalog unless it already exists.
CREATE SCHEMA IF NOT EXISTS air_quality.openaq

In [0]:
%sql
-- Create the `ingestion` volume unless it already exists; a later cell writes the
-- raw JSON payload under /Volumes/air_quality/openaq/ingestion/.
CREATE VOLUME IF NOT EXISTS air_quality.openaq.ingestion

In [0]:
%sql
-- Bookkeeping table for ingestion runs, stored as a Delta table.
-- Created only if it does not already exist.
CREATE TABLE IF NOT EXISTS air_quality.openaq.ingestion_meta_data(
  source STRING,                      -- label of the data source (this notebook uses 'openaq')
  last_processed_timestamp TIMESTAMP  -- presumably the incremental-load watermark for that source; TODO confirm against the consumer
)
USING DELTA;

In [0]:
%sql
-- Seed the starting timestamp for the 'openaq' source exactly once.
-- A plain INSERT appends another row every time this cell is re-run; MERGE only
-- inserts when no row for the source exists, and leaves an existing row untouched.
MERGE INTO air_quality.openaq.ingestion_meta_data AS target
USING (
  SELECT
    'openaq' AS source,
    TIMESTAMP('2025-12-22 00:00:00') AS last_processed_timestamp
) AS seed
ON target.source = seed.source
WHEN NOT MATCHED THEN
  INSERT (source, last_processed_timestamp)
  VALUES (seed.source, seed.last_processed_timestamp)


In [0]:
# Read back the bookkeeping rows recorded for the 'openaq' source and print them.
metadata_table = spark.table("air_quality.openaq.ingestion_meta_data")
metadata_df = metadata_table.where("source = 'openaq'")
metadata_df.show()


Incremental Data Load Using the REST API

In [0]:
%python
# Install geopy, which the reverse-geocoding cell further down imports.
# NOTE(review): this is a bare `pip install` line rather than the `%pip` magic;
# it presumably relies on the kernel's automagic handling — confirm it behaves
# the same as `%pip install geopy` on the target runtime, and consider pinning a version.
pip install geopy


In [0]:
import requests, json
from datetime import datetime
from pyspark.sql.functions import *

In [0]:
# Fetch the OpenAQ v3 locations listing, authenticating with the key read from the widget.
headers = {
    "X-API-Key" : x_api_key
}

url = "https://api.openaq.org/v3/locations"

# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on a 4xx/5xx answer (e.g. a missing or invalid API key) instead of
# passing an error body on to the cells below, where it would look like "no results".
response.raise_for_status()

# NOTE(review): no paging or filter parameters are sent, so only what the API returns
# for a single default request is loaded — confirm whether paging is needed.
locations_data = response.json()


In [0]:
# Number of location records in the response. `locations_data` is the response
# dict, so len() on it directly would only count its top-level keys.
len(locations_data.get("results", []))


In [0]:
# Keep only the location records whose country name is "India".
locations = locations_data.get("results", [])

# A record may lack `country` or carry it as null; `or {}` makes such a record
# simply not match instead of raising TypeError on the subscript.
filtered_locations = [
    loc for loc in locations
    if (loc.get("country") or {}).get("name") == "India"
]
filtered_locations

In [0]:
# Wrap the filtered records in an envelope that records the source name and
# the UTC time of ingestion (ISO-8601 string).
from datetime import datetime, timezone

current_ts = datetime.now(timezone.utc).isoformat()
raw_payload = dict(
    source="openaq",
    ingestiontime=current_ts,
    results=filtered_locations,
)

display(raw_payload)

In [0]:
# Persist the raw payload as a JSON file in the ingestion volume.

current_ts = datetime.now(timezone.utc)

# Folder is partitioned by UTC year/month/day/hour of this run.
output_path = (
    "/Volumes/air_quality/openaq/ingestion/locations/"
    + current_ts.strftime('%Y/%m/%d/%H')
)

# File name carries the run time down to microseconds.
file_name = "locations_" + current_ts.strftime('%Y%m%d%H%M%S%f') + ".json"

target_file = f"{output_path}/{file_name}"
dbutils.fs.put(target_file, json.dumps(raw_payload), overwrite = False)

In [0]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderServiceError
from geopy.extra.rate_limiter import RateLimiter

# Nominatim geocoder client; the user agent string is what this notebook sends to the service.
geolocator = Nominatim(user_agent="my_geopy_app")

# Rate-limited wrapper around reverse geocoding.
rate_limit_options = {
    "min_delay_seconds": 1,   # REQUIRED by Nominatim
    "max_retries": 2,
}
reverse = RateLimiter(geolocator.reverse, **rate_limit_options)

def city(lat,long):
    """Reverse-geocode a coordinate pair and return its city name.

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.

    Returns:
        The value stored under "city" in the address of the reverse-geocoding
        result, or None when there is no result, the result has no "address"
        entry, or the address has no "city" key.
    """
    result = reverse((lat, long), zoom= 10 , language = "en")

    # Nothing usable came back: no result, or a result without address details.
    if not result or "address" not in result.raw:
        return None

    # NOTE(review): only the "city" key is read; places reported under another
    # address key would yield None here — confirm that is intended.
    return result.raw["address"].get("city")

# Attach a "city" field to every filtered location by reverse-geocoding its coordinates.
# Lookups are memoised per coordinate pair (rounded to 5 decimals) so that records
# sharing a position trigger only one geocoder call.
cache = {}
for record in filtered_locations:
    # `coordinates` may be missing/null, or present with a missing/null latitude or
    # longitude; none of these can be geocoded, so the record gets city=None
    # instead of raising KeyError/TypeError on the lookup or on round().
    coords = record.get("coordinates") or {}
    latitude = coords.get("latitude")
    longitude = coords.get("longitude")
    if latitude is None or longitude is None:
        record["city"] = None
        continue

    key = (round(latitude, 5), round(longitude, 5))

    if key not in cache:
        cache[key] = city(latitude, longitude)

    record["city"] = cache[key]

    display(record)
