In [0]:
bronze_base = "/Volumes/aqi_etl_pipeline/bronze_aqi/bronze_vol_aqi/aqi/"

# bronze_base ends with "/", so strip it once to avoid "//" in the joined paths.
bronze_root = bronze_base.rstrip("/")

FILE_PREFIX = "weather_"
FILE_SUFFIX = ".json"


def _latest_numeric_child(parent_path, label):
    """Return the name of the numerically largest all-digit entry under ``parent_path``.

    The entry's own name (trailing "/" removed) is returned rather than a
    re-formatted number, so the caller always builds a path that really exists
    regardless of how the folder was zero-padded.

    Args:
        parent_path: Directory to list with ``dbutils.fs.ls``.
        label: Human-readable partition name ("year", "month", ...) used only
            in the error message.

    Returns:
        The matching entry name as a string.

    Raises:
        ValueError: If ``parent_path`` contains no all-digit entries.
    """
    names = [entry.name.rstrip("/") for entry in dbutils.fs.ls(parent_path)]
    # isdecimal() only accepts characters that int() can parse.
    numeric_names = [name for name in names if name.isdecimal()]
    if not numeric_names:
        raise ValueError(f"No {label} folders found under {parent_path}")
    return max(numeric_names, key=int)


# ==========================================================
# STEP 2 — Find latest YEAR
# ==========================================================
latest_year = _latest_numeric_child(bronze_root, "year")
year_path = f"{bronze_root}/{latest_year}"

print("Latest Year:", latest_year)


# ==========================================================
# STEP 3 — Find latest MONTH
# ==========================================================
latest_month = _latest_numeric_child(year_path, "month")
month_path = f"{year_path}/{latest_month}"

print("Latest Month:", latest_month)


# ==========================================================
# STEP 4 — Find latest DAY
# ==========================================================
latest_day = _latest_numeric_child(month_path, "day")
day_path = f"{month_path}/{latest_day}"

print("Latest Day:", latest_day)


# ==========================================================
# STEP 5 — Find latest HOUR
# ==========================================================
latest_hour = _latest_numeric_child(day_path, "hour")
hour_path = f"{day_path}/{latest_hour}"

print("Latest Hour:", latest_hour)


# ==========================================================
# STEP 6 — Find latest FILE inside that hour folder
# ==========================================================
# File names look like weather_<epoch>.json; map each epoch to its full path.
file_map = {}  # epoch → full path

for f in dbutils.fs.ls(hour_path):
    name = f.name  # example: weather_1763967555.json
    if not (name.startswith(FILE_PREFIX) and name.endswith(FILE_SUFFIX)):
        continue
    epoch_text = name[len(FILE_PREFIX):-len(FILE_SUFFIX)]
    # Skip files such as weather_backup.json instead of crashing on int().
    if not epoch_text.isdecimal():
        continue
    file_map[int(epoch_text)] = f.path

if not file_map:
    raise ValueError(
        f"No {FILE_PREFIX}<epoch>{FILE_SUFFIX} files found under {hour_path}"
    )

latest_epoch = max(file_map)
latest_file_path = file_map[latest_epoch]

print("Latest File:", latest_file_path)

In [0]:
# Load the newest bronze JSON file (path resolved in the previous cell) and
# let Spark infer its schema.
bronze_reader = spark.read.format("json")
df_bronze = bronze_reader.load(latest_file_path)

# Show the inferred structure so the nested fields selected later can be checked.
print("Bronze schema:")
df_bronze.printSchema()

In [0]:
from pyspark.sql import functions as F

# Flatten the nested bronze payload into top-level columns.
# data.city.geo is an array; elements 0 and 1 are exposed as latitude/longitude.
geo_coords = F.col("data.city.geo")

flat_columns = [
    F.col("data.aqi").alias("AQI"),
    F.col("data.city.name").alias("City"),
    geo_coords.getItem(0).alias("latitude"),
    geo_coords.getItem(1).alias("longitude"),
    F.col("data.time.s").alias("timestamp"),
    F.col("ingested_at").alias("ingested_at"),
]

df_flat = df_bronze.select(*flat_columns)

display(df_flat)

In [0]:
from pyspark.sql import functions as F

# Build the silver frame one derived column at a time. Column order matches
# the order in which the columns are added below.

# 1) Parse the "timestamp" string; the result is treated as IST wall-clock time.
with_aqi_ist = df_flat.withColumn(
    "aqi_timestamp_ist",
    F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd HH:mm:ss"),
)

# 2) IST → UTC by subtracting the fixed +5:30 offset.
with_aqi_utc = with_aqi_ist.withColumn(
    "aqi_timestamp_utc",
    F.expr("aqi_timestamp_ist - INTERVAL 5 HOURS 30 MINUTES"),
)

# 3) Parse ingested_at into a timestamp.
#    NOTE(review): assumed to be a UTC string, as the column name implies — confirm.
with_ingested_utc = with_aqi_utc.withColumn(
    "ingested_at_utc",
    F.to_timestamp(F.col("ingested_at")),
)

# 4) IST rendering of the ingestion time (adds the same fixed +5:30 offset).
with_ingested_ist = with_ingested_utc.withColumn(
    "ingested_at_ist",
    F.expr("ingested_at_utc + INTERVAL 5 HOURS 30 MINUTES"),
)

# 5) Partition-friendly date and hour, both derived from the IST reading time.
with_aqi_date = with_ingested_ist.withColumn(
    "aqi_date", F.to_date(F.col("aqi_timestamp_ist"))
)
df_silver = with_aqi_date.withColumn(
    "aqi_hour", F.hour(F.col("aqi_timestamp_ist"))
)

display(df_silver)


In [0]:
# Final check of the silver frame before it is written: column types, then rows.
df_silver.printSchema()
display(df_silver)

In [0]:
from pyspark.sql.utils import AnalysisException

silver_table = "aqi_etl_pipeline.silver.silver_aqi"

# Ask the catalog whether the table exists instead of inferring it from any
# AnalysisException raised by spark.table(): that exception can have other
# causes, and a false "missing" verdict must never lead to a destructive write.
table_exists = spark.catalog.tableExists(silver_table)

# Always append. Append mode creates the Delta table when it does not exist
# yet, so an existing table can never be overwritten by this cell.
# NOTE(review): nothing here de-duplicates, so re-running the notebook against
# the same bronze file appends the same rows again — confirm whether a
# MERGE/dedupe step is expected downstream.
(
    df_silver.write
    .format("delta")
    .mode("append")
    .saveAsTable(silver_table)
)

if table_exists:
    print(f"✔ Appended new AQI data to: {silver_table}")
else:
    print(f"✔ Created Silver table: {silver_table}")

print("Done! Silver layer updated.")
