In [0]:
# Mounted container and folder that hold the Silver-layer update files.
container_name = "de-upskilling-weather"
folder_name = "Silver/Updates"

# List every entry under the mounted Silver updates folder.
file_names = dbutils.fs.ls(f"/mnt/{container_name}/{folder_name}")

# Collect the entry names, dropping the slash that follows ".parquet".
parquet_files = []
for entry in file_names:
    parquet_files.append(entry.name.replace(".parquet/", ".parquet"))

# Show the resulting names, one per line.
for file_name in parquet_files:
    print(file_name)

daily_historical_forecast_update.parquet
daily_historical_update.parquet
hourly_historical_forecast_update.parquet
hourly_historical_update.parquet


In [0]:
# Keep only the file names that mention "daily".
daily_files = list(filter(lambda name: "daily" in name, parquet_files))

# Echo the selection under a heading, one name per line.
print("Parquet file names containing 'daily':")
for file_name in daily_files:
    print(file_name)

Parquet file names containing 'daily':
daily_historical_forecast_update.parquet
daily_historical_update.parquet


In [0]:
# Load the cities dimension table from the Silver layer.
cities_dim_path = f"/mnt/{container_name}/Silver/cities_dim.parquet"
df_cities = spark.read.parquet(cities_dim_path)

# Narrow projection of the cities table: names plus the coordinate pair.
daily_city_columns = ["city", "country", "latitude", "longitude"]
df_cities_daily = df_cities.select(*daily_city_columns)

In [0]:
print("Parquet file names containing 'daily':")

# Enrich every daily update file with city attributes and publish it to Gold.
for file_name in daily_files:

    # Read the Silver-layer update for this file.
    df_daily = spark.read.parquet(f"/mnt/{container_name}/{folder_name}/{file_name}")

    # Join on the coordinate pair; with the default join type, rows whose
    # coordinates have no match in the cities table are not kept.
    df_daily = df_daily.join(df_cities_daily, on=['latitude', 'longitude'])

    # printSchema() writes the schema to stdout itself and returns None, so it
    # must not be wrapped in print() (doing so emitted a stray "None" line).
    df_daily.printSchema()
    print(df_daily.count())

    # Overwrite the Gold copy of this file, partitioned by city and year.
    df_daily.write.mode("overwrite").partitionBy("city", "year").parquet(f"/mnt/{container_name}/Gold/Updates/{file_name}")

Parquet file names containing 'daily':
root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- temperature_2m_max: double (nullable = true)
 |-- temperature_2m_min: double (nullable = true)
 |-- apparent_temperature_max: double (nullable = true)
 |-- apparent_temperature_min: double (nullable = true)
 |-- precipitation_sum: double (nullable = true)
 |-- rain_sum: double (nullable = true)
 |-- showers_sum: double (nullable = true)
 |-- snowfall_sum: double (nullable = true)
 |-- wind_speed_10m_max: double (nullable = true)
 |-- wind_gusts_10m_max: double (nullable = true)
 |-- wind_direction_10m_dominant: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)

None
3225
root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- temperature_2m_max: double (nullable = t

In [0]:
# Pick out the file names that mention "hourly" and display the list.
hourly_files = list(filter(lambda name: "hourly" in name, parquet_files))
hourly_files

['hourly_historical_forecast_update.parquet',
 'hourly_historical_update.parquet']

In [0]:
# Write the hourly DataFrames to the Gold layer.

In [0]:
from pyspark.sql.functions import col, from_unixtime, expr, split
from pyspark.sql import functions as F


# Enrich every hourly update file with city attributes, derive a local
# timestamp from the city's UTC offset, and publish the result to Gold.
for file_name in hourly_files:

    # Read the Silver-layer hourly update for this file.
    df_hourly = spark.read.parquet(f"/mnt/{container_name}/{folder_name}/{file_name}")

    # Attach all city columns (including time_zone) via the coordinate pair.
    df_hourly = df_hourly.join(df_cities, on=['latitude', 'longitude'])

    # Combine year, month, day, and time into a single datetime string
    # ("yyyy-MM-dd <time>"), zero-padding month and day to two digits.
    df_with_datetime = df_hourly.withColumn(
        "datetime",
        F.concat_ws(
            " ",
            F.concat_ws("-", F.col("year"), F.lpad(F.col("month"), 2, "0"), F.lpad(F.col("day"), 2, "0")),
            F.col("time")
        )
    )

    # Convert the datetime string to a UNIX timestamp (seconds).
    # NOTE(review): the string is parsed in the Spark session time zone; the
    # "UTC to local" step below presumes that zone is UTC - confirm.
    df_with_unix_time = df_with_datetime.withColumn(
        "Unix_Time",
        F.unix_timestamp(F.col("datetime"), "yyyy-MM-dd HH:mm:ss")
    )

    # Split the "<hours>:<minutes>" offset into its two parts.
    tz_parts = split(col("time_zone"), ":")

    # Take the direction from the leading '-' rather than from sign(hours):
    # sign() is 0 when the hour part is "00", which silently zeroed the minutes
    # for offsets such as "+00:30" or "-00:30".
    tz_direction = F.when(F.trim(col("time_zone")).startswith("-"), -1).otherwise(1)

    # Magnitude in seconds, with the direction applied to hours and minutes
    # together. Cast to double so the written column keeps the type it had
    # when the minutes term was multiplied by sign().
    local_df = df_with_unix_time.withColumn(
        "offset_seconds",
        (
            tz_direction
            * (F.abs(tz_parts[0].cast("int")) * 3600 + tz_parts[1].cast("int") * 60)
        ).cast("double")
    )

    # Convert UTC time to local time by shifting the epoch seconds by the offset.
    local_df = local_df.withColumn(
        "Local_Time",
        from_unixtime(col("Unix_Time") + col("offset_seconds"))
    )

    # Preview a few rows before writing.
    local_df.show(5)

    # Overwrite the Gold copy of this file, partitioned by city and year.
    local_df.write.mode("overwrite").partitionBy("city", "year").parquet(f"/mnt/{container_name}/Gold/Updates/{file_name}")

+---------+---------+-----------------+--------------------+-------------+----+-------+--------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+--------+---+----+-----+----------+-------+---------+-------------------+----------+--------------+-------------------+
| latitude|longitude|   temperature_2m|relative_humidity_2m|precipitation|rain|showers|snowfall|    wind_speed_10m|    wind_speed_80m|   wind_speed_120m|wind_direction_10m|wind_direction_80m|wind_direction_120m|    wind_gusts_10m|    time|day|year|month|      city|country|time_zone|           datetime| Unix_Time|offset_seconds|         Local_Time|
+---------+---------+-----------------+--------------------+-------------+----+-------+--------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+--------+---+----+-----+----------+-------+---------+-----------------