In [0]:
from pyspark.sql.functions import col, split
from functools import reduce
import os

In [0]:
# Snapshot the names of everything currently sitting in the landing-zone
# "Updates" folder. The cells below test membership in this list before
# reading each update file.
# NOTE(review): the '/dbfs' prefix is presumably the local-filesystem view of
# the same '/mnt/...' location the Spark reads use — confirm for this workspace.
landing_updates_dir = '/dbfs/mnt/de-upskilling-weather/LandingZone/Updates/'
parquet_files = os.listdir(landing_updates_dir)

# Bare expression so the notebook renders the listing as the cell output.
parquet_files

['daily_historical_forecast_updates.parquet',
 'daily_historical_updates.parquet',
 'hourly_historical_forecast_updates.parquet',
 'hourly_historical_updates.parquet']

In [0]:
# Clean the daily historical-forecast update and stage it in the Silver layer.
# Runs only when the landing-zone listing held in `parquet_files` contains the
# file; otherwise `daily_hist_forecast` is left unbound and a notice is printed.
if 'daily_historical_forecast_updates.parquet' in parquet_files:
    daily_hist_forecast = spark.read.parquet('/mnt/de-upskilling-weather/LandingZone/Updates/daily_historical_forecast_updates.parquet/')

    # 'date' is handled as text: the part before the first space is kept
    # (removing the timestamp) and then broken on '-' into three pieces.
    # NOTE(review): assumes values look like 'YYYY-MM-DD hh:mm:ss' — confirm
    # against the landing-zone schema.
    forecast_date_parts = split(split(col('date'), ' ')[0], '-')

    daily_hist_forecast = (
        daily_hist_forecast
        # integer calendar fields; 'year' is also the partition column below
        .withColumn('year', forecast_date_parts[0].cast('int'))
        .withColumn('month', forecast_date_parts[1].cast('int'))
        .withColumn('day', forecast_date_parts[2].cast('int'))
        # columns not carried into the Silver output, including the raw
        # 'date' now that it has been decomposed
        .drop('timezone', 'date', 'weather_code')
        # dropping nulls in case any still exist; done after the drop so nulls
        # in the removed columns do not discard rows
        .dropna()
    )

    # writing to silver updates folder (replaces any previous output there)
    daily_hist_forecast.write.mode('overwrite').partitionBy('year').parquet('/mnt/de-upskilling-weather/Silver/Updates/daily_historical_forecast_update.parquet/')

    print('done')
else:
    # Report the file name that was actually looked for, matching the other
    # update cells in this notebook.
    print('daily_historical_forecast_updates.parquet not detected')


done


In [0]:
# Sanity check: print the column names and types of the cleaned daily
# forecast frame.
# NOTE: `daily_hist_forecast` is only assigned when the previous cell found
# its input file; if that cell took the "not detected" branch, this line
# raises NameError on a fresh kernel.
daily_hist_forecast.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- temperature_2m_max: double (nullable = true)
 |-- temperature_2m_min: double (nullable = true)
 |-- apparent_temperature_max: double (nullable = true)
 |-- apparent_temperature_min: double (nullable = true)
 |-- precipitation_sum: double (nullable = true)
 |-- rain_sum: double (nullable = true)
 |-- showers_sum: double (nullable = true)
 |-- snowfall_sum: double (nullable = true)
 |-- wind_speed_10m_max: double (nullable = true)
 |-- wind_gusts_10m_max: double (nullable = true)
 |-- wind_direction_10m_dominant: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [0]:
# Preview the cleaned daily forecast rows sorted by 'year'. Only 'year' is a
# sort key, so rows that share a year appear in no particular month/day order.
# `display` is not imported in this notebook — presumably the notebook
# environment's built-in table renderer.
display(daily_hist_forecast.orderBy('year'))

latitude,longitude,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,showers_sum,snowfall_sum,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,year,month,day
41.881832,-87.623177,22.649999618530277,17.5,24.9530029296875,17.606016159057617,0.0,0.0,0.0,0.0,14.241628646850586,29.15999984741211,79.35629272460938,2024,9,30
41.881832,-87.623177,11.850000381469728,2.0999999046325684,9.295605659484863,-1.7380342483520508,0.0,0.0,0.0,0.0,15.778515815734863,47.15999984741211,312.47222900390625,2024,10,16
48.137154,11.576124,14.432499885559082,7.182499885559082,13.616413116455078,5.941177368164063,0.0,0.0,0.0,0.0,9.08598804473877,20.15999984741211,15.5809907913208,2024,10,29
36.174465,-86.76796,25.37350082397461,15.273499488830566,24.81708335876465,13.819900512695312,0.0,0.0,0.0,0.0,21.252199172973636,42.83999633789063,15.214619636535645,2024,9,7
41.881832,-87.623177,17.450000762939453,4.25,14.411109924316406,1.5293774604797363,0.0,0.0,0.0,0.0,13.783817291259766,36.0,186.08840942382807,2024,10,24
52.520008,13.404954,12.195499420166016,10.24549961090088,9.51370620727539,7.323617935180664,0.0,0.0,0.0,0.0,18.72345924377441,39.23999786376953,264.41748046875,2024,10,31
25.695215,-80.168933,26.974000930786133,25.424001693725582,30.443866729736328,25.877422332763672,0.0,0.0,0.0,0.0,38.59226608276367,44.27999877929688,19.674787521362305,2024,10,23
25.695215,-80.168933,27.02400016784668,22.874000549316406,30.82868194580078,25.34204864501953,0.0,0.0,0.0,0.0,12.979984283447266,16.19999885559082,331.5571594238281,2024,11,29
35.481918,-97.508469,12.96299934387207,-2.986999988555908,10.68447208404541,-6.585965633392334,0.0,0.0,0.0,0.0,9.359999656677246,18.71999931335449,19.57324981689453,2024,12,1
38.855584,-77.036975,26.886999130249023,14.687000274658203,27.41830825805664,13.854755401611328,0.0,0.0,0.0,0.0,11.304228782653809,21.959999084472656,4.856514930725098,2024,10,5


In [0]:
# Clean the hourly historical-forecast update and stage it in the Silver layer.
if 'hourly_historical_forecast_updates.parquet' not in parquet_files:
    print('hourly_historical_forecast_updates.parquet not detected')
else:
    hourly_hist_forecast = spark.read.parquet('/mnt/de-upskilling-weather/LandingZone/Updates/hourly_historical_forecast_updates.parquet/')

    # 'date' is handled as text: the piece before the first space is the
    # calendar date, the piece after it becomes the new 'time' column.
    hourly_forecast_stamp = split(col('date'), ' ')
    # calendar date broken on '-' into three pieces
    hourly_forecast_ymd = split(hourly_forecast_stamp[0], '-')

    hourly_hist_forecast = (
        hourly_hist_forecast
        .withColumn('time', hourly_forecast_stamp[1])
        # integer calendar fields; 'year' and 'month' are the partition columns
        .withColumn('year', hourly_forecast_ymd[0].cast('int'))
        .withColumn('month', hourly_forecast_ymd[1].cast('int'))
        .withColumn('day', hourly_forecast_ymd[2].cast('int'))
        # columns not carried into the Silver output
        .drop('date', 'timezone', 'wind_direction_180m', 'wind_speed_180m', 'weather_code', 'snow_depth')
    )
    # NOTE(review): unlike the other three update cells, no dropna() is applied
    # here — confirm that keeping rows with nulls is intended.

    # writing to silver updates folder (replaces any previous output there)
    hourly_forecast_writer = hourly_hist_forecast.write.mode('overwrite').partitionBy('year', 'month')
    hourly_forecast_writer.parquet('/mnt/de-upskilling-weather/Silver/Updates/hourly_historical_forecast_update.parquet/')

    print('done')

done


In [0]:
# Clean the daily historical update and stage it in the Silver layer.
if 'daily_historical_updates.parquet' in parquet_files:
    daily_hist = spark.read.parquet('/mnt/de-upskilling-weather/LandingZone/Updates/daily_historical_updates.parquet/')

    # 'date' is handled as text: keep the part before the first space
    # (removing the timestamp), then break it on '-' into three pieces.
    daily_hist_ymd = split(split(col('date'), ' ')[0], '-')

    daily_hist = (
        daily_hist
        # nulls are removed up front, while every original column is still present
        .dropna()
        # integer calendar fields; 'year' is also the partition column below
        .withColumn('year', daily_hist_ymd[0].cast('int'))
        .withColumn('month', daily_hist_ymd[1].cast('int'))
        .withColumn('day', daily_hist_ymd[2].cast('int'))
        # the raw 'date' is no longer needed once decomposed
        .drop('timezone', 'date')
    )

    # overwrite any previous Silver output for this update, one folder per year
    daily_hist_writer = daily_hist.write.mode('overwrite').partitionBy('year')
    daily_hist_writer.parquet('/mnt/de-upskilling-weather/Silver/Updates/daily_historical_update.parquet/')

    print('done')
else:
    print('daily_historical_updates.parquet not detected')

done


In [0]:
# Clean the hourly historical update and stage it in the Silver layer.
if 'hourly_historical_updates.parquet' not in parquet_files:
    print('hourly_historical_updates.parquet not detected')
else:
    hourly_hist = spark.read.parquet('/mnt/de-upskilling-weather/LandingZone/Updates/hourly_historical_updates.parquet/')

    # 'date' is handled as text: the piece before the first space is the
    # calendar date, the piece after it becomes the new 'time' column.
    hourly_hist_stamp = split(col('date'), ' ')
    # calendar date broken on '-' into three pieces
    hourly_hist_ymd = split(hourly_hist_stamp[0], '-')

    hourly_hist = (
        hourly_hist
        # these two columns are removed before the first null filter, so nulls
        # in them do not discard rows
        .drop('snow_depth', 'weather_code')
        .dropna()
        .withColumn('time', hourly_hist_stamp[1])
        # integer calendar fields; 'year' and 'month' are the partition columns
        .withColumn('year', hourly_hist_ymd[0].cast('int'))
        .withColumn('month', hourly_hist_ymd[1].cast('int'))
        .withColumn('day', hourly_hist_ymd[2].cast('int'))
        # dropping unnecessary columns
        .drop('date', 'timezone')
        # second filter also covers the derived time/year/month/day columns
        .dropna()
    )

    # overwrite any previous Silver output for this update
    hourly_hist_writer = hourly_hist.write.mode('overwrite').partitionBy('year', 'month')
    hourly_hist_writer.parquet('/mnt/de-upskilling-weather/Silver/Updates/hourly_historical_update.parquet/')

    print('done')

done
