In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas
# The two commands above install the Open-Meteo client together with the
# caching/retry helpers and numpy/pandas into the notebook's environment.
# NOTE(review): no versions are pinned, so a later run may resolve different
# releases than the ones this notebook was developed against - consider pinning.

Collecting openmeteo-requests
  Obtaining dependency information for openmeteo-requests from https://files.pythonhosted.org/packages/13/31/96209383687cf35055eb628e3a9207a07ac2d5faf6e70076f459435a989e/openmeteo_requests-1.3.0-py3-none-any.whl.metadata
  Downloading openmeteo_requests-1.3.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Obtaining dependency information for openmeteo-sdk>=1.4.0 from https://files.pythonhosted.org/packages/18/9a/f33c4eb783d505d0099c039bbac30da09266027d9e3e0b5de76ef796749d/openmeteo_sdk-1.18.0-py3-none-any.whl.metadata
  Downloading openmeteo_sdk-1.18.0-py3-none-any.whl.metadata (934 bytes)
Collecting flatbuffers>=24.0.0 (from openmeteo-sdk>=1.4.0->openmeteo-requests)
  Obtaining dependency information for flatbuffers>=24.0.0 from https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl.metadata
  Downloading flatbuffers

In [0]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

In [0]:
# Setup the Open-Meteo API client with cache and retry on error.
# Cached responses expire after 3600 seconds; failed requests are retried
# up to 5 times with a backoff factor of 0.2.
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

url = "https://api.open-meteo.com/v1/forecast"

# Parallel lists: element i of each list describes the same location.
# Later cells pair them with the API responses purely by position.
latitudes = [38.855584, 40.724039, 25.695215, 34.005712, 32.724548, 47.594472, 39.676938, 39.952583, 39.791, 35.481918, 36.174465, 41.881832, 51.509865, 52.520008, 48.137154, 48.864716, 47.373878, 51.260197, 52.377956, 53.35014, 40.416775, 48.210033, 46.056946, 50.073658, 55.676098]
longitudes = [-77.036975, -73.994982, -80.168933, -118.175596, -96.76923, -122.348286, -104.977053, -75.165222, -86.148003, -97.508469, -86.76796, -87.623177, -0.118092, 13.404954, 11.576124, 2.349014, 8.545094, 4.402771, 4.89707, -6.266155, -3.70379, 16.363449, 14.505751, 14.41854, 12.568337]
timezones = ["America/New_York", "America/New_York", "America/New_York", "America/Los_Angeles", "America/Chicago", "America/Los_Angeles", "America/Denver", "America/New_York", "America/Indiana/Indianapolis", "America/Chicago", "America/Chicago", "America/Chicago", "Europe/London", "Europe/Berlin", "Europe/Berlin", "Europe/Paris", "Europe/Zurich", "Europe/Brussels", "Europe/Amsterdam", "Europe/Dublin", "Europe/Madrid", "Europe/Vienna", "Europe/Ljubljana", "Europe/Prague", "Europe/Copenhagen"]

# Fail fast if the parallel lists drift apart: a length mismatch would
# otherwise attach the wrong coordinates/timezone to a forecast without error.
if not (len(latitudes) == len(longitudes) == len(timezones)):
    raise ValueError(
        "latitudes, longitudes and timezones must have the same length, got "
        f"{len(latitudes)}, {len(longitudes)} and {len(timezones)}"
    )

# Make sure all required weather variables are listed here.
# The order of variables in hourly or daily is important to assign them correctly below.
# Note: the request itself uses GMT for every location; the `timezones` list is
# not sent to the API here.
params = {
    "latitude": latitudes,
    "longitude": longitudes,
    "daily": ["weather_code", "temperature_2m_max", "temperature_2m_min", "apparent_temperature_max", "apparent_temperature_min", "precipitation_sum", "rain_sum", "showers_sum", "snowfall_sum", "wind_speed_10m_max", "wind_gusts_10m_max", "wind_direction_10m_dominant"],
    "timezone": 'GMT'
}
responses = openmeteo.weather_api(url, params=params)

# Downstream cells label response i with latitudes[i]/longitudes[i]/timezones[i],
# so the API must have returned exactly one response per requested location.
if len(responses) != len(latitudes):
    raise ValueError(
        f"expected {len(latitudes)} responses (one per location), got {len(responses)}"
    )


In [0]:
# Build one pandas frame per location, then convert to Spark exactly once.
# Converting and unioning inside the loop would run one Spark conversion per
# location and grow a long chain of unions for what is a small dataset.

# The daily variables come back in the order they were requested, so reuse the
# request's own list instead of maintaining a second hand-numbered copy.
daily_variable_names = params["daily"]
daily_frames = []

for i, response in enumerate(responses):
    # Process daily data. The order of variables needs to be the same as requested.
    daily = response.Daily()

    daily_data = {
        # One timestamp per forecast day: [Time, TimeEnd) stepped by Interval seconds.
        "date": pd.date_range(
            start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
            end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
            freq = pd.Timedelta(seconds = daily.Interval()),
            inclusive = "left"
        ),
        # Scalars are broadcast to every row; response i is labelled with the
        # i-th requested location.
        "latitude": latitudes[i],
        "longitude": longitudes[i],
        "timezone": timezones[i],
    }
    for position, variable_name in enumerate(daily_variable_names):
        daily_data[variable_name] = daily.Variables(position).ValuesAsNumpy()

    daily_dataframe = pd.DataFrame(data = daily_data)
    daily_frames.append(daily_dataframe)

# Without this guard an empty response list would leave `spark_df` undefined,
# or silently keep a stale `spark_df` from an earlier run of the notebook.
if not daily_frames:
    raise ValueError("No Open-Meteo responses to process; cannot build spark_df")

# Stack the per-location frames in response order and convert to a Spark DataFrame.
spark_df = spark.createDataFrame(pd.concat(daily_frames, ignore_index = True))


In [0]:
# Render spark_df as a table in the notebook for a visual check of the rows.
spark_df.display()  # Displays the Spark DataFrame

date,latitude,longitude,timezone,weather_code,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,showers_sum,snowfall_sum,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant
2024-12-19T00:00:00Z,38.855584,-77.036975,America/New_York,53.0,11.737000465393066,4.586999893188477,10.008292198181152,1.4046697616577148,1.2000000476837158,1.2000000476837158,0.0,0.0,17.057313919067383,49.68000030517578,324.9458923339844
2024-12-20T00:00:00Z,38.855584,-77.036975,America/New_York,53.0,4.787000179290772,-2.263000011444092,1.4157962799072266,-5.935993194580078,2.299999952316284,2.299999952316284,0.0,0.0,11.019763946533203,28.07999992370605,4.476046085357666
2024-12-21T00:00:00Z,38.855584,-77.036975,America/New_York,3.0,3.275000095367432,-0.5130000114440918,0.3316249847412109,-6.474255084991455,0.0,0.0,0.0,0.0,28.116899490356445,53.27999877929688,316.7257080078125
2024-12-22T00:00:00Z,38.855584,-77.036975,America/New_York,1.0,0.625,-3.725000143051148,-5.082649230957031,-9.19597339630127,0.0,0.0,0.0,0.0,16.17998504638672,39.59999847412109,339.9588623046875
2024-12-23T00:00:00Z,38.855584,-77.036975,America/New_York,0.0,3.325000047683716,-2.674999952316284,-1.689753770828247,-7.108798980712891,0.0,0.0,0.0,0.0,11.304228782653809,27.0,145.08541870117188
2024-12-24T00:00:00Z,38.855584,-77.036975,America/New_York,3.0,3.674999952316284,-0.1249999701976776,1.248969554901123,-4.780267715454102,0.0,0.0,0.0,0.0,12.72792148590088,48.23999786376953,198.9967498779297
2024-12-25T00:00:00Z,38.855584,-77.036975,America/New_York,3.0,6.674999713897705,3.825000047683716,2.7889280319213867,0.14286470413208,0.0,0.0,0.0,0.0,10.85386562347412,31.319997787475582,346.9784851074219
2024-12-19T00:00:00Z,40.724039,-73.994982,America/New_York,55.0,8.657000541687012,2.2069997787475586,6.619444847106934,-1.5902364253997805,4.699999809265137,4.699999809265137,0.0,0.0,23.565568923950195,50.7599983215332,313.028076171875
2024-12-20T00:00:00Z,40.724039,-73.994982,America/New_York,71.0,4.056999683380127,-0.7929999828338623,-0.2803986072540283,-4.2444353103637695,3.3000001907348637,3.200000047683716,0.0,0.0700000002980232,17.33989715576172,31.319997787475582,9.894817352294922
2024-12-21T00:00:00Z,40.724039,-73.994982,America/New_York,73.0,2.2069997787475586,-3.7930002212524414,-1.8161237239837649,-10.28304386138916,2.8000001907348637,1.2000000476837158,0.0,1.1200000047683716,30.02710723876953,47.519996643066406,332.465576171875


In [0]:
# Print each column of spark_df with its Spark type and nullability.
spark_df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- timezone: string (nullable = true)
 |-- weather_code: double (nullable = true)
 |-- temperature_2m_max: double (nullable = true)
 |-- temperature_2m_min: double (nullable = true)
 |-- apparent_temperature_max: double (nullable = true)
 |-- apparent_temperature_min: double (nullable = true)
 |-- precipitation_sum: double (nullable = true)
 |-- rain_sum: double (nullable = true)
 |-- showers_sum: double (nullable = true)
 |-- snowfall_sum: double (nullable = true)
 |-- wind_speed_10m_max: double (nullable = true)
 |-- wind_gusts_10m_max: double (nullable = true)
 |-- wind_direction_10m_dominant: double (nullable = true)



In [0]:
# Total number of rows in spark_df (one row per location per forecast date).
spark_df.count()

175

In [0]:
# List the entries stored under the Silver directory of the mounted storage.
dbutils.fs.ls("/mnt/de-upskilling-weather/Silver")

[FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/7day_daily_forecast.parquet/', name='7day_daily_forecast.parquet/', size=0, modificationTime=1733329061000),
 FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/7day_hourly_forecast.parquet/', name='7day_hourly_forecast.parquet/', size=0, modificationTime=1733329061000),
 FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/Forecast_Data/', name='Forecast_Data/', size=0, modificationTime=1733329061000),
 FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/Forecasts/', name='Forecasts/', size=0, modificationTime=1733329061000),
 FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/Updates/', name='Updates/', size=0, modificationTime=1733329062000),
 FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/cities_dim.parquet/', name='cities_dim.parquet/', size=0, modificationTime=1733329063000),
 FileInfo(path='dbfs:/mnt/de-upskilling-weather/Silver/daily_historical.parquet/', name='daily_historical.parquet/', size=0, modificatio

In [0]:
# Load the city dimension (city, country, coordinates, UTC offset) from Parquet.
cities_dim_path = "/mnt/de-upskilling-weather/Silver/cities_dim.parquet"
df_cities = spark.read.parquet(cities_dim_path)

# Preview the first ten rows.
df_cities.show(n=10)

+------------+--------------+---------+-----------+---------+
|        city|       country| latitude|  longitude|time_zone|
+------------+--------------+---------+-----------+---------+
|      Vienna|       Austria|48.210033|  16.363449|    +1:00|
|   Ljubljana|      Slovenia|46.056946|  14.505751|    +1:00|
|      Denver| United States|39.676938|-104.977053|    -6:00|
|       Paris|        France|48.864716|   2.349014|    +1:00|
|      Zurich|   Switzerland|47.373878|   8.545094|    +1:00|
|      London|United Kingdom|51.509865|  -0.118092|    +0:00|
|      Berlin|       Germany|52.520008|  13.404954|    +1:00|
|Philadelphia| United States|39.952583| -75.165222|    -4:00|
|Indianapolis| United States|   39.791| -86.148003|    -4:00|
|   Nashville| United States|36.174465|  -86.76796|    -5:00|
+------------+--------------+---------+-----------+---------+
only showing top 10 rows



In [0]:
# Attach the city attributes to every forecast row by matching coordinates.
# The match is an exact equality on floating-point latitude/longitude, and the
# inner join drops any forecast row whose coordinates have no match in df_cities.
join_keys = ['latitude', 'longitude']
city_attribute_columns = [name for name in df_cities.columns if name not in join_keys]

# This cell reassigns spark_df, so re-running it would join the city columns a
# second time and leave duplicate column names that break the Parquet write.
# Dropping them first keeps the cell idempotent; on the first run the drop is a
# no-op because drop() ignores column names that are not present.
spark_df = (
    spark_df
    .drop(*city_attribute_columns)
    .join(df_cities, on=join_keys, how='inner')
)

spark_df.display(10)

latitude,longitude,date,timezone,weather_code,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,showers_sum,snowfall_sum,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,city,country,time_zone
38.855584,-77.036975,2024-12-19T00:00:00Z,America/New_York,53.0,11.737000465393066,4.586999893188477,10.008292198181152,1.4046697616577148,1.2000000476837158,1.2000000476837158,0.0,0.0,17.057313919067383,49.68000030517578,324.9458923339844,"Washington, DC",United States,-4:00
38.855584,-77.036975,2024-12-20T00:00:00Z,America/New_York,53.0,4.787000179290772,-2.263000011444092,1.4157962799072266,-5.935993194580078,2.299999952316284,2.299999952316284,0.0,0.0,11.019763946533203,28.07999992370605,4.476046085357666,"Washington, DC",United States,-4:00
38.855584,-77.036975,2024-12-21T00:00:00Z,America/New_York,3.0,3.275000095367432,-0.5130000114440918,0.3316249847412109,-6.474255084991455,0.0,0.0,0.0,0.0,28.116899490356445,53.27999877929688,316.7257080078125,"Washington, DC",United States,-4:00
38.855584,-77.036975,2024-12-22T00:00:00Z,America/New_York,1.0,0.625,-3.725000143051148,-5.082649230957031,-9.19597339630127,0.0,0.0,0.0,0.0,16.17998504638672,39.59999847412109,339.9588623046875,"Washington, DC",United States,-4:00
38.855584,-77.036975,2024-12-23T00:00:00Z,America/New_York,0.0,3.325000047683716,-2.674999952316284,-1.689753770828247,-7.108798980712891,0.0,0.0,0.0,0.0,11.304228782653809,27.0,145.08541870117188,"Washington, DC",United States,-4:00
38.855584,-77.036975,2024-12-24T00:00:00Z,America/New_York,3.0,3.674999952316284,-0.1249999701976776,1.248969554901123,-4.780267715454102,0.0,0.0,0.0,0.0,12.72792148590088,48.23999786376953,198.9967498779297,"Washington, DC",United States,-4:00
38.855584,-77.036975,2024-12-25T00:00:00Z,America/New_York,3.0,6.674999713897705,3.825000047683716,2.7889280319213867,0.14286470413208,0.0,0.0,0.0,0.0,10.85386562347412,31.319997787475582,346.9784851074219,"Washington, DC",United States,-4:00
40.724039,-73.994982,2024-12-19T00:00:00Z,America/New_York,55.0,8.657000541687012,2.2069997787475586,6.619444847106934,-1.5902364253997805,4.699999809265137,4.699999809265137,0.0,0.0,23.565568923950195,50.7599983215332,313.028076171875,New York,United States,-4:00
40.724039,-73.994982,2024-12-20T00:00:00Z,America/New_York,71.0,4.056999683380127,-0.7929999828338623,-0.2803986072540283,-4.2444353103637695,3.3000001907348637,3.200000047683716,0.0,0.0700000002980232,17.33989715576172,31.319997787475582,9.894817352294922,New York,United States,-4:00
40.724039,-73.994982,2024-12-21T00:00:00Z,America/New_York,73.0,2.2069997787475586,-3.7930002212524414,-1.8161237239837649,-10.28304386138916,2.8000001907348637,1.2000000476837158,0.0,1.1200000047683716,30.02710723876953,47.519996643066406,332.465576171875,New York,United States,-4:00


In [0]:
# Persist the enriched forecast to the Gold layer, replacing any previous output.
gold_forecast_path = "/mnt/de-upskilling-weather/Gold/7day_daily_forecast.parquet"
spark_df.write.mode("overwrite").parquet(gold_forecast_path)