In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas
# The two installs above provide the Open-Meteo client plus the HTTP cache/retry
# helpers, numpy and pandas used by the cells below.
# NOTE(review): no versions are pinned, so each run may resolve different releases.


Collecting openmeteo-requests
  Obtaining dependency information for openmeteo-requests from https://files.pythonhosted.org/packages/13/31/96209383687cf35055eb628e3a9207a07ac2d5faf6e70076f459435a989e/openmeteo_requests-1.3.0-py3-none-any.whl.metadata
  Downloading openmeteo_requests-1.3.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Obtaining dependency information for openmeteo-sdk>=1.4.0 from https://files.pythonhosted.org/packages/18/9a/f33c4eb783d505d0099c039bbac30da09266027d9e3e0b5de76ef796749d/openmeteo_sdk-1.18.0-py3-none-any.whl.metadata
  Downloading openmeteo_sdk-1.18.0-py3-none-any.whl.metadata (934 bytes)
Collecting flatbuffers>=24.0.0 (from openmeteo-sdk>=1.4.0->openmeteo-requests)
  Obtaining dependency information for flatbuffers>=24.0.0 from https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl.metadata
  Downloading flatbuffers

In [0]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime, timedelta
from pyspark.sql.functions import col, max as spark_max


In [0]:
# Location of the historical parquet dataset on the mounted ADLS container
container_name = "de-upskilling-weather"
folder = "LandingZone"
file_name = "daily_historical.parquet"
adls_path = f"/mnt/{container_name}/{folder}/{file_name}"

# Fallback start date (DD-MM-YYYY), used when no previous ingestion can be read
default_start_date = "01-01-2010"

# Both dates are handed to the API as YYYY-MM-DD strings
iso_format = "%Y-%m-%d"

# Most recent date already stored; stays None when nothing usable is found
last_date = None

try:
    # Try to load the existing data from ADLS
    df_existing = spark.read.parquet(adls_path)

    # Get the maximum date in the DataFrame (None when the dataset has no rows)
    last_date_row = df_existing.select(spark_max(col("date")).alias("last_date")).collect()[0]
    last_date = last_date_row["last_date"]

    if last_date is None:
        print("Existing data contains no dates, starting from default date.")

except Exception as e:
    # Best-effort fallback: any read failure restarts from the default date.
    # The cause is printed so that a failure other than a missing file
    # (which would silently trigger a full re-ingestion) is visible.
    print("Data file not found, starting from default date.")
    print(f"Reason: {type(e).__name__}: {(str(e).splitlines() or [''])[0]}")

if last_date is not None:
    # Resume one day after the last stored date
    start_date = (last_date + timedelta(days=1)).strftime(iso_format)
else:
    start_date = datetime.strptime(default_start_date, "%d-%m-%Y").strftime(iso_format)

# Ingest up to and including yesterday
end_date = (datetime.today() - timedelta(days=1)).strftime(iso_format)

# ISO-formatted dates order correctly as plain strings
if start_date > end_date:
    print("WARNING: start date is after end date; the stored data is already up to date.")

print("Ingestion will start from:", start_date)


print("End date is set to:", end_date)

Ingestion will start from: 2024-09-02
End date is set to: 2024-12-05


In [0]:
# Build the Open-Meteo client on top of a cached HTTP session whose entries
# never expire (expire_after = -1), wrapped so failed requests are retried
# up to 5 times with a backoff factor of 0.2.
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Historical (archive) endpoint
url = "https://archive-api.open-meteo.com/v1/archive"

# One (latitude, longitude) pair per requested location
coordinates = [
    (38.855584, -77.036975),
    (40.724039, -73.994982),
    (25.695215, -80.168933),
    (34.005712, -118.175596),
    (32.724548, -96.76923),
    (47.594472, -122.348286),
    (39.676938, -104.977053),
    (39.952583, -75.165222),
    (39.791, -86.148003),
    (35.481918, -97.508469),
    (36.174465, -86.76796),
    (41.881832, -87.623177),
    (51.509865, -0.118092),
    (52.520008, 13.404954),
    (48.137154, 11.576124),
    (48.864716, 2.349014),
    (47.373878, 8.545094),
    (51.260197, 4.402771),
    (52.377956, 4.89707),
    (53.35014, -6.266155),
    (40.416775, -3.70379),
    (48.210033, 16.363449),
    (46.056946, 14.505751),
    (50.073658, 14.41854),
    (55.676098, 12.568337),
]
latitudes = [lat for lat, _ in coordinates]
longitudes = [lon for _, lon in coordinates]

# Daily variables to request. Their order matters: the processing cell reads
# the returned variables by position.
daily_variables = [
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "apparent_temperature_mean",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
]

params = {
    "latitude": latitudes,
    "longitude": longitudes,
    "start_date": start_date,
    "end_date": end_date,
    "daily": daily_variables,
}

# A single call covering every location in the lists above
responses = openmeteo.weather_api(url, params=params)


[0;31m---------------------------------------------------------------------------[0m
[0;31mOpenMeteoRequestsError[0m                    Traceback (most recent call last)
File [0;32m<command-2672634603601895>, line 19[0m
[1;32m     11[0m longitudes [38;5;241m=[39m [[38;5;241m-[39m[38;5;241m77.036975[39m, [38;5;241m-[39m[38;5;241m73.994982[39m, [38;5;241m-[39m[38;5;241m80.168933[39m, [38;5;241m-[39m[38;5;241m118.175596[39m, [38;5;241m-[39m[38;5;241m96.76923[39m, [38;5;241m-[39m[38;5;241m122.348286[39m, [38;5;241m-[39m[38;5;241m104.977053[39m, [38;5;241m-[39m[38;5;241m75.165222[39m, [38;5;241m-[39m[38;5;241m86.148003[39m, [38;5;241m-[39m[38;5;241m97.508469[39m, [38;5;241m-[39m[38;5;241m86.76796[39m, [38;5;241m-[39m[38;5;241m87.623177[39m, [38;5;241m-[39m[38;5;241m0.118092[39m, [38;5;241m13.404954[39m, [38;5;241m11.576124[39m, [38;5;241m2.349014[39m, [38;5;241m8.545094[39m, [38;5;241m4.402771[39m, [38;5;241m4.89707[3

In [0]:
# Convert every location's API response into rows and build one Spark DataFrame.


def _daily_frame(response, latitude, longitude, variable_names):
    """Build a pandas DataFrame of daily values for a single location.

    Parameters:
        response: one element of the Open-Meteo result list; must expose Daily().
        latitude: requested latitude, stored on every row.
        longitude: requested longitude, stored on every row.
        variable_names: daily variable names in the order they were requested;
            the variable at position k is read with Variables(k) and stored
            under the k-th name.

    Returns:
        DataFrame with columns date, latitute, longitude, then one column per
        entry of variable_names.
    """
    daily = response.Daily()

    frame_data = {
        # One timestamp per interval, from Time() up to but excluding TimeEnd()
        "date": pd.date_range(
            start=pd.to_datetime(daily.Time(), unit="s", utc=True),
            end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=daily.Interval()),
            inclusive="left",
        ),
        # "latitute" is misspelled on purpose: it is the column name this
        # notebook has always written, and new rows are appended to that data.
        "latitute": latitude,
        "longitude": longitude,
    }

    # Reading by position from the request's own list keeps names and values aligned
    for position, name in enumerate(variable_names):
        frame_data[name] = daily.Variables(position).ValuesAsNumpy()

    return pd.DataFrame(data=frame_data)


# NOTE(review): assumes responses come back in the same order as latitudes/longitudes
location_frames = [
    _daily_frame(response, latitudes[i], longitudes[i], params["daily"])
    for i, response in enumerate(responses)
]

# Fail loudly rather than leaving spark_df undefined (or stale from an earlier run)
if not location_frames:
    raise ValueError("No responses returned by the API; nothing to ingest.")

# Combine in pandas and convert once, instead of one Spark union per location
daily_dataframe = pd.concat(location_frames, ignore_index=True)
spark_df = spark.createDataFrame(daily_dataframe)

Coordinates 38.84006881713867°N -77.09014892578125°E
Elevation 3.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 40.738136291503906°N -74.04254150390625°E
Elevation 28.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 25.694198608398438°N -80.3046875°E
Elevation 4.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 33.98945236206055°N -118.20223999023438°E
Elevation 50.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 32.72407531738281°N -96.81317138671875°E
Elevation 125.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 47.62741470336914°N -122.32290649414062°E
Elevation 0.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 39.68365478515625°N -105.0°E
Elevation 1628.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 39.964847564697266°N -75.1676025390625°E
Elevation 32.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordin

In [0]:
# Preview the first 20 rows of spark_df, with long cell values truncated
spark_df.show(n=20, truncate=True)


+-------------------+---------+----------+------------------+------------------+-------------------+------------------------+------------------------+-------------------------+------------------+------------------+------------+------------------+------------------+---------------------------+
|               date| latitute| longitude|temperature_2m_max|temperature_2m_min|temperature_2m_mean|apparent_temperature_max|apparent_temperature_min|apparent_temperature_mean| precipitation_sum|          rain_sum|snowfall_sum|wind_speed_10m_max|wind_gusts_10m_max|wind_direction_10m_dominant|
+-------------------+---------+----------+------------------+------------------+-------------------+------------------------+------------------------+-------------------------+------------------+------------------+------------+------------------+------------------+---------------------------+
|2024-10-23 00:00:00|38.855584|-77.036975|24.342498779296875| 9.242500305175781| 15.765416145324707|      23.843990325

In [0]:
# Print the schema of spark_df (column names, types and nullability)
spark_df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- latitute: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- temperature_2m_max: double (nullable = true)
 |-- temperature_2m_min: double (nullable = true)
 |-- temperature_2m_mean: double (nullable = true)
 |-- apparent_temperature_max: double (nullable = true)
 |-- apparent_temperature_min: double (nullable = true)
 |-- apparent_temperature_mean: double (nullable = true)
 |-- precipitation_sum: double (nullable = true)
 |-- rain_sum: double (nullable = true)
 |-- snowfall_sum: double (nullable = true)
 |-- wind_speed_10m_max: double (nullable = true)
 |-- wind_gusts_10m_max: double (nullable = true)
 |-- wind_direction_10m_dominant: double (nullable = true)


In [0]:
# Total number of rows in spark_df (all locations combined), shown as the cell result
spark_df.count()

350


In [0]:
# Append the fetched rows to the parquet dataset at adls_path (the mounted ADLS container)
# NOTE(review): mode("append") is not idempotent; re-running this cell with the same
# spark_df writes the same rows a second time.
spark_df.write.mode("append").parquet(adls_path)





In [0]:
# Expected row count for a full history load: 5409 daily rows per location (2010-01-01 to 2024-10-22 inclusive)