In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas


Collecting openmeteo-requests
  Obtaining dependency information for openmeteo-requests from https://files.pythonhosted.org/packages/13/31/96209383687cf35055eb628e3a9207a07ac2d5faf6e70076f459435a989e/openmeteo_requests-1.3.0-py3-none-any.whl.metadata
  Downloading openmeteo_requests-1.3.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Obtaining dependency information for openmeteo-sdk>=1.4.0 from https://files.pythonhosted.org/packages/18/9a/f33c4eb783d505d0099c039bbac30da09266027d9e3e0b5de76ef796749d/openmeteo_sdk-1.18.0-py3-none-any.whl.metadata
  Downloading openmeteo_sdk-1.18.0-py3-none-any.whl.metadata (934 bytes)
Collecting flatbuffers>=24.0.0 (from openmeteo-sdk>=1.4.0->openmeteo-requests)
  Obtaining dependency information for flatbuffers>=24.0.0 from https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl.metadata
  Downloading flatbuffers

In [0]:
import openmeteo_requests
from datetime import datetime, timedelta
import requests_cache
import pandas as pd
from retry_requests import retry
from pyspark.sql.functions import col, max as spark_max
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F

In [0]:
# Define S3 bucket details and file path
bucket_name = "de-upskilling-weather"
folder = "LandingZone"
file_name = "hourly_historical_forecast.parquet"
s3_path = f"/mnt/{bucket_name}/{folder}/{file_name}"

# Default start date
start_date = "2022-01-01"

end_date = "2024-09-01"#(datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
    
print("Ingestion will start from:", start_date)

print("End date is set to:", end_date)

Ingestion will start from: 2022-01-01
End date is set to: 2024-09-01


In [0]:
# Initialize cache and retry session
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Define the date ranges
start_date = datetime.strptime(start_date, "%Y-%m-%d")
end_date_ = datetime.strptime(end_date, "%Y-%m-%d")

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"

# Define location and timezone data
latitudes = [38.855584, 40.724039, 25.695215, 34.005712, 32.724548, 47.594472, 39.676938, 39.952583, 39.791, 35.481918, 36.174465, 41.881832, 51.509865, 52.520008, 48.137154, 48.864716, 47.373878, 51.260197, 52.377956, 53.35014, 40.416775, 48.210033, 46.056946, 50.073658, 55.676098]
longitudes = [-77.036975, -73.994982, -80.168933, -118.175596, -96.76923, -122.348286, -104.977053, -75.165222, -86.148003, -97.508469, -86.76796, -87.623177, -0.118092, 13.404954, 11.576124, 2.349014, 8.545094, 4.402771, 4.89707, -6.266155, -3.70379, 16.363449, 14.505751, 14.41854, 12.568337]
timezones = ["America/New_York", "America/New_York", "America/New_York", "America/Los_Angeles", "America/Chicago", "America/Los_Angeles", "America/Denver", "America/New_York", "America/Indiana/Indianapolis", "America/Chicago", "America/Chicago", "America/Chicago", "Europe/London", "Europe/Berlin", "Europe/Berlin", "Europe/Paris", "Europe/Zurich", "Europe/Brussels", "Europe/Amsterdam", "Europe/Dublin", "Europe/Madrid", "Europe/Vienna", "Europe/Ljubljana", "Europe/Prague", "Europe/Copenhagen"]

# Parameters for the API
params_template = {
    "latitude": latitudes,
    "longitude": longitudes,
    "hourly": ["temperature_2m", "relative_humidity_2m", "precipitation", "rain", "showers", "snowfall", "snow_depth", "weather_code", "wind_speed_10m", "wind_speed_80m", "wind_speed_120m", "wind_speed_180m", "wind_direction_10m", "wind_direction_80m", "wind_direction_120m", "wind_direction_180m", "wind_gusts_10m"],
    "timezone": "GMT" #timezones
}

# Initialize responses list
responses = []

# Iterate over year-long periods
current_start_date = start_date
while current_start_date < end_date_:
    # Set the end date for the current year period or cap it at the final end date
    current_end_date = min(current_start_date.replace(year=current_start_date.year + 1) - timedelta(days=1), end_date_)
    
    # Update parameters with the current date range
    params = params_template.copy()
    params["start_date"] = current_start_date.strftime("%Y-%m-%d")
    params["end_date"] = current_end_date.strftime("%Y-%m-%d")
    
    # Make the API call and add the response to the list
    response = openmeteo.weather_api(url, params=params)
    responses.extend(response)
    
    # Move to the next period (start date of next year)
    current_start_date = current_end_date + timedelta(days=1)



TypeError: strptime() argument 1 must be str, not datetime.datetime


In [0]:
len(responses)

75


In [0]:
for i, response in enumerate(responses):
    # Process first location. Add a for-loop for multiple locations or weather models
    #response = responses[0]
    #print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    #print(f"Elevation {response.Elevation()} m asl")
    #print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
    #print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    # Process hourly data. The order of variables needs to be the same as requested.
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
    hourly_precipitation = hourly.Variables(2).ValuesAsNumpy()
    hourly_rain = hourly.Variables(3).ValuesAsNumpy()
    hourly_showers = hourly.Variables(4).ValuesAsNumpy()
    hourly_snowfall = hourly.Variables(5).ValuesAsNumpy()
    hourly_snow_depth = hourly.Variables(6).ValuesAsNumpy()
    hourly_weather_code = hourly.Variables(7).ValuesAsNumpy()
    hourly_wind_speed_10m = hourly.Variables(8).ValuesAsNumpy()
    hourly_wind_speed_80m = hourly.Variables(9).ValuesAsNumpy()
    hourly_wind_speed_120m = hourly.Variables(10).ValuesAsNumpy()
    hourly_wind_speed_180m = hourly.Variables(11).ValuesAsNumpy()
    hourly_wind_direction_10m = hourly.Variables(12).ValuesAsNumpy()
    hourly_wind_direction_80m = hourly.Variables(13).ValuesAsNumpy()
    hourly_wind_direction_120m = hourly.Variables(14).ValuesAsNumpy()
    hourly_wind_direction_180m = hourly.Variables(15).ValuesAsNumpy()
    hourly_wind_gusts_10m = hourly.Variables(16).ValuesAsNumpy()

    hourly_data = {"date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}
    
    hourly_data["latitude"] = latitudes[i%len(latitudes)]
    hourly_data["longitude"] = longitudes[i%len(latitudes)]
    hourly_data["timezone"] = timezones[i%len(latitudes)]
    
    hourly_data["temperature_2m"] = hourly_temperature_2m
    hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
    hourly_data["precipitation"] = hourly_precipitation
    hourly_data["rain"] = hourly_rain
    hourly_data["showers"] = hourly_showers
    hourly_data["snowfall"] = hourly_snowfall
    hourly_data["snow_depth"] = hourly_snow_depth
    hourly_data["weather_code"] = hourly_weather_code
    hourly_data["wind_speed_10m"] = hourly_wind_speed_10m
    hourly_data["wind_speed_80m"] = hourly_wind_speed_80m
    hourly_data["wind_speed_120m"] = hourly_wind_speed_120m
    hourly_data["wind_speed_180m"] = hourly_wind_speed_180m
    hourly_data["wind_direction_10m"] = hourly_wind_direction_10m
    hourly_data["wind_direction_80m"] = hourly_wind_direction_80m
    hourly_data["wind_direction_120m"] = hourly_wind_direction_120m
    hourly_data["wind_direction_180m"] = hourly_wind_direction_180m
    hourly_data["wind_gusts_10m"] = hourly_wind_gusts_10m

    hourly_dataframe = pd.DataFrame(data = hourly_data)
    
    if i == 0:
        # Convert `daily_dataframe` to a Spark DataFrame
        spark_df = spark.createDataFrame(hourly_dataframe)
    else:
        # Convert `daily_dataframe` to a Spark DataFrame
        hourly_spark_df = spark.createDataFrame(hourly_dataframe)

        # Concatenate the two Spark DataFrames
        spark_df = spark_df.unionByName(hourly_spark_df)
    
    #print(hourly_dataframe)



In [0]:
#spark_df.show()

+-------------------+---------+----------+----------------+------------------+--------------------+-----------------+-----------------+-------+--------+----------+------------+------------------+------------------+------------------+---------------+------------------+------------------+-------------------+-------------------+------------------+
|               date| latitude| longitude|        timezone|    temperature_2m|relative_humidity_2m|    precipitation|             rain|showers|snowfall|snow_depth|weather_code|    wind_speed_10m|    wind_speed_80m|   wind_speed_120m|wind_speed_180m|wind_direction_10m|wind_direction_80m|wind_direction_120m|wind_direction_180m|    wind_gusts_10m|
+-------------------+---------+----------+----------------+------------------+--------------------+-----------------+-----------------+-------+--------+----------+------------+------------------+------------------+------------------+---------------+------------------+------------------+-------------------

In [0]:
spark_df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- timezone: string (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- relative_humidity_2m: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- showers: double (nullable = true)
 |-- snowfall: double (nullable = true)
 |-- snow_depth: double (nullable = true)
 |-- weather_code: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)
 |-- wind_speed_80m: double (nullable = true)
 |-- wind_speed_120m: double (nullable = true)
 |-- wind_speed_180m: double (nullable = true)
 |-- wind_direction_10m: double (nullable = true)
 |-- wind_direction_80m: double (nullable = true)
 |-- wind_direction_120m: double (nullable = true)
 |-- wind_direction_180m: double (nullable = true)
 |-- wind_gusts_10m: double (nullable = true)


In [0]:
spark_df.count() #615600

585000


In [0]:
spark_df.write.mode("overwrite").parquet(s3_path)




In [0]:
from pyspark.sql.types import StructType, StructField, StringType
from datetime import datetime

# Get current date and time
now = datetime.now()

# Format date and time as a string
formatted_date_time = now.strftime("%Y-%m-%d %H:%M:%S")

# Sample data
data = [
    (file_name, end_date, formatted_date_time)
]

# Define schema
schema = StructType([
    StructField("file_name", StringType(), True),
    StructField("latest_data", StringType(), True),
    StructField("date_updated", StringType(), True)
])

# Create DataFrame
df_log = spark.createDataFrame(data, schema)

# Show DataFrame
df_log.show()

+--------------------+-----------+-------------------+
|           file_name|latest_data|       date_updated|
+--------------------+-----------+-------------------+
|hourly_historical...| 2024-09-01|2024-12-10 16:09:03|
+--------------------+-----------+-------------------+



In [0]:
#615600
df_log.write.mode("append").parquet(f"/mnt/{bucket_name}/Gold/log.parquet")