In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas


Collecting openmeteo-requests
  Obtaining dependency information for openmeteo-requests from https://files.pythonhosted.org/packages/13/31/96209383687cf35055eb628e3a9207a07ac2d5faf6e70076f459435a989e/openmeteo_requests-1.3.0-py3-none-any.whl.metadata
  Downloading openmeteo_requests-1.3.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Obtaining dependency information for openmeteo-sdk>=1.4.0 from https://files.pythonhosted.org/packages/18/9a/f33c4eb783d505d0099c039bbac30da09266027d9e3e0b5de76ef796749d/openmeteo_sdk-1.18.0-py3-none-any.whl.metadata
  Downloading openmeteo_sdk-1.18.0-py3-none-any.whl.metadata (934 bytes)
Collecting flatbuffers>=24.0.0 (from openmeteo-sdk>=1.4.0->openmeteo-requests)
  Obtaining dependency information for flatbuffers>=24.0.0 from https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl.metadata
  Downloading flatbuffers

In [0]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

from datetime import datetime, timedelta
from pyspark.sql.functions import col, max as spark_max
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F

In [0]:
# Storage locations. `file_name` is the incremental update file written by
# this notebook; `org_file_name` is the name under which this feed is tracked
# in the ingestion log.
bucket_name = "de-upskilling-weather"
folder = "LandingZone"
file_name = "daily_historical_forecast_updates.parquet"
org_file_name = "daily_historical_forecast.parquet"
s3_path = f"/mnt/{bucket_name}/{folder}/Updates/{file_name}"

# Start date used only when the log holds no entry for `org_file_name`
# (this is the value that used to be hard-coded as `start_date`).
default_start_date = "2024-09-01"

# Load the ingestion log and parse 'latest_data' into a timestamp so that the
# maximum is chronological rather than lexicographic.
df_update = spark.read.parquet(f"/mnt/{bucket_name}/Gold/log.parquet")
df_update = df_update.withColumn("latest_data", F.to_timestamp("latest_data"))

# Most recent 'latest_data' logged for this file. An aggregate over zero rows
# still returns exactly one row (holding NULL), so indexing [0] cannot fail
# when the log has no matching entry.
latest_logged = (
    df_update
    .filter(F.col("file_name") == org_file_name)
    .agg(F.max("latest_data").alias("latest_data"))
    .collect()[0]["latest_data"]
)
last_updated = latest_logged.date() if latest_logged is not None else None

# Resume one day after the last ingested date; fall back to the default when
# nothing has been logged yet.
if last_updated is None:
    start_date = default_start_date
else:
    start_date = (last_updated + timedelta(days=1)).strftime("%Y-%m-%d")

# The window ends two days before today.
end_date = (datetime.today() - timedelta(days=2)).strftime("%Y-%m-%d")

print("Ingestion will start from:", start_date)

print("End date is set to:", end_date)

# ISO "YYYY-MM-DD" strings order the same way as the dates they represent.
if start_date > end_date:
    print("No new data to ingest: start date is after end date.")

Ingestion will start from: 2024-12-16
End date is set to: 2024-12-15


In [0]:
# Open-Meteo client: HTTP responses are cached for 3600 seconds and failed
# requests are retried up to 5 times.
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

url = "https://historical-forecast-api.open-meteo.com/v1/forecast"

# One (latitude, longitude, timezone) entry per requested location. Keeping
# the three values together prevents the derived lists from drifting apart.
locations = [
    (38.855584, -77.036975, "America/New_York"),
    (40.724039, -73.994982, "America/New_York"),
    (25.695215, -80.168933, "America/New_York"),
    (34.005712, -118.175596, "America/Los_Angeles"),
    (32.724548, -96.76923, "America/Chicago"),
    (47.594472, -122.348286, "America/Los_Angeles"),
    (39.676938, -104.977053, "America/Denver"),
    (39.952583, -75.165222, "America/New_York"),
    (39.791, -86.148003, "America/Indiana/Indianapolis"),
    (35.481918, -97.508469, "America/Chicago"),
    (36.174465, -86.76796, "America/Chicago"),
    (41.881832, -87.623177, "America/Chicago"),
    (51.509865, -0.118092, "Europe/London"),
    (52.520008, 13.404954, "Europe/Berlin"),
    (48.137154, 11.576124, "Europe/Berlin"),
    (48.864716, 2.349014, "Europe/Paris"),
    (47.373878, 8.545094, "Europe/Zurich"),
    (51.260197, 4.402771, "Europe/Brussels"),
    (52.377956, 4.89707, "Europe/Amsterdam"),
    (53.35014, -6.266155, "Europe/Dublin"),
    (40.416775, -3.70379, "Europe/Madrid"),
    (48.210033, 16.363449, "Europe/Vienna"),
    (46.056946, 14.505751, "Europe/Ljubljana"),
    (50.073658, 14.41854, "Europe/Prague"),
    (55.676098, 12.568337, "Europe/Copenhagen"),
]
latitudes = [latitude for latitude, _, _ in locations]
longitudes = [longitude for _, longitude, _ in locations]
timezones = [timezone for _, _, timezone in locations]

# Daily variables to request. Their order matters: the response exposes them
# by position, so downstream code must read them in this same order.
daily_variables = [
    "weather_code",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "precipitation_sum",
    "rain_sum",
    "showers_sum",
    "snowfall_sum",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
]

params = {
    "latitude": latitudes,
    "longitude": longitudes,
    "start_date": start_date,
    "end_date": end_date,
    "daily": daily_variables,
    # A single "GMT" timezone is sent for every location; the per-location
    # `timezones` list is not part of the request.
    "timezone": "GMT",
}
responses = openmeteo.weather_api(url, params=params)

In [0]:
def _daily_response_to_frame(response, variable_names, latitude, longitude, timezone):
    """Convert one Open-Meteo response into a pandas DataFrame of daily values.

    Parameters
    ----------
    response : object exposing ``Daily()``, as returned by ``weather_api``.
    variable_names : sequence of str
        Daily variable names in the exact order they were requested; the
        response exposes variables by position only.
    latitude, longitude : float
        Requested coordinates, repeated on every row.
    timezone : str
        Timezone label for the location, repeated on every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``latitude``, ``longitude``, ``timezone`` followed
        by one column per entry of ``variable_names``.
    """
    daily = response.Daily()
    frame_data = {
        # Rebuild the time axis from the start/end epoch seconds and the step;
        # the end bound is exclusive.
        "date": pd.date_range(
            start=pd.to_datetime(daily.Time(), unit="s", utc=True),
            end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=daily.Interval()),
            inclusive="left",
        ),
        "latitude": latitude,
        "longitude": longitude,
        "timezone": timezone,
    }
    for position, variable_name in enumerate(variable_names):
        frame_data[variable_name] = daily.Variables(position).ValuesAsNumpy()
    return pd.DataFrame(data=frame_data)


# Build one pandas frame per location. The variable names come from the
# request itself so that positions and column names cannot get out of step.
daily_frames = [
    _daily_response_to_frame(
        response, params["daily"], latitudes[i], longitudes[i], timezones[i]
    )
    for i, response in enumerate(responses)
]

if not daily_frames:
    raise ValueError("Open-Meteo returned no responses; nothing to convert.")

# Concatenate once in pandas and convert to Spark once, instead of creating a
# Spark DataFrame per location and chaining a union for each of them.
daily_dataframe = pd.concat(daily_frames, ignore_index=True)
spark_df = spark.createDataFrame(daily_dataframe)

In [0]:
# Print the column names and types of the combined Spark DataFrame.
spark_df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- timezone: string (nullable = true)
 |-- weather_code: double (nullable = true)
 |-- temperature_2m_max: double (nullable = true)
 |-- temperature_2m_min: double (nullable = true)
 |-- apparent_temperature_max: double (nullable = true)
 |-- apparent_temperature_min: double (nullable = true)
 |-- precipitation_sum: double (nullable = true)
 |-- rain_sum: double (nullable = true)
 |-- showers_sum: double (nullable = true)
 |-- snowfall_sum: double (nullable = true)
 |-- wind_speed_10m_max: double (nullable = true)
 |-- wind_gusts_10m_max: double (nullable = true)
 |-- wind_direction_10m_dominant: double (nullable = true)


In [0]:
# Preview the first rows of the combined Spark DataFrame.
spark_df.show()

+-------------------+---------+-----------+--------------------+------------+------------------+-------------------+------------------------+------------------------+-------------------+------------------+-------------------+------------+------------------+------------------+---------------------------+
|               date| latitude|  longitude|            timezone|weather_code|temperature_2m_max| temperature_2m_min|apparent_temperature_max|apparent_temperature_min|  precipitation_sum|          rain_sum|        showers_sum|snowfall_sum|wind_speed_10m_max|wind_gusts_10m_max|wind_direction_10m_dominant|
+-------------------+---------+-----------+--------------------+------------+------------------+-------------------+------------------------+------------------------+-------------------+------------------+-------------------+------------+------------------+------------------+---------------------------+
|2024-11-18 05:00:00|38.855584| -77.036975|    America/New_York|         3.0|22.13699

In [0]:
# Count the rows in the combined Spark DataFrame.
# NOTE(review): the 25125 below looks like a row count noted from another run — confirm or remove.
spark_df.count() #25125

2500

In [0]:
# Write the DataFrame to an S3 bucket in Parquet format
# Write the combined DataFrame as Parquet to `s3_path`.
# "overwrite" mode replaces whatever is already stored at that path.
spark_df.write.mode("overwrite").parquet(s3_path)




In [0]:
from pyspark.sql.types import StructType, StructField, StringType
from datetime import datetime

# Timestamp of this run, rendered as "YYYY-MM-DD HH:MM:SS".
now = datetime.now()
formatted_date_time = f"{now:%Y-%m-%d %H:%M:%S}"

# Single log row: tracked file name, last date covered by this run, and the
# time the run happened.
data = [(org_file_name, end_date, formatted_date_time)]

# All three log columns are stored as nullable strings.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("file_name", "latest_data", "date_updated")
])

# Build the one-row log DataFrame and display it for a visual check.
df_log = spark.createDataFrame(data, schema)
df_log.show()

+--------------------+-----------+-------------------+
|           file_name|latest_data|       date_updated|
+--------------------+-----------+-------------------+
|daily_historical_...| 2024-12-08|2024-12-10 16:13:48|
+--------------------+-----------+-------------------+



In [0]:
# Append this run's row to the ingestion log; "append" mode keeps the existing log entries.
df_log.write.mode("append").parquet(f"/mnt/{bucket_name}/Gold/log.parquet")