# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [0]:
# Session bootstrap cell: Glue session magics first, then the Glue/Spark handles.
# The %pip lines are left commented out; the extra packages are requested through
# %additional_python_modules further down instead.
#%pip install openmeteo-requests
#%pip install requests-cache retry-requests numpy pandas



# Session limits and sizing (the kernel log reports both timeouts in minutes).
%timeout 120
%idle_timeout 15
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

# Extra Python packages to include in the session; they are imported in a later cell.
%additional_python_modules openmeteo_requests, requests_cache, retry_requests

import sys
# NOTE(review): wildcard import — no name it provides is referenced in the visible cells.
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Reuse the running SparkContext if one exists, then build the Glue wrappers on top of it.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
# `spark` is the SparkSession the later cells use for every read and write.
spark = glueContext.spark_session
job = Job(glueContext)

# Left commented out: the magic that stops the current interactive session.
#%stop_session

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current timeout is None minutes.
timeout has been set to 120 minutes.
Current idle_timeout is None minutes.
idle_timeout has been set to 15 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Additional python modules to be included:
openmeteo_requests
requests_cache
retry_requests
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 15
Timeout: 120
Session ID: 4345512e-643d-40bf-ab61-f6fae508d459
Applying the following default arguments:
--glue_kernel_version

In [0]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

from datetime import datetime, timedelta
from pyspark.sql.functions import col, max as spark_max
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F




In [0]:
# Work out the ingestion window for this run:
#   start_date = day after the last ingested date recorded for `file_name`
#                (or `default_start_date` when no date has been recorded yet)
#   end_date   = yesterday

# S3 bucket details and landing-zone file path
bucket_name = "de-upskill-weatherforecasting"
folder = "LandingZone"
file_name = "hourly_historical.parquet"
s3_path = f"s3://{bucket_name}/{folder}/{file_name}"

# Start date used when the tracking table holds no ingestion date for this file
default_start_date = "2010-01-01"

# Load the tracking table (one row per file with its last ingested date).
# The DataFrame is cached; the cells that update and rewrite the table reuse it.
df_update = spark.read.parquet(f"s3://{bucket_name}/Gold/last_updated.parquet")
df_update.cache()

# Get the day data was last received for this file
matching_rows = (
    df_update
    .filter(df_update.file_name == file_name)
    .select("last_updated")
    .collect()
)
if not matching_rows:
    # Without a row for this file the final update step has nothing to update,
    # so stop here with an explicit message instead of a bare IndexError.
    raise LookupError(
        f"No row for {file_name!r} in s3://{bucket_name}/Gold/last_updated.parquet"
    )
last_updated = matching_rows[0][0]

if last_updated and last_updated.strip():
    # Convert last_updated to a datetime object and resume one day after it
    last_updated = datetime.strptime(last_updated.strip(), "%Y-%m-%d")
    start_date = (last_updated + timedelta(days=1)).strftime("%Y-%m-%d")
else:
    # The row exists but holds no date (null or empty string): first ingestion
    start_date = default_start_date

end_date = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")

print("Ingestion will start from:", start_date)

print("End date is set to:", end_date)

# ISO-formatted dates compare correctly as strings. An inverted window means the
# data is already up to date, so there is nothing to request or append.
if start_date > end_date:
    raise ValueError(
        f"Nothing to ingest: start_date {start_date} is after end_date {end_date}"
    )

Ingestion will start from: 2010-01-01
End date is set to: 2024-11-10


In [0]:
# Historical-archive endpoint of the Open-Meteo API
url = "https://archive-api.open-meteo.com/v1/archive"

# Requested locations. The three lists are parallel: entry k of each list
# describes the same location.
latitudes = [
    38.855584, 40.724039, 25.695215, 34.005712, 32.724548,
    47.594472, 39.676938, 39.952583, 39.791, 35.481918,
    36.174465, 41.881832, 51.509865, 52.520008, 48.137154,
    48.864716, 47.373878, 51.260197, 52.377956, 53.35014,
    40.416775, 48.210033, 46.056946, 50.073658, 55.676098,
]
longitudes = [
    -77.036975, -73.994982, -80.168933, -118.175596, -96.76923,
    -122.348286, -104.977053, -75.165222, -86.148003, -97.508469,
    -86.76796, -87.623177, -0.118092, 13.404954, 11.576124,
    2.349014, 8.545094, 4.402771, 4.89707, -6.266155,
    -3.70379, 16.363449, 14.505751, 14.41854, 12.568337,
]
timezones = [
    "America/New_York", "America/New_York", "America/New_York", "America/Los_Angeles", "America/Chicago",
    "America/Los_Angeles", "America/Denver", "America/New_York", "America/Indiana/Indianapolis", "America/Chicago",
    "America/Chicago", "America/Chicago", "Europe/London", "Europe/Berlin", "Europe/Berlin",
    "Europe/Paris", "Europe/Zurich", "Europe/Brussels", "Europe/Amsterdam", "Europe/Dublin",
    "Europe/Madrid", "Europe/Vienna", "Europe/Ljubljana", "Europe/Prague", "Europe/Copenhagen",
]

# Request parameters. The order of the "hourly" variables matters: the cell that
# unpacks the responses reads them back by position.
params = {
    "latitude": latitudes,
    "longitude": longitudes,
    "start_date": start_date,
    "end_date": end_date,
    "hourly": [
        "temperature_2m",
        "relative_humidity_2m",
        "precipitation",
        "rain",
        "snowfall",
        "snow_depth",
        "weather_code",
        "wind_speed_10m",
        "wind_speed_100m",
        "wind_direction_10m",
        "wind_direction_100m",
        "wind_gusts_10m",
    ],
    "timezone": timezones,
}

# Open-Meteo client on top of a cached session (expire_after=-1, cache named
# '.cache') wrapped with retries (5 attempts, backoff factor 0.2).
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# One call for all locations; `responses` is iterated per location in the next cell.
responses = openmeteo.weather_api(url, params=params)




In [0]:
# Build `spark_df`: one Spark DataFrame holding the hourly series of every
# requested location (columns: date, latitude, longitude, timezone, then one
# column per requested hourly variable).

# The location columns are attached by position, so a response list of a
# different length would silently label series with the wrong coordinates.
if len(responses) != len(latitudes):
    raise ValueError(
        f"Expected {len(latitudes)} responses (one per location), got {len(responses)}"
    )

# Variables come back in the order they were requested; driving the extraction
# from the request itself keeps column names and positions in step.
requested_variables = params["hourly"]

# Reset explicitly so a frame left over from an earlier run of this cell can
# never be picked up (and appended again) by the cells below.
spark_df = None

for i, response in enumerate(responses):
    hourly = response.Hourly()

    # Hourly timestamps in UTC: [Time(), TimeEnd()) stepped by Interval() seconds.
    hourly_data = {"date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}

    # These are the *requested* coordinates and timezone, not the grid-cell
    # values the API reports back through response.Latitude()/Longitude().
    hourly_data["latitude"] = latitudes[i]
    hourly_data["longitude"] = longitudes[i]
    hourly_data["timezone"] = timezones[i]

    for position, variable_name in enumerate(requested_variables):
        hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data = hourly_data)

    # Convert this location's pandas frame to Spark and stack it onto the result.
    hourly_spark_df = spark.createDataFrame(hourly_dataframe)
    if spark_df is None:
        spark_df = hourly_spark_df
    else:
        spark_df = spark_df.unionByName(hourly_spark_df)

if spark_df is None:
    raise ValueError("The API returned no responses; there is nothing to ingest")

Coordinates 38.84006881713867°N -77.09014892578125°E
Elevation 3.0 m asl
Timezone b'America/New_York' b'EST'
Timezone difference to GMT+0 -18000 s
Coordinates 40.738136291503906°N -74.04254150390625°E
Elevation 28.0 m asl
Timezone b'America/New_York' b'EST'
Timezone difference to GMT+0 -18000 s
Coordinates 25.694198608398438°N -80.3046875°E
Elevation 4.0 m asl
Timezone b'America/New_York' b'EST'
Timezone difference to GMT+0 -18000 s
Coordinates 33.98945236206055°N -118.20223999023438°E
Elevation 50.0 m asl
Timezone b'America/Los_Angeles' b'PST'
Timezone difference to GMT+0 -28800 s
Coordinates 32.72407531738281°N -96.81317138671875°E
Elevation 125.0 m asl
Timezone b'America/Chicago' b'CST'
Timezone difference to GMT+0 -21600 s
Coordinates 47.62741470336914°N -122.32290649414062°E
Elevation 0.0 m asl
Timezone b'America/Los_Angeles' b'PST'
Timezone difference to GMT+0 -28800 s
Coordinates 39.68365478515625°N -105.0°E
Elevation 1628.0 m asl
Timezone b'America/Denver' b'MST'
Timezone diffe

In [0]:
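# Row-count note: 129912 hourly rows per location (from an earlier run; the count shown below divided by the 25 locations gives 130272)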

In [0]:
# Preview the combined frame (show() prints the first 20 rows by default).
spark_df.show()  # Displays the Spark DataFrame

In [0]:
# Print the column names and types of the combined frame.
spark_df.printSchema()

In [0]:
# Total row count across all locations. count() is an action, so it forces the
# unions assembled in the loop above to be evaluated.
spark_df.count()

3256800


In [0]:
# Append the newly fetched rows to the landing-zone dataset. `s3_path` (defined
# with the bucket settings) is the same location the path was previously
# re-spelled from, so the target is now defined in exactly one place.
# NOTE: "append" is not idempotent — re-running this cell writes the same rows again.
spark_df.write.mode("append").parquet(s3_path)




In [0]:
# Record the new ingestion date in the tracking frame: the row whose file_name
# equals `file_name` gets `end_date`; every other row keeps its current value.
df_update = df_update.withColumn(
    "last_updated",
    F.when(F.col("file_name") == file_name, F.lit(end_date))
     .otherwise(F.col("last_updated")),
)
df_update.show()

+--------------------+------------+
|           file_name|last_updated|
+--------------------+------------+
|hourly_historical...|            |
|7day_daily_foreca...|            |
|daily_historical_...|            |
|7day_hourly_forec...|            |
|daily_historical....|            |
|hourly_historical...|  2024-11-10|
+--------------------+------------+


In [0]:
# Persist the updated tracking table back to the path it was loaded from.
# `df_update` is still backed by the files under this path, and an overwrite
# replaces them, so writing it directly is only safe while its cache stays
# fully materialised. The table is small (one row per tracked file), so the rows
# are pulled to the driver first and written from a frame with no link to the
# source files.
updated_rows = df_update.collect()
updated_schema = df_update.schema
(
    spark.createDataFrame(updated_rows, schema=updated_schema)
    .write.mode("overwrite")
    .parquet(f"s3://{bucket_name}/Gold/last_updated.parquet")
)


