In [0]:
from pyspark.sql.functions import col, lit, floor, when, concat_ws, to_timestamp, lpad, lag, lead, avg, hour, dayofyear
from pyspark.sql.window import Window
from pyspark.sql import Row
from datetime import datetime, timedelta
import pandas as pd
import re

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor

# Cities whose temperature forecast is produced by a per-city GBT model
cities_list = ['Amsterdam', 'Copenhagen', 'Dublin', 'Antwerp']

# API forecast rows for those cities, trimmed to the columns selected below
api_forecasts_df = (
    spark.read.parquet('/mnt/de-upskilling-weather/Gold/7day_hourly_forecast.parquet/')
    .where(col('city').isin(cities_list))
    .select(
        'date',
        'city',
        'temperature_2m',
        'dew_point_2m',
        'shortwave_radiation',
        'surface_pressure',
        'pressure_msl',
        'cloud_cover',
        'relative_humidity_2m',
    )
    .orderBy('city', 'date')
)

# Historical observations for the same cities (all columns kept)
hourly_historical = (
    spark.read.parquet('/mnt/de-upskilling-weather/Gold/hourly_historical.parquet/')
    .where(col('city').isin(cities_list))
)

# Model generation preparation.
# The order of this list fixes the layout of the assembled feature vector,
# so it must stay the same between training and any later scoring.
feature_columns = [
    'temp_lag_1hr',
    'rolling_temp_avg3hr',
    'temp_lag_3hr',
    'rolling_temp_avg6hr',
    'dew_point_2m',
    'season',
    'shortwave_radiation',
    'month',
    'day_of_year',
    'pressure_msl',
    'surface_pressure',
    'hour',
    'cloud_cover',
    'temp_lag_1day',
    'rolling_temp_avg1day',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
gbt = GBTRegressor(
    labelCol="temperature_2m",
    featuresCol="features",
    maxIter=150,
    maxDepth=7,
    stepSize=0.05,
)


def _rows_before(n_rows):
    """Return a window over the `n_rows` rows preceding the current row, ordered by `datetime`."""
    return Window.orderBy('datetime').rowsBetween(-n_rows, -1)


# Lag and rolling windows.
# NOTE(review): none of these windows is partitioned, so they are only meaningful on a
# DataFrame that already holds a single city; the "hr" names assume one row per hour.
lag_window = Window.orderBy('datetime')
rolling_window_3hr = _rows_before(3)
rolling_window_6hr = _rows_before(6)
rolling_window_12hr = _rows_before(12)
rolling_window_24hr = _rows_before(24)

# Month number -> season code used as a model feature
month_to_season = {
    # Winter
    12: 0, 1: 0, 2: 0,
    # Spring
    3: 1, 4: 1, 5: 1,
    # Summer
    6: 2, 7: 2, 8: 2,
    # Fall
    9: 3, 10: 3, 11: 3,
}

In [0]:
# Model Generation
# Trains one GBT temperature model per city and overwrites the saved model at
# /mnt/de-upskilling-weather/MachineLearning/<City>_forecast_model.
#
# NOTE(review): the non-lag feature columns (e.g. 'season', 'month', 'hour') are not
# created in this cell; they are presumably already present in `hourly_historical` — confirm.

# A row is usable for training only if every model input and the label are non-null.
required_columns = feature_columns + ['temperature_2m']

for city in cities_list:

    # Strip anything that is not a word character so the city name is safe in a path
    city_no_ws = re.sub(r'\W+', '', city)
    model_path = f'/mnt/de-upskilling-weather/MachineLearning/{city_no_ws}_forecast_model'

    # Getting dataset for the specific city. No explicit orderBy is needed here:
    # every window below imposes its own ordering by 'datetime'.
    df = hourly_historical.where(col('city') == city).drop('city')

    # temperature lags (row offsets; equal to hours only if there is one row per hour)
    df = df.withColumn('temp_lag_1hr', lag('temperature_2m', 1).over(lag_window))
    df = df.withColumn('temp_lag_3hr', lag('temperature_2m', 3).over(lag_window))
    df = df.withColumn('temp_lag_1day', lag('temperature_2m', 24).over(lag_window))

    # temperature rolling averages over the preceding rows (current row excluded)
    df = df.withColumn('rolling_temp_avg3hr', avg('temperature_2m').over(rolling_window_3hr))
    df = df.withColumn('rolling_temp_avg6hr', avg('temperature_2m').over(rolling_window_6hr))
    df = df.withColumn('rolling_temp_avg1day', avg('temperature_2m').over(rolling_window_24hr))

    # Dropping the nulls created by the lags. Restricting the check to the columns the
    # model actually reads keeps rows whose only nulls are in unrelated columns.
    df = df.dropna(subset=required_columns)

    # vectorizing features, generating model, and saving model
    train_data = assembler.transform(df).select('features', 'temperature_2m')
    forecast_model = gbt.fit(train_data)

    forecast_model.write().overwrite().save(model_path)