In [0]:
from pyspark.sql.functions import col, lit, floor, when, concat_ws, to_timestamp, lpad, lag, lead, avg, hour, dayofyear
from pyspark.sql.window import Window
from pyspark.sql import Row
from datetime import datetime, timedelta
import pandas as pd
import re

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, GBTRegressor

## Data Preparation

In [0]:
# Source locations on the mounted weather storage.
CITIES_DIM_PATH = '/mnt/de-upskilling-weather/Silver/cities_dim.parquet/'
HOURLY_HISTORICAL_PATH = '/mnt/de-upskilling-weather/Gold/hourly_historical.parquet/'

# Plain Python list with the value of the 'city' column for every row of the
# cities dimension; it drives the per-city training loop further down.
cities_list = [
    city_row[0]
    for city_row in spark.read.parquet(CITIES_DIM_PATH).select("city").collect()
]

# Columns kept from the hourly table. 'hour' is derived from 'datetime' in the
# same projection, i.e. before 'datetime' is passed through to_timestamp below.
historical_projection = [
    'city',
    'year',
    'month',
    hour('datetime').alias('hour'),
    'datetime',
    'temperature_2m',
    'dew_point_2m',
    'shortwave_radiation',
    'surface_pressure',
    'pressure_msl',
    'cloud_cover',
]

hourly_historical = (
    spark.read.parquet(HOURLY_HISTORICAL_PATH)
    .select(*historical_projection)
    # convert 'datetime' with to_timestamp, then derive the day-of-year feature from it
    .withColumn('datetime', to_timestamp('datetime'))
    .withColumn('day_of_year', dayofyear('datetime'))
)

In [0]:
# Season feature: encodes the calendar month as an integer season id.
#   0 = Winter (months 12, 1, 2)
#   1 = Spring (months 3, 4, 5)
#   2 = Summer (months 6, 7, 8)
#   3 = Fall   (every remaining value, i.e. months 9, 10, 11 -- the
#               otherwise() branch also catches anything not listed above)
month_col = col("month")
season_expr = (
    when(month_col.isin([12, 1, 2]), 0)
    .when(month_col.isin([3, 4, 5]), 1)
    .when(month_col.isin([6, 7, 8]), 2)
    .otherwise(3)
)
hourly_historical = hourly_historical.withColumn("season", season_expr)

In [0]:
# Feature vector definition and the two regressors used for training.

# Input columns for the assembler. The order is significant: it fixes the
# position of each feature inside the assembled 'features' vector.
feature_columns = [
    'temp_lag_1hr',
    'rolling_temp_avg3hr',
    'temp_lag_3hr',
    'rolling_temp_avg6hr',
    'dew_point_2m',
    'season',
    'shortwave_radiation',
    'month',
    'day_of_year',
    'pressure_msl',
    'surface_pressure',
    'hour',
    'cloud_cover',
    'temp_lag_1day',
    'rolling_temp_avg1day',
]

# Packs the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Both estimators predict 'temperature_2m' from the assembled 'features' vector.
lr = LinearRegression(featuresCol='features', labelCol='temperature_2m')
gbtr = GBTRegressor(
    labelCol="temperature_2m",
    featuresCol="features",
    maxIter=150,
    maxDepth=7,
    stepSize=0.05,
)

In [0]:
# Window specifications for the time-series features.
# Every window orders rows by 'datetime' and defines no partitioning, so each
# one spans the whole frame it is applied to.
lag_window = Window.orderBy('datetime')

# Rolling frames are expressed in rows relative to the current row and stop at
# the previous row (-1), so the current row's own value is never included.
rolling_window_3hr = lag_window.rowsBetween(-3, -1)
rolling_window_6hr = lag_window.rowsBetween(-6, -1)
rolling_window_12hr = lag_window.rowsBetween(-12, -1)
rolling_window_24hr = lag_window.rowsBetween(-24, -1)

In [0]:
# generating models for each city

# Cities that are fitted with the GBT regressor; every other city uses linear
# regression. Defined once here instead of being rebuilt on every iteration.
GBT_CITIES = {'Amsterdam', 'Copenhagen', 'Dublin', 'Antwerp'}

# Columns actually consumed by the assembler and the regressors. Null filtering
# is restricted to these so that a null in a column the model never reads
# (e.g. 'year') does not discard an otherwise usable training row.
model_columns = [*feature_columns, 'temperature_2m']

for city in cities_list:
    print(f'working on {city}')
    # remove every run of non-word characters so the name can be embedded in a path
    city_no_whitespace = re.sub(r'\W+', '', city)
    model_path = f'/mnt/de-upskilling-weather/MachineLearning/{city_no_whitespace}_forecast_model'
    forecast_data_path = f'/mnt/de-upskilling-weather/Silver/Forecast_Data/{city_no_whitespace}_forecast_data.parquet'

    # getting dataset for specific city
    # (no explicit sort: each window spec below orders rows by 'datetime' itself)
    df = hourly_historical.where(col('city') == city).drop('city')

    # temperature lags -- offsets are counted in rows (1, 3 and 24 rows back)
    # NOTE(review): this equals 1h / 3h / 1 day only if the city has exactly one
    # row per hour with no gaps -- confirm against the Gold table.
    df = df.withColumn('temp_lag_1hr', lag('temperature_2m', 1).over(lag_window))
    df = df.withColumn('temp_lag_3hr', lag('temperature_2m', 3).over(lag_window))
    df = df.withColumn('temp_lag_1day', lag('temperature_2m', 24).over(lag_window))

    # temperature rolling averages over the preceding 3 / 6 / 24 rows
    df = df.withColumn('rolling_temp_avg3hr', avg('temperature_2m').over(rolling_window_3hr))
    df = df.withColumn('rolling_temp_avg6hr', avg('temperature_2m').over(rolling_window_6hr))
    df = df.withColumn('rolling_temp_avg1day', avg('temperature_2m').over(rolling_window_24hr))

    # The feature frame feeds two independent actions (the forecast-data write and
    # the model fit). Cache it so the city filter and the window computations are
    # not executed again for the second action; always release it afterwards.
    df.cache()
    try:
        # getting last 25 rows for forecasting purposes (written before null
        # filtering, restored to ascending 'datetime' order)
        forecast_data = df.orderBy('datetime', ascending=False).limit(25).orderBy('datetime')
        forecast_data.write.mode('overwrite').parquet(forecast_data_path)

        print(f'{city} forecast data written')

        # dropping all nulls created from lags (and any other null in a model column)
        df = df.dropna(subset=model_columns)

        # vectorizing features, generating model, and saving model
        train_data = assembler.transform(df).select('features', 'temperature_2m')

        # check for cities that need to be trained with a GBT Regressor
        if city in GBT_CITIES:
            print('Using GBT Regressor')
            forecast_model = gbtr.fit(train_data)
        else:
            forecast_model = lr.fit(train_data)

        forecast_model.write().overwrite().save(model_path)
        print(f'{city} forecast model written')
    finally:
        # unpersist() on the filtered frame still releases the cached data only
        # indirectly, so drop every cache entry built from this city's frame
        df.unpersist()

working on Vienna
Vienna forecast data written
Vienna forecast model written
working on Ljubljana
Ljubljana forecast data written
Ljubljana forecast model written
working on Denver
Denver forecast data written
Denver forecast model written
working on Paris
Paris forecast data written
Paris forecast model written
working on Zurich
Zurich forecast data written
Zurich forecast model written
working on London
London forecast data written
London forecast model written
working on Berlin
Berlin forecast data written
Berlin forecast model written
working on Philadelphia
Philadelphia forecast data written
Philadelphia forecast model written
working on Indianapolis
Indianapolis forecast data written
Indianapolis forecast model written
working on Nashville
Nashville forecast data written
Nashville forecast model written
working on Chicago
Chicago forecast data written
Chicago forecast model written
working on New York
New York forecast data written
New York forecast model written
working on Miami