In [0]:
from pyspark.sql.functions import col, when, to_timestamp, avg
from pyspark.sql.window import Window
from pyspark.sql import Row

from datetime import datetime, timedelta
import numpy as np
import re

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel, GBTRegressionModel

In [0]:
# model input columns, one per line; the sequence below fixes the layout of the
# assembled vector (presumably it has to match the order used at training time - TODO confirm)
feature_columns = [
    'temp_lag_1hr',
    'rolling_temp_avg3hr',
    'temp_lag_3hr',
    'rolling_temp_avg6hr',
    'dew_point_2m',
    'season',
    'shortwave_radiation',
    'month',
    'day_of_year',
    'pressure_msl',
    'surface_pressure',
    'hour',
    'cloud_cover',
    'temp_lag_1day',
    'rolling_temp_avg1day',
]

# packs the columns above into a single vector column named "features"
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [0]:
# city dimension table -> plain Python list holding the values of its 'city' column
cities_dim = spark.read.parquet('/mnt/de-upskilling-weather/Silver/cities_dim.parquet/')
cities_list = [city_row['city'] for city_row in cities_dim.select('city').collect()]

In [0]:
# calendar month (1-12) -> season code: 0 Winter, 1 Spring, 2 Summer, 3 Fall
month_to_season = {
    month: season_code
    for season_code, months_in_season in enumerate((
        (12, 1, 2),    # Winter
        (3, 4, 5),     # Spring
        (6, 7, 8),     # Summer
        (9, 10, 11),   # Fall
    ))
    for month in months_in_season
}

In [0]:
# hourly API forecast table, reduced to the listed columns and sorted by city, then date
api_forecast_columns = [
    'date',
    'city',
    'dew_point_2m',
    'shortwave_radiation',
    'surface_pressure',
    'pressure_msl',
    'cloud_cover',
]
api_forecasts_df = (
    spark.read.parquet('/mnt/de-upskilling-weather/Gold/7day_hourly_forecast.parquet/')
    .select(*api_forecast_columns)
    .orderBy('city', 'date')
)

In [0]:
# Recursive hourly temperature forecast, one city at a time.
#
# For every city: load its history and its saved model, then step forward one hour
# at a time. Each step builds a single feature row from the trailing temperatures
# (observed values first, then the model's own predictions as they are produced)
# plus the API-forecast weather variables for that hour, predicts the temperature,
# and feeds the prediction back into the trailing window. The resulting
# (timestamp, pred_temp) pairs are written to one parquet dataset per city.

FORECAST_HORIZON_HOURS = 168  # 7 days of hourly steps
LAG_WINDOW_HOURS = 24         # longest lag / rolling window any feature needs
GBT_MODEL_CITIES = {'Amsterdam', 'Copenhagen', 'Dublin', 'Antwerp'}  # saved as GBT; all others as linear regression
EXOGENOUS_COLUMNS = ['dew_point_2m', 'shortwave_radiation', 'surface_pressure', 'pressure_msl', 'cloud_cover']

for city in cities_list:

    # stripping non-word characters from the city name to build the storage paths
    city_no_ws = re.sub(r'\W+', '', city)
    df = spark.read.parquet(f'/mnt/de-upskilling-weather/Silver/Forecast_Data/{city_no_ws}_forecast_data.parquet/').withColumn('datetime', to_timestamp('datetime'))

    # row order is not guaranteed when reading parquet, so sort explicitly and
    # pull only the trailing window the lag features need (single Spark job)
    recent_rows = (
        df.select('datetime', 'temperature_2m')
        .orderBy(col('datetime').desc())
        .limit(LAG_WINDOW_HOURS)
        .collect()
    )
    if len(recent_rows) < LAG_WINDOW_HOURS:
        raise ValueError(f'{city}: need at least {LAG_WINDOW_HOURS} hourly observations, found {len(recent_rows)}')
    recent_rows.reverse()  # back to chronological order: oldest first, newest last
    np_temp_list = np.array([row['temperature_2m'] for row in recent_rows])

    # loading model
    model_path = f'/mnt/de-upskilling-weather/MachineLearning/{city_no_ws}_forecast_model/'
    if city in GBT_MODEL_CITIES:
        model = GBTRegressionModel.load(model_path)
    else:
        model = LinearRegressionModel.load(model_path)
    prediction_col = model.getPredictionCol()

    # first hour to forecast: one hour after the latest observation
    current_ts = recent_rows[-1]['datetime'] + timedelta(hours=1)

    # API forecast variables: one ordered collect keeps all five series aligned with
    # each other and with time. NOTE(review): row i is assumed to describe hour
    # current_ts + i, i.e. the API data is hourly without gaps - confirm upstream.
    api_rows = (
        api_forecasts_df
        .where((col('city') == city) & (col('date') >= current_ts))
        .select('date', *EXOGENOUS_COLUMNS)
        .orderBy('date')
        .limit(FORECAST_HORIZON_HOURS)
        .collect()
    )
    if len(api_rows) < FORECAST_HORIZON_HOURS:
        raise ValueError(f'{city}: need {FORECAST_HORIZON_HOURS} hourly API forecast rows from {current_ts}, found {len(api_rows)}')

    # used to create forecast dataframe after
    ts_list = []
    temp_preds = []

    for api_row in api_rows:
        # generating new row; np_temp_list[-1] is the temperature one hour before current_ts
        new_row = Row(month=current_ts.month,
                    hour=current_ts.hour,
                    datetime=current_ts,
                    day_of_year=current_ts.timetuple().tm_yday,
                    season=month_to_season[current_ts.month],
                    temp_lag_1hr=float(np_temp_list[-1]),
                    temp_lag_3hr=float(np_temp_list[-3]),
                    temp_lag_1day=float(np_temp_list[-24]),
                    rolling_temp_avg3hr=float(np.average(np_temp_list[-3:])),
                    rolling_temp_avg6hr=float(np.average(np_temp_list[-6:])),
                    rolling_temp_avg1day=float(np.average(np_temp_list[-24:])),
                    dew_point_2m=api_row['dew_point_2m'],
                    shortwave_radiation=api_row['shortwave_radiation'],
                    pressure_msl=api_row['pressure_msl'],
                    surface_pressure=api_row['surface_pressure'],
                    cloud_cover=api_row['cloud_cover']
                    )
        current_row_df = spark.createDataFrame([new_row])

        # applying model and getting prediction (read by column name, not position)
        prediction_data = assembler.transform(current_row_df).select('features')
        temp_pred = model.transform(prediction_data).select(prediction_col).first()[0]

        # sliding the window: drop the oldest value, append the newest prediction
        np_temp_list = np.append(np_temp_list[1:], temp_pred)

        # recording this step and moving on to the next hour
        ts_list.append(current_ts)
        temp_preds.append(temp_pred)
        current_ts = current_ts + timedelta(hours=1)

    forecast_df = spark.createDataFrame(list(zip(ts_list, temp_preds)), schema=['timestamp', 'pred_temp'])
    forecast_df.write.mode('overwrite').parquet(f'/mnt/de-upskilling-weather/Silver/Forecasts/{city_no_ws}_forecast.parquet/')
    print(f'finished with {city}')

finished with Vienna
finished with Ljubljana
finished with Denver
finished with Paris
finished with Zurich
finished with London
finished with Berlin
finished with Philadelphia
finished with Indianapolis
finished with Nashville
finished with Chicago
finished with New York
finished with Miami
finished with Washington, DC
finished with Oklahoma City
finished with Dallas
finished with Seattle
finished with Los Angeles
finished with Prague
finished with Copenhagen
finished with Amsterdam
finished with Dublin
finished with Antwerp
finished with Munich
finished with Madrid
