In [0]:
from pyspark.sql.functions import col, lit, floor, when, concat_ws, to_timestamp, lpad, lag, lead, avg, hour, dayofmonth, month, year, dayofyear
from pyspark.sql.window import Window
from pyspark.sql import Row
from datetime import datetime, timedelta
import pandas as pd

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

from sklearn.feature_selection import SelectKBest, f_regression

# Data Preparation (Cleaning and Feature Engineering)

For the second iteration of the weather prediction model, we want to drastically improve the accuracy of the model as results from the previous iteration were extremely inaccurate due to recursive predictions.
<br>
This includes making multiple changes and additions:
1. Switch from Linear Regression to Random Forest Regression or Gradient Boosted Regression
2. Utilize weather forecast data to supplement predictions

In [0]:
# ingesting New York data to use for model testing/training
new_york_hourly = spark.read.parquet('/mnt/de-upskilling-weather/Gold/hourly_historical.parquet').where(col('city') == 'New York')

new_york_hourly = new_york_hourly.drop('latitude', 'longitude', 'city', 'offset_seconds', 'country', 'wind_speed_100m', 'wind_direction_10m', 'wind_direction_100m', 'time_zone', 'Unix_Time', 'time').orderBy('datetime')

In [0]:
# This data will be used to supplement future predictions

new_york_hourly_forecast = spark.read.parquet('/mnt/de-upskilling-weather/Gold/7day_hourly_forecast.parquet').where(col('city') == 'New York')
new_york_hourly_forecast = new_york_hourly_forecast.drop('city', 'country', 'latitude', 'longitude', 'timezone', 'weather_code', 'wind_speed_80m', 'wind_speed_120m', 'wind_speed_180m', 'wind_direction_80m', 'wind_direction_120m', 'wind_direction_180m', 'time_zone', 'Time', 'offset_seconds', 'Local_Time').orderBy('date')

In [0]:
# reordering columns and adding hour column
new_york_hourly = new_york_hourly.select('datetime', 'Local_Time', 'year', 'month', 'day', hour('datetime').alias('hour'), 'temperature_2m', 'relative_humidity_2m', 'precipitation', 'rain', 'snowfall', 'wind_speed_10m', 'wind_gusts_10m', 'cloud_cover', 'dew_point_2m', 'shortwave_radiation', 'pressure_msl', 'surface_pressure')

## Feature Engineering
- Engineering as many potentially relevant features to test in SelectKBest and GridSearch

In [0]:
# seasons feature
# Winter(0) = 12, 1, 2
# Spring(1) = 3, 4, 5
# Summer(2) = 6, 7, 8
# Fall(3) = 9, 10, 11

new_york_hourly = new_york_hourly.withColumn("season", when(col("month").isin([12, 1, 2]), 0)
                         .when(col("month").isin([3, 4, 5]), 1)
                         .when(col("month").isin([6, 7, 8]), 2)
                         .otherwise(3))
display(new_york_hourly.limit(50))

datetime,Local_Time,year,month,day,hour,temperature_2m,relative_humidity_2m,precipitation,rain,snowfall,wind_speed_10m,wind_gusts_10m,cloud_cover,dew_point_2m,shortwave_radiation,pressure_msl,surface_pressure,season
2010-01-01 05:00:00,2010-01-01 01:00:00,2010,1,1,5,-1.5670000314712524,96.3861312866211,0.5,0.0999999940395355,0.2800000011920929,3.545588731765747,9.720000267028809,99.0,-2.067000150680542,0.0,1016.5,1012.9271240234376,0
2010-01-01 06:00:00,2010-01-01 02:00:00,2010,1,1,6,-1.1169999837875366,96.0456771850586,0.4000000059604645,0.0999999940395355,0.2100000083446502,3.3190360069274902,10.079999923706056,100.0,-1.6669999361038208,0.0,1015.9000244140624,1012.3352661132812,0
2010-01-01 07:00:00,2010-01-01 03:00:00,2010,1,1,7,-0.8669999837875366,96.7607421875,0.3000000119209289,0.1000000089406967,0.1400000005960464,3.877318382263184,7.559999465942383,99.0,-1.3170000314712524,0.0,1015.5999755859376,1012.0394897460938,0
2010-01-01 08:00:00,2010-01-01 04:00:00,2010,1,1,8,-0.9670000076293944,97.47035217285156,0.1000000014901161,0.0,0.0700000002980232,3.545588731765747,7.559999465942383,95.0,-1.3170000314712524,0.0,1015.5999755859376,1012.0382080078124,0
2010-01-01 09:00:00,2010-01-01 05:00:00,2010,1,1,9,-0.8169999718666077,97.47345733642578,0.0,0.0,0.0,3.415259599685669,6.839999675750732,97.0,-1.1669999361038208,0.0,1015.0,1011.4422607421876,0
2010-01-01 10:00:00,2010-01-01 06:00:00,2010,1,1,10,-0.7170000076293945,97.83265686035156,0.0,0.0,0.0,0.0,5.399999618530273,100.0,-1.0169999599456787,0.0,1014.5,1010.9451904296876,0
2010-01-01 11:00:00,2010-01-01 07:00:00,2010,1,1,11,-0.5170000195503235,97.47962951660156,0.0,0.0,0.0,1.6099690198898315,5.039999961853027,100.0,-0.8669999837875366,0.0,1014.2000122070312,1010.649169921875,0
2010-01-01 12:00:00,2010-01-01 08:00:00,2010,1,1,12,-0.6669999957084656,97.8335418701172,0.0,0.0,0.0,2.595996856689453,5.039999961853027,98.0,-0.9670000076293944,0.0,1014.4000244140624,1010.8463134765624,0
2010-01-01 13:00:00,2010-01-01 09:00:00,2010,1,1,13,-0.6169999837875366,97.47756958007812,0.0,0.0,0.0,3.7585103511810303,6.479999542236328,97.0,-0.9670000076293944,5.0,1014.4000244140624,1010.8472290039062,0
2010-01-01 14:00:00,2010-01-01 10:00:00,2010,1,1,14,0.2329999953508377,91.97692108154295,0.0,0.0,0.0,4.679999828338623,8.640000343322754,86.0,-0.9169999957084656,53.0,1013.9000244140624,1010.3596801757812,0


In [0]:
# Windows for time-series features
lag_window = Window.orderBy('datetime')
rolling_window_3hr = Window.orderBy('datetime').rowsBetween(-3, -1)
rolling_window_6hr = Window.orderBy('datetime').rowsBetween(-6, -1)
rolling_window_12hr = Window.orderBy('datetime').rowsBetween(-12, -1)
rolling_window_24hr = Window.orderBy('datetime').rowsBetween(-24, -1) 

In [0]:
# temperature lags
new_york_hourly = new_york_hourly.withColumn('temp_lag_1hr', lag('temperature_2m', 1).over(lag_window))
new_york_hourly = new_york_hourly.withColumn('temp_lag_3hr', lag('temperature_2m', 3).over(lag_window))
new_york_hourly = new_york_hourly.withColumn('temp_lag_12hr', lag('temperature_2m', 12).over(lag_window))
new_york_hourly = new_york_hourly.withColumn('temp_lag_1day', lag('temperature_2m', 24).over(lag_window))

In [0]:
# rolling averages
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg3hr', avg('temperature_2m').over(rolling_window_3hr))
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg6hr', avg('temperature_2m').over(rolling_window_6hr))
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg12hr', avg('temperature_2m').over(rolling_window_12hr))
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg1day', avg('temperature_2m').over(rolling_window_24hr))

In [0]:
# removing null values generated from lags
new_york_hourly = new_york_hourly.dropna()

In [0]:
new_york_hourly = new_york_hourly.withColumn('day_of_year', dayofyear('datetime'))

# Feature Selection (SelectKBest)

In [0]:
new_york_hourly_pandas = new_york_hourly.drop('datetime', 'Local_Time').toPandas()

In [0]:
x = new_york_hourly_pandas.drop('temperature_2m', axis=1)
y = new_york_hourly_pandas['temperature_2m']

In [0]:
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(x, y)

feature_scores = pd.DataFrame({
    'Feature': x.columns,
    'Score': selector.scores_
}).sort_values(by='Score', ascending=False)

print(feature_scores)

                 Feature         Score
16          temp_lag_1hr  1.424083e+07
20   rolling_temp_avg3hr  4.556206e+06
17          temp_lag_3hr  2.057509e+06
21   rolling_temp_avg6hr  1.886591e+06
23  rolling_temp_avg1day  9.921568e+05
22  rolling_temp_avg12hr  9.096133e+05
19         temp_lag_1day  7.776869e+05
11          dew_point_2m  6.575459e+05
18         temp_lag_12hr  3.176082e+05
15                season  6.384829e+04
12   shortwave_radiation  2.230421e+04
1                  month  1.193514e+04
24           day_of_year  1.169676e+04
13          pressure_msl  8.708999e+03
14      surface_pressure  7.667229e+03
3                   hour  3.580890e+03
8         wind_speed_10m  1.870870e+03
7               snowfall  1.678817e+03
10           cloud_cover  5.550501e+02
9         wind_gusts_10m  4.710462e+02
6                   rain  3.514937e+02
5          precipitation  1.053755e+02
0                   year  2.588929e+01
2                    day  1.939483e+01
4   relative_humidity_2m 

## SelectKBest Summary/Results
- From the SelectKBest results above, we can clearly see that the time-series generated features (lags and rolling averages) have the greatest correlation with our target variable. 
- Other important features that play a pivotal role include 'dew_point_2m', 'shortwave_radiation' (UV-index variable), 'pressure_msl' (sea level pressure), and 'surface_pressure.'
- As we want to select a good k-value for the number of features to prevent overfitting, a good mix of forecast variables and time-series variables were chosen, leaving us with 15 features used to generate temperature forecasts:
  - 'temp_lag_1hr' - temperature 1 hour ago
  - 'rolling_temp_avg3hr' - average temperature of past 3 hours
  - 'temp_lag_3hr' - temperature 3 hours ago
  - 'rolling_temp_avg6hr' - average temperature of past 6 hours
  - 'temp_lag_1day' - temperature 1 day ago
  - 'rolling_temp_avg1day' - average temperature of past day
  - 'dew_point_2m' - temperature at which dew starts to form
  - 'season' - 0, 1, 2, 3 correlating to Winter, Spring, Summer, Fall
  - 'shortwave_radiation' - UV Index Value
  - 'month' - month of current date
  - 'day_of_year' - day of year (0-365)
  - 'pressure_msl' - sea level barometric pressure
  - 'surface_pressure' - surface level barometric pressure
  - 'hour' - current hour
  - 'cloud_cover' - visibility measurement

# Regression Model Testing

In [0]:
# splitting dataset into test/train
train_data = new_york_hourly.where(col('year') < 2022)
test_data = new_york_hourly.where(col('year') >= 2022)
print(train_data.count(), test_data.count())

105163 26760


In [0]:
feature_columns = ['temp_lag_1hr', 'rolling_temp_avg3hr', 'temp_lag_3hr', 'rolling_temp_avg6hr', 'dew_point_2m', 'season', 'shortwave_radiation', 'month', 'day_of_year', 'pressure_msl', 'surface_pressure', 'hour', 'cloud_cover', 'temp_lag_1day', 'rolling_temp_avg1day']
feature_columns

['temp_lag_1hr',
 'rolling_temp_avg3hr',
 'temp_lag_3hr',
 'rolling_temp_avg6hr',
 'dew_point_2m',
 'season',
 'shortwave_radiation',
 'month',
 'day_of_year',
 'pressure_msl',
 'surface_pressure',
 'hour',
 'cloud_cover',
 'temp_lag_1day',
 'rolling_temp_avg1day']

In [0]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import LinearRegression

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
train_data = assembler.transform(train_data).select("features", "temperature_2m")
test_data = assembler.transform(test_data).select("features", "temperature_2m")

In [0]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [0]:
train_data_pandas = train_data.toPandas()
test_data_pandas = test_data.toPandas()

X_train = train_data_pandas[feature_columns]
X_test = test_data_pandas[feature_columns]
Y_train = train_data_pandas['temperature_2m']
Y_test = test_data_pandas['temperature_2m']

In [0]:
# linear regression model
lr = LinearRegression(labelCol="temperature_2m", featuresCol="features")
new_york_lr_model = lr.fit(train_data)

In [0]:
# utilizing SKLearn gridsearch to find the best hyperparameters for RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 7, 10],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,                    # 3-fold cross-validation
    scoring="neg_root_mean_squared_error",  # Metric to optimize (RMSE)
    verbose=2,               # Display progress
    n_jobs=-1                # Use all available CPU cores
)

grid_search.fit(X_train, Y_train)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [0]:
print("Best Hyperparameters:", grid_search.best_params_)

# Best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print(f"Test RMSE: {rmse}")

Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
Test RMSE: 0.6762448107774712


In [0]:
# Gradient Boosting Regressor Grid Search
gbr = GradientBoostingRegressor(random_state=42)

param_grid = {
    "n_estimators": [50, 100, 150, 200],  # Number of boosting stages
    "learning_rate": [0.01, 0.05, 0.1],   # Shrinks the contribution of each tree
    "max_depth": [5, 7, 10]               # Maximum depth of individual trees
}

grid_search_gbr = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=3,                    # 3-fold cross-validation
    scoring="neg_root_mean_squared_error",  # Metric to optimize (RMSE)
    verbose=2,                              # Display progress
    n_jobs=-1
)



In [0]:
grid_search_gbr.fit(X_train, Y_train)

print("Best Hyperparameters:", grid_search_gbr.best_params_)

# Best model
best_model = grid_search_gbr.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print(f"Test RMSE: {rmse}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200}
Test RMSE: 0.6277181242867712


In [0]:
# training rfr model based on historical data
rfr = RandomForestRegressor(labelCol="temperature_2m", featuresCol="features", numTrees=200, maxDepth=10)
new_york_rfr_model = rfr.fit(train_data)

In [0]:
# training gbt model based on historical data
gbtr = GBTRegressor(labelCol="temperature_2m", featuresCol="features", maxIter=50)
new_york_gbtr_model = gbtr.fit(train_data)

In [0]:
predictions = new_york_lr_model.transform(test_data)

In [0]:
display(predictions)

features,temperature_2m,prediction
"Map(vectorType -> dense, length -> 14, values -> List(10.715499877929688, 10.68216641743978, 10.565499305725098, 10.682166258494059, 9.315499305725098, 0.0, 0.0, 1.0, 1.0, 1021.5999755859375, 1020.12255859375, 0.0, 94.7772216796875, 100.0))",10.11549949645996,10.243365238941948
"Map(vectorType -> dense, length -> 14, values -> List(10.115499496459961, 10.532166481018066, 10.76550006866455, 10.565499623616537, 9.415499687194824, 0.0, 0.0, 1.0, 1.0, 1021.4000244140625, 1019.9226684570312, 1.0, 96.0567855834961, 99.0))",10.01550006866455,10.044083111841363
"Map(vectorType -> dense, length -> 14, values -> List(10.01550006866455, 10.282166481018066, 10.715499877929688, 10.45716635386149, 9.415499687194824, 0.0, 0.0, 1.0, 1.0, 1021.9000244140625, 1020.4225463867188, 2.0, 95.09834289550781, 95.0))",10.165499687194824,10.182868530626545
"Map(vectorType -> dense, length -> 14, values -> List(10.165499687194824, 10.098833084106445, 10.115499496459961, 10.390499750773111, 9.215499877929688, 0.0, 0.0, 1.0, 1.0, 1021.7000122070312, 1020.2218627929688, 3.0, 95.09077453613281, 99.0))",9.965499877929688,10.058948414954315
"Map(vectorType -> dense, length -> 14, values -> List(9.965499877929688, 10.048833211263021, 10.01550006866455, 10.290499846140543, 9.215499877929688, 0.0, 0.0, 1.0, 1.0, 1021.7999877929688, 1020.3216552734375, 4.0, 95.09077453613281, 100.0))",9.965499877929688,9.99288338621738
"Map(vectorType -> dense, length -> 14, values -> List(9.965499877929688, 10.032166481018066, 10.165499687194824, 10.157166481018066, 9.715499877929688, 0.0, 0.0, 1.0, 1.0, 1022.0, 1020.52392578125, 5.0, 95.10967254638672, 100.0))",10.465499877929688,10.42372168225942
"Map(vectorType -> dense, length -> 14, values -> List(10.465499877929688, 10.132166544596354, 9.965499877929688, 10.1154998143514, 10.065499305725098, 0.0, 0.0, 1.0, 1.0, 1021.7999877929688, 1020.327392578125, 6.0, 93.55670166015625, 98.0))",11.065499305725098,11.06463781707184
"Map(vectorType -> dense, length -> 14, values -> List(11.065499305725098, 10.498833020528158, 9.965499877929688, 10.27383311589559, 10.315499305725098, 0.0, 0.0, 1.0, 1.0, 1021.7999877929688, 1020.3291015625, 7.0, 93.259765625, 100.0))",11.36549949645996,11.432748980199616
"Map(vectorType -> dense, length -> 14, values -> List(11.365499496459961, 10.965499560038248, 10.465499877929688, 10.498833020528158, 10.365499496459961, 0.0, 0.0, 1.0, 1.0, 1021.5, 1020.0311279296875, 8.0, 91.43170928955078, 100.0))",11.715499877929688,11.734651372207727
"Map(vectorType -> dense, length -> 14, values -> List(11.715499877929688, 11.382166226704916, 11.065499305725098, 10.757166385650635, 10.365499496459961, 0.0, 25.0, 1.0, 1.0, 1021.2999877929688, 1019.8313598632812, 9.0, 91.73400115966797, 100.0))",11.665499687194824,11.778642904101568


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator_rmse = RegressionEvaluator(labelCol="temperature_2m", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="temperature_2m", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 value (R2): {r2}")

Root Mean Squared Error (RMSE): 0.15992545409942469
R^2 value (R2): 0.9992100534044579


# Forecast Generation

In [0]:
new_york_hourly.tail(1)[0][0]

'2025-01-06 23:00:00'

In [0]:
new_york_current = new_york_hourly.drop('Local_Time').where(col('datetime') <= str(new_york_hourly.tail(1)[0][0])).orderBy('datetime', ascending=False).limit(25).orderBy('datetime')
display(new_york_current)

datetime,year,month,day,hour,temperature_2m,relative_humidity_2m,precipitation,rain,snowfall,wind_speed_10m,wind_gusts_10m,cloud_cover,dew_point_2m,shortwave_radiation,pressure_msl,surface_pressure,season,temp_lag_1hr,temp_lag_3hr,temp_lag_12hr,temp_lag_1day,rolling_temp_avg3hr,rolling_temp_avg6hr,rolling_temp_avg12hr,rolling_temp_avg1day,day_of_year
2025-01-05 23:00:00,2025,1,5,23,-2.2280001640319824,55.13146209716797,0.0,0.0,0.0,11.22513198852539,27.35999870300293,48.0,-10.027999877929688,0.0,1018.0,1014.4132690429688,0,-1.777999997138977,0.1220000013709068,-3.4780001640319824,-2.0280001163482666,-0.7780000095566114,-0.5113333476086458,-1.5280000325292349,-2.257166721237202,5
2025-01-06 00:00:00,2025,1,6,0,-2.328000068664551,56.42293548583984,0.0,0.0,0.0,12.362475395202637,27.71999931335449,53.0,-9.82800006866455,0.0,1018.2999877929688,1014.7108764648438,0,-2.2280001640319824,-0.6780000329017639,-3.4780001640319824,-2.2280001640319824,-1.5613333980242412,-0.7696667027970155,-1.423833365862568,-2.2655000565573573,6
2025-01-06 01:00:00,2025,1,6,1,-2.128000020980835,56.92303085327149,0.0,0.0,0.0,12.18960189819336,27.35999870300293,100.0,-9.527999877929688,0.0,1018.0,1014.4144897460938,0,-2.328000068664551,-1.777999997138977,-3.177999973297119,-2.2280001640319824,-2.11133340994517,-1.1030000460644562,-1.328000024581949,-2.269666719250381,6
2025-01-06 02:00:00,2025,1,6,2,-2.328000068664551,59.84490966796875,0.0,0.0,0.0,11.103242874145508,26.63999938964844,99.0,-9.07800006866455,0.0,1017.7999877929688,1014.212646484375,0,-2.128000020980835,-2.2280001640319824,-2.5280001163482666,-2.427999973297119,-2.2280000845591226,-1.503000047057867,-1.2405000285555918,-2.26550004662325,6
2025-01-06 03:00:00,2025,1,6,3,-2.4780001640319824,62.19043350219727,0.0,0.0,0.0,10.09605884552002,24.11999893188477,100.0,-8.727999687194824,0.0,1017.7999877929688,1014.2107543945312,0,-2.328000068664551,-2.328000068664551,-1.6279999017715454,-2.7280001640319824,-2.261333386103312,-1.9113333920637767,-1.2238333579152822,-2.2613333839302263,6
2025-01-06 04:00:00,2025,1,6,4,-2.628000020980835,64.3725814819336,0.0,0.0,0.0,9.007196426391602,21.959999084472656,100.0,-8.428000450134277,0.0,1018.0,1014.4078979492188,0,-2.4780001640319824,-2.128000020980835,-0.977999985218048,-2.9780001640319824,-2.311333417892456,-2.211333413918813,-1.294666713103652,-2.25091671726356,6
2025-01-06 05:00:00,2025,1,6,5,-2.828000068664551,66.10132598876953,0.0,0.0,0.0,8.641874313354492,20.15999984741211,100.0,-8.277999877929688,0.0,1018.0,1014.4053344726562,0,-2.628000020980835,-2.328000068664551,-0.6780000329017639,-3.2780001163482666,-2.4780000845591226,-2.3530000845591226,-1.4321667160838842,-2.236333377969762,6
2025-01-06 06:00:00,2025,1,6,6,-2.7280001640319824,66.63963317871094,0.0,0.0,0.0,8.647496223449707,18.71999931335449,100.0,-8.07800006866455,0.0,1017.7000122070312,1014.1076049804688,0,-2.828000068664551,-2.4780001640319824,-0.328000009059906,-3.427999973297119,-2.644666751225789,-2.453000068664551,-1.6113333857307832,-2.21758337598294,6
2025-01-06 07:00:00,2025,1,6,7,-2.628000020980835,66.9212417602539,0.0,0.0,0.0,7.758814334869385,19.799999237060547,100.0,-7.927999973297119,0.0,1017.4000244140624,1013.8099975585938,0,-2.7280001640319824,-2.628000020980835,0.2719999849796295,-3.628000020980835,-2.7280000845591226,-2.519666751225789,-1.8113333986451228,-2.18841671726356,6
2025-01-06 08:00:00,2025,1,6,8,-2.578000068664551,67.45279693603516,0.0,0.0,0.0,7.613381862640381,16.919998168945312,100.0,-7.777999877929687,0.0,1017.4000244140624,1013.8106079101562,0,-2.628000020980835,-2.828000068664551,0.1220000013709068,-3.677999973297119,-2.7280000845591226,-2.6030000845591226,-2.053000065808495,-2.146750050596893,6


In [0]:
new_york_current = new_york_current.drop('year', 'day', 'precipitation', 'dayornight', 'precipitation_check', 'wind_gusts_10m').withColumn('datetime', to_timestamp('datetime'))
display(new_york_current)

datetime,month,hour,temperature_2m,relative_humidity_2m,rain,snowfall,wind_speed_10m,cloud_cover,dew_point_2m,shortwave_radiation,pressure_msl,surface_pressure,season,temp_lag_1hr,temp_lag_3hr,temp_lag_12hr,temp_lag_1day,rolling_temp_avg3hr,rolling_temp_avg6hr,rolling_temp_avg12hr,rolling_temp_avg1day,day_of_year
2025-01-05T23:00:00Z,1,23,-2.2280001640319824,55.13146209716797,0.0,0.0,11.22513198852539,48.0,-10.027999877929688,0.0,1018.0,1014.4132690429688,0,-1.777999997138977,0.1220000013709068,-3.4780001640319824,-2.0280001163482666,-0.7780000095566114,-0.5113333476086458,-1.5280000325292349,-2.257166721237202,5
2025-01-06T00:00:00Z,1,0,-2.328000068664551,56.42293548583984,0.0,0.0,12.362475395202637,53.0,-9.82800006866455,0.0,1018.2999877929688,1014.7108764648438,0,-2.2280001640319824,-0.6780000329017639,-3.4780001640319824,-2.2280001640319824,-1.5613333980242412,-0.7696667027970155,-1.423833365862568,-2.2655000565573573,6
2025-01-06T01:00:00Z,1,1,-2.128000020980835,56.92303085327149,0.0,0.0,12.18960189819336,100.0,-9.527999877929688,0.0,1018.0,1014.4144897460938,0,-2.328000068664551,-1.777999997138977,-3.177999973297119,-2.2280001640319824,-2.11133340994517,-1.1030000460644562,-1.328000024581949,-2.269666719250381,6
2025-01-06T02:00:00Z,1,2,-2.328000068664551,59.84490966796875,0.0,0.0,11.103242874145508,99.0,-9.07800006866455,0.0,1017.7999877929688,1014.212646484375,0,-2.128000020980835,-2.2280001640319824,-2.5280001163482666,-2.427999973297119,-2.2280000845591226,-1.503000047057867,-1.2405000285555918,-2.26550004662325,6
2025-01-06T03:00:00Z,1,3,-2.4780001640319824,62.19043350219727,0.0,0.0,10.09605884552002,100.0,-8.727999687194824,0.0,1017.7999877929688,1014.2107543945312,0,-2.328000068664551,-2.328000068664551,-1.6279999017715454,-2.7280001640319824,-2.261333386103312,-1.9113333920637767,-1.2238333579152822,-2.2613333839302263,6
2025-01-06T04:00:00Z,1,4,-2.628000020980835,64.3725814819336,0.0,0.0,9.007196426391602,100.0,-8.428000450134277,0.0,1018.0,1014.4078979492188,0,-2.4780001640319824,-2.128000020980835,-0.977999985218048,-2.9780001640319824,-2.311333417892456,-2.211333413918813,-1.294666713103652,-2.25091671726356,6
2025-01-06T05:00:00Z,1,5,-2.828000068664551,66.10132598876953,0.0,0.0,8.641874313354492,100.0,-8.277999877929688,0.0,1018.0,1014.4053344726562,0,-2.628000020980835,-2.328000068664551,-0.6780000329017639,-3.2780001163482666,-2.4780000845591226,-2.3530000845591226,-1.4321667160838842,-2.236333377969762,6
2025-01-06T06:00:00Z,1,6,-2.7280001640319824,66.63963317871094,0.0,0.0,8.647496223449707,100.0,-8.07800006866455,0.0,1017.7000122070312,1014.1076049804688,0,-2.828000068664551,-2.4780001640319824,-0.328000009059906,-3.427999973297119,-2.644666751225789,-2.453000068664551,-1.6113333857307832,-2.21758337598294,6
2025-01-06T07:00:00Z,1,7,-2.628000020980835,66.9212417602539,0.0,0.0,7.758814334869385,100.0,-7.927999973297119,0.0,1017.4000244140624,1013.8099975585938,0,-2.7280001640319824,-2.628000020980835,0.2719999849796295,-3.628000020980835,-2.7280000845591226,-2.519666751225789,-1.8113333986451228,-2.18841671726356,6
2025-01-06T08:00:00Z,1,8,-2.578000068664551,67.45279693603516,0.0,0.0,7.613381862640381,100.0,-7.777999877929687,0.0,1017.4000244140624,1013.8106079101562,0,-2.628000020980835,-2.828000068664551,0.1220000013709068,-3.677999973297119,-2.7280000845591226,-2.6030000845591226,-2.053000065808495,-2.146750050596893,6


In [0]:
import re, numpy as np

In [0]:
month_to_season = {
    12: 0, 1: 0, 2: 0,  # Winter
    3: 1, 4: 1, 5: 1,  # Spring
    6: 2, 7: 2, 8: 2,  # Summer
    9: 3, 10: 3, 11: 3  # Fall
}

In [0]:
temp_list = new_york_current.select('temperature_2m').rdd.flatMap(lambda x: x).collect()
np_temp_list = np.array(temp_list)
np_temp_list

array([-2.22800016, -2.32800007, -2.12800002, -2.32800007, -2.47800016,
       -2.62800002, -2.82800007, -2.72800016, -2.62800002, -2.57800007,
       -2.62800002, -2.42799997, -2.22800016, -2.12800002, -2.12800002,
       -2.02800012, -1.72799993, -1.82799995, -1.92799997, -1.82799995,
       -1.82799995, -1.72799993, -1.6279999 , -1.778     , -2.17799997])

In [0]:
# used to create forecast dataframe after
ts_list = []
temp_preds = []

# using latest timestamp to generate last row df and keep track
current_ts = new_york_current.tail(1)[0]['datetime'] + timedelta(hours=1)
new_york_api_forecast = new_york_hourly_forecast.where(col('date') >= current_ts)

dew_point_2m_list = new_york_api_forecast.select('dew_point_2m').rdd.flatMap(lambda x: x).collect()
shortwave_radiation_list = new_york_api_forecast.select('shortwave_radiation').rdd.flatMap(lambda x: x).collect()
surface_pressure_list = new_york_api_forecast.select('surface_pressure').rdd.flatMap(lambda x: x).collect()
pressure_msl_list = new_york_api_forecast.select('pressure_msl').rdd.flatMap(lambda x: x).collect()
relative_humidity_list = new_york_api_forecast.select('relative_humidity_2m').rdd.flatMap(lambda x: x).collect()
cloud_cover_list = new_york_api_forecast.select('cloud_cover').rdd.flatMap(lambda x: x).collect()

for i in range(168):
    # generating new row
    new_row = Row(month=current_ts.month, 
                  hour=current_ts.hour,
                  datetime=current_ts,
                  day_of_year=current_ts.timetuple()[7],
                  season=month_to_season[current_ts.month],
                  temp_lag_1hr=float(np_temp_list[-1]),
                  temp_lag_3hr=float(np_temp_list[-3]),
                  rolling_temp_avg3hr=float(np.average(np_temp_list[-3:])),
                  rolling_temp_avg6hr=float(np.average(np_temp_list[-6:])),
                  temperature_2m=float('nan'),
                  cloud_cover=cloud_cover_list[i],
                  dew_point_2m=dew_point_2m_list[i],
                  shortwave_radiation=shortwave_radiation_list[i],
                  pressure_msl=pressure_msl_list[i],
                  surface_pressure=surface_pressure_list[i],
                  relative_humidity_2m=relative_humidity_list[i]
                 )
    current_row_df = spark.createDataFrame([new_row])

    # applying model and getting prediction
    prediction_data = assembler.transform(current_row_df).select('features', 'temperature_2m')

    prediction = new_york_lr_model.transform(prediction_data)
    temp_pred = prediction.tail(1)[0][2]

    np_temp_list = np.append(np_temp_list[1:], temp_pred)

    # adding an hour to latest_ts
    ts_list.append(current_ts)
    temp_preds.append(temp_pred)
    current_ts = current_ts + timedelta(hours=1)
    
forecast_df = spark.createDataFrame(list(zip(ts_list, temp_preds)), schema=['timestamp', 'pred_temp'])

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-682966357319655>, line 38[0m
[1;32m     35[0m current_row_df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([new_row])
[1;32m     37[0m [38;5;66;03m# applying model and getting prediction[39;00m
[0;32m---> 38[0m prediction_data [38;5;241m=[39m assembler[38;5;241m.[39mtransform(current_row_df)[38;5;241m.[39mselect([38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mtemperature_2m[39m[38;5;124m'[39m)
[1;32m     40[0m prediction [38;5;241m=[39m new_york_lr_model[38;5;241m.[39mtransform(prediction_data)
[1;32m     41[0m temp_pred [38;5;241m=[39m prediction[38;5;241m.[39mtail([38;5;241m1[39m)[[38;5;241m0[39m][[38;5;241m2[39m]

File [0;32m/databricks/spark/python/pyspark/ml/base.py:260[0m, in [0;36mTransformer.transfo

In [0]:
display(forecast_df)

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-682966357319655>, line 38[0m
[1;32m     35[0m current_row_df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([new_row])
[1;32m     37[0m [38;5;66;03m# applying model and getting prediction[39;00m
[0;32m---> 38[0m prediction_data [38;5;241m=[39m assembler[38;5;241m.[39mtransform(current_row_df)[38;5;241m.[39mselect([38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mtemperature_2m[39m[38;5;124m'[39m)
[1;32m     40[0m prediction [38;5;241m=[39m new_york_lr_model[38;5;241m.[39mtransform(prediction_data)
[1;32m     41[0m temp_pred [38;5;241m=[39m prediction[38;5;241m.[39mtail([38;5;241m1[39m)[[38;5;241m0[39m][[38;5;241m2[39m]

File [0;32m/databricks/spark/python/pyspark/ml/base.py:260[0m, in [0;36mTransformer.transfo

In [0]:
results = forecast_df.join(new_york_hourly_forecast, on=forecast_df.timestamp == new_york_hourly_forecast.date, how='left').select('timestamp', 'pred_temp', 'temperature_2m', 'cloud_cover', 'dew_point_2m', 'shortwave_radiation', 'pressure_msl', 'surface_pressure', 'relative_humidity_2m')
display(results)

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-682966357319655>, line 38[0m
[1;32m     35[0m current_row_df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([new_row])
[1;32m     37[0m [38;5;66;03m# applying model and getting prediction[39;00m
[0;32m---> 38[0m prediction_data [38;5;241m=[39m assembler[38;5;241m.[39mtransform(current_row_df)[38;5;241m.[39mselect([38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mtemperature_2m[39m[38;5;124m'[39m)
[1;32m     40[0m prediction [38;5;241m=[39m new_york_lr_model[38;5;241m.[39mtransform(prediction_data)
[1;32m     41[0m temp_pred [38;5;241m=[39m prediction[38;5;241m.[39mtail([38;5;241m1[39m)[[38;5;241m0[39m][[38;5;241m2[39m]

File [0;32m/databricks/spark/python/pyspark/ml/base.py:260[0m, in [0;36mTransformer.transfo

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator_rmse = RegressionEvaluator(labelCol="temperature_2m", predictionCol="pred_temp", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="temperature_2m", predictionCol="pred_temp", metricName="r2")

rmse = evaluator_rmse.evaluate(results)
r2 = evaluator_r2.evaluate(results)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 value (R2): {r2}")

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-682966357319655>, line 38[0m
[1;32m     35[0m current_row_df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([new_row])
[1;32m     37[0m [38;5;66;03m# applying model and getting prediction[39;00m
[0;32m---> 38[0m prediction_data [38;5;241m=[39m assembler[38;5;241m.[39mtransform(current_row_df)[38;5;241m.[39mselect([38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mtemperature_2m[39m[38;5;124m'[39m)
[1;32m     40[0m prediction [38;5;241m=[39m new_york_lr_model[38;5;241m.[39mtransform(prediction_data)
[1;32m     41[0m temp_pred [38;5;241m=[39m prediction[38;5;241m.[39mtail([38;5;241m1[39m)[[38;5;241m0[39m][[38;5;241m2[39m]

File [0;32m/databricks/spark/python/pyspark/ml/base.py:260[0m, in [0;36mTransformer.transfo

In [0]:
import pyspark.pandas as ps
import matplotlib.pyplot as plt

# Set the plotting backend to matplotlib
ps.options.plotting.backend = "matplotlib"

results_pandas = results.pandas_api()
results_pandas.plot(x='timestamp', kind='line', subplots=True)
plt.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-682966357319655>, line 38[0m
[1;32m     35[0m current_row_df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([new_row])
[1;32m     37[0m [38;5;66;03m# applying model and getting prediction[39;00m
[0;32m---> 38[0m prediction_data [38;5;241m=[39m assembler[38;5;241m.[39mtransform(current_row_df)[38;5;241m.[39mselect([38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mtemperature_2m[39m[38;5;124m'[39m)
[1;32m     40[0m prediction [38;5;241m=[39m new_york_lr_model[38;5;241m.[39mtransform(prediction_data)
[1;32m     41[0m temp_pred [38;5;241m=[39m prediction[38;5;241m.[39mtail([38;5;241m1[39m)[[38;5;241m0[39m][[38;5;241m2[39m]

File [0;32m/databricks/spark/python/pyspark/ml/base.py:260[0m, in [0;36mTransformer.transfo