####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 30
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: b9068ade-6771-48c9-b1b1-384fb6d56b94
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session b9068ade-6771-48c9-b1b1-384fb6d56b94 to get into ready status...
Session b9068ade-6771-48c9-b1b1-384fb6d56b94 has be

In [2]:
sc._jsc.hadoopConfiguration().set("mapred.output.committer.class", "org.apache.hadoop.mapred.DirectFileOutputCommitter")




In [3]:
from pyspark.sql.functions import col, lit, floor, when, concat_ws, to_timestamp, lpad, lag, lead, avg, hour
from pyspark.sql.window import Window
from pyspark.sql import Row
from datetime import datetime, timedelta
import pandas as pd

print('done')

done


In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression

print('done')

done


## Data Preparation (Cleaning and Feature Engineering)

In [6]:
new_york_hourly = spark.read.parquet('s3://de-upskill-weatherforecasting/Gold/hourly_historical.parquet/city=New York/')
new_york_hourly = new_york_hourly.drop('city', 'latitude', 'longitude', 'weather_code', 'time_zone', 'country').orderBy('year', 'month', 'day', 'time')                                                                               
new_york_hourly.tail(5)

[Row(temperature_2m=17.922000885009766, relative_humidity_2m=22.820476531982422, precipitation=0.0, rain=0.0, snowfall=0.0, wind_speed_10m=6.504951477050781, wind_speed_100m=10.699028015136719, wind_direction_10m=284.4208068847656, wind_direction_100m=282.63336181640625, wind_gusts_10m=17.280000686645508, time='20:00:00', day='17', month=11, datetime='2024-11-17 20:00:00', Unix_Time=1731873600, offset_seconds=-14400.0, Local_Time='2024-11-17 16:00:00', year=2024), Row(temperature_2m=16.12200164794922, relative_humidity_2m=39.190242767333984, precipitation=0.0, rain=0.0, snowfall=0.0, wind_speed_10m=2.930187702178955, wind_speed_100m=10.802999496459961, wind_direction_10m=280.6195983886719, wind_direction_100m=280.5609436035156, wind_gusts_10m=15.119998931884766, time='21:00:00', day='17', month=11, datetime='2024-11-17 21:00:00', Unix_Time=1731877200, offset_seconds=-14400.0, Local_Time='2024-11-17 17:00:00', year=2024), Row(temperature_2m=14.272000312805176, relative_humidity_2m=39.21

In [5]:
# seasons
# Winter(0) = 12, 1, 2
# Spring(1) = 3, 4, 5
# Summer(2) = 6, 7, 8
# Fall(3) = 9, 10, 11

new_york_hourly = new_york_hourly.withColumn("season", when(col("month").isin([12, 1, 2]), 0)
                         .when(col("month").isin([3, 4, 5]), 1)
                         .when(col("month").isin([6, 7, 8]), 2)
                         .otherwise(3))
new_york_hourly.show(5)

+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+--------+---+-----+-------------------+----------+--------------+-------------------+----+------+
|     temperature_2m|relative_humidity_2m|      precipitation|               rain|           snowfall|    wind_speed_10m|   wind_speed_100m|wind_direction_10m|wind_direction_100m|    wind_gusts_10m|    time|day|month|           datetime| Unix_Time|offset_seconds|         Local_Time|year|season|
+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+--------+---+-----+-------------------+----------+--------------+-------------------+----+------+
|-1.5670000314712524|    96.3861312866211|                0.5|0.09999999403953552| 0.2800000011920929| 3.5455887

In [6]:
# reconcating timestamp
new_york_hourly = new_york_hourly.withColumn('timestamp', concat_ws(' ', concat_ws('-', col('year'), lpad(col('month'), 2, '0'), lpad(col('day'), 2, '0')), col('time')))
new_york_hourly = new_york_hourly.withColumn('timestamp', to_timestamp(col('timestamp'), 'yyyy-MM-dd HH:mm:ss'))
new_york_hourly = new_york_hourly.withColumn('hour', hour(col('timestamp')))
new_york_hourly = new_york_hourly.withColumn('day', col('day').astype('int'))
new_york_hourly.show(5)

+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+--------+---+-----+-------------------+----------+--------------+-------------------+----+------+-------------------+----+
|     temperature_2m|relative_humidity_2m|      precipitation|               rain|           snowfall|    wind_speed_10m|   wind_speed_100m|wind_direction_10m|wind_direction_100m|    wind_gusts_10m|    time|day|month|           datetime| Unix_Time|offset_seconds|         Local_Time|year|season|          timestamp|hour|
+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+--------+---+-----+-------------------+----------+--------------+-------------------+----+------+-------------------+----+
|-1.5670000314712524|    96.386131286

In [7]:
# reordering columns
new_york_hourly = new_york_hourly.select('timestamp', 'year', 'month', 'day', 'time', 'hour', 'temperature_2m', 'relative_humidity_2m', 'season', 'precipitation', 'rain', 'snowfall', 'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m', 'wind_direction_100m', 'wind_gusts_10m')
new_york_hourly.show(5)

+-------------------+----+-----+---+--------+----+-------------------+--------------------+------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+
|          timestamp|year|month|day|    time|hour|     temperature_2m|relative_humidity_2m|season|      precipitation|               rain|           snowfall|    wind_speed_10m|   wind_speed_100m|wind_direction_10m|wind_direction_100m|    wind_gusts_10m|
+-------------------+----+-----+---+--------+----+-------------------+--------------------+------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+
|2010-01-01 05:00:00|2010|    1|  1|05:00:00|   5|-1.5670000314712524|    96.3861312866211|     0|                0.5|0.09999999403953552| 0.2800000011920929| 3.545588731765747|3.0758414268493652| 336.0375061035156| 20.556127548217773|

In [8]:
# temperature lags
new_york_hourly = new_york_hourly.withColumn('temp_lag_1hr', lag('temperature_2m', 1).over(Window.orderBy('timestamp')))
new_york_hourly = new_york_hourly.withColumn('temp_lag_3hr', lag('temperature_2m', 3).over(Window.orderBy('timestamp')))
new_york_hourly = new_york_hourly.withColumn('temp_lag_12hr', lag('temperature_2m', 12).over(Window.orderBy('timestamp')))
new_york_hourly = new_york_hourly.withColumn('temp_lag_1day', lag('temperature_2m', 24).over(Window.orderBy('timestamp')))
new_york_hourly.show(5)

+-------------------+----+-----+---+--------+----+-------------------+--------------------+------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------+-------------+
|          timestamp|year|month|day|    time|hour|     temperature_2m|relative_humidity_2m|season|      precipitation|               rain|           snowfall|    wind_speed_10m|   wind_speed_100m|wind_direction_10m|wind_direction_100m|    wind_gusts_10m|       temp_lag_1hr|       temp_lag_3hr|temp_lag_12hr|temp_lag_1day|
+-------------------+----+-----+---+--------+----+-------------------+--------------------+------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------+-------------+
|2010-01-01 05:00:00|2010|    1

In [9]:
# removing null values
new_york_hourly = new_york_hourly.dropna()




In [10]:
new_york_hourly.select('timestamp', 'temperature_2m', 'temp_lag_1hr', 'temp_lag_3hr', 'temp_lag_12hr', 'temp_lag_1day').show(5)

+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+
|          timestamp|     temperature_2m|       temp_lag_1hr|       temp_lag_3hr|     temp_lag_12hr|      temp_lag_1day|
+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+
|2010-01-02 05:00:00|-1.6669999361038208| -2.767000198364258|-1.3170000314712524|2.0829999446868896|-1.5670000314712524|
|2010-01-02 06:00:00|-1.0169999599456787|-1.6669999361038208|-1.3170000314712524|3.0829999446868896|-1.1169999837875366|
|2010-01-02 07:00:00|-0.7669999599456787|-1.0169999599456787| -2.767000198364258|5.0329999923706055|-0.8669999837875366|
|2010-01-02 08:00:00|-1.1669999361038208|-0.7669999599456787|-1.6669999361038208| 5.183000087738037|-0.9670000076293945|
|2010-01-02 09:00:00|-1.5169999599456787|-1.1669999361038208|-1.0169999599456787| 4.482999801635742|-0.8169999718666077|
+-------------------+-----------

In [11]:
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg3hr', avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-3,0)))
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg6hr', avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-6,0)))
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg12hr', avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-12,0)))
new_york_hourly = new_york_hourly.withColumn('rolling_temp_avg1day', avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-24,0)))

new_york_hourly.select('timestamp', 'temperature_2m', 'rolling_temp_avg3hr', 'rolling_temp_avg6hr', 'rolling_temp_avg12hr', 'rolling_temp_avg1day').show(5)

+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|          timestamp|     temperature_2m|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|
+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|2010-01-02 05:00:00|-1.6669999361038208|-1.6669999361038208|-1.6669999361038208| -1.6669999361038208| -1.6669999361038208|
|2010-01-02 06:00:00|-1.0169999599456787|-1.3419999480247498|-1.3419999480247498| -1.3419999480247498| -1.3419999480247498|
|2010-01-02 07:00:00|-0.7669999599456787| -1.150333285331726| -1.150333285331726|  -1.150333285331726|  -1.150333285331726|
|2010-01-02 08:00:00|-1.1669999361038208|-1.1544999480247498|-1.1544999480247498| -1.1544999480247498| -1.1544999480247498|
|2010-01-02 09:00:00|-1.5169999599456787|-1.1169999539852142|-1.2269999504089355| -1.2269999504089355| -1.2269999504089355|
+-------

In [12]:
new_york_hourly = new_york_hourly.withColumn('temp_next_hour', lead('temperature_2m', 1).over(Window.orderBy('timestamp')))
new_york_hourly.orderBy('timestamp', ascending=False).show(5)

+-------------------+----+-----+---+--------+----+------------------+--------------------+------+-------------+----+--------+------------------+------------------+------------------+-------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|    time|hour|    temperature_2m|relative_humidity_2m|season|precipitation|rain|snowfall|    wind_speed_10m|   wind_speed_100m|wind_direction_10m|wind_direction_100m|   wind_gusts_10m|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+--------+----+------------------+--------------------+------+-------------+----+--------+------------------+------------------+------------------+-------------------+--

In [13]:
new_york_current = new_york_hourly.orderBy('timestamp', ascending=False).limit(25).orderBy('timestamp')
new_york_current.show(25)

new_york_hourly = new_york_hourly.dropna()

+-------------------+----+-----+---+--------+----+------------------+--------------------+------+-------------+----+--------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|    time|hour|    temperature_2m|relative_humidity_2m|season|precipitation|rain|snowfall|    wind_speed_10m|   wind_speed_100m|wind_direction_10m|wind_direction_100m|    wind_gusts_10m|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+--------+----+------------------+--------------------+------+-------------+----+--------+------------------+------------------+------------------+-------------------+

## Feature Selection (SelectKBest)

In [14]:
new_york_hourly.count()

130291


In [15]:
new_york_hourly_pandas = new_york_hourly.drop('timestamp', 'time').toPandas()
new_york_hourly_pandas

        year  month  ...  rolling_temp_avg1day  temp_next_hour
0       2010      1  ...             -1.667000          -1.017
1       2010      1  ...             -1.342000          -0.767
2       2010      1  ...             -1.150333          -1.167
3       2010      1  ...             -1.154500          -1.517
4       2010      1  ...             -1.227000          -1.817
...      ...    ...  ...                   ...             ...
130286  2024     11  ...             12.592000          10.422
130287  2024     11  ...             12.170000           9.672
130288  2024     11  ...             11.730000           8.722
130289  2024     11  ...             11.294000           7.822
130290  2024     11  ...             10.912000           6.972

[130291 rows x 24 columns]


In [16]:
new_york_hourly_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130291 entries, 0 to 130290
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  130291 non-null  int32  
 1   month                 130291 non-null  int32  
 2   day                   130291 non-null  int32  
 3   hour                  130291 non-null  int32  
 4   temperature_2m        130291 non-null  float64
 5   relative_humidity_2m  130291 non-null  float64
 6   season                130291 non-null  int32  
 7   precipitation         130291 non-null  float64
 8   rain                  130291 non-null  float64
 9   snowfall              130291 non-null  float64
 10  wind_speed_10m        130291 non-null  float64
 11  wind_speed_100m       130291 non-null  float64
 12  wind_direction_10m    130291 non-null  float64
 13  wind_direction_100m   130291 non-null  float64
 14  wind_gusts_10m        130291 non-null  float64
 15  

In [17]:
new_york_hourly_pandas['day'] = new_york_hourly_pandas['day'].astype(int)




In [18]:
x = new_york_hourly_pandas.drop('temp_next_hour', axis=1)
y = new_york_hourly_pandas['temp_next_hour']




In [19]:
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(x, y)

feature_scores = pd.DataFrame({
    'Feature': x.columns,
    'Score': selector.scores_
}).sort_values(by='Score', ascending=False)

print(feature_scores)

                 Feature         Score
4         temperature_2m  1.395333e+07
15          temp_lag_1hr  4.094971e+06
19   rolling_temp_avg3hr  3.103809e+06
20   rolling_temp_avg6hr  1.526871e+06
16          temp_lag_3hr  1.232665e+06
22  rolling_temp_avg1day  9.983374e+05
21  rolling_temp_avg12hr  8.454218e+05
18         temp_lag_1day  7.040161e+05
17         temp_lag_12hr  3.072609e+05
6                 season  6.260808e+04
1                  month  1.277363e+04
11       wind_speed_100m  7.452625e+03
3                   hour  3.839189e+03
10        wind_speed_10m  2.315938e+03
9               snowfall  1.688764e+03
12    wind_direction_10m  1.420703e+03
13   wind_direction_100m  1.247394e+03
14        wind_gusts_10m  5.969524e+02
8                   rain  3.181317e+02
0                   year  1.642896e+02
7          precipitation  8.723286e+01
2                    day  1.586301e+01
5   relative_humidity_2m  1.146647e+01


## Creating Linear Regression Model

In [20]:
# splitting dataset into test/train
train_data = new_york_hourly.where(col('year') < 2022)
test_data = new_york_hourly.where(col('year') >= 2022)
print(train_data.count(), test_data.count())

105163 25128


In [21]:
feature_columns = ['temperature_2m', 'temp_lag_1hr', 'rolling_temp_avg3hr', 'rolling_temp_avg6hr', 'temp_lag_3hr', 'rolling_temp_avg1day', 'rolling_temp_avg12hr', 'temp_lag_1day', 'temp_lag_12hr', 'season', 'month', 'hour']
feature_columns

['temperature_2m', 'temp_lag_1hr', 'rolling_temp_avg3hr', 'rolling_temp_avg6hr', 'temp_lag_3hr', 'rolling_temp_avg1day', 'rolling_temp_avg12hr', 'temp_lag_1day', 'temp_lag_12hr', 'season', 'month', 'hour']


In [22]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
train_data = assembler.transform(train_data).select("features", "temp_next_hour")
test_data = assembler.transform(test_data).select("features", "temp_next_hour")




In [23]:
lr = LinearRegression(featuresCol="features", labelCol="temp_next_hour")
new_york_model = lr.fit(train_data)




In [24]:
predictions = new_york_model.transform(test_data)




In [25]:
predictions.show(25)

+--------------------+------------------+------------------+
|            features|    temp_next_hour|        prediction|
+--------------------+------------------+------------------+
|[8.37199974060058...|  9.17199993133545| 8.266546975357032|
|[9.17199993133545...| 8.022000312805176| 9.563586156519296|
|[8.02200031280517...| 8.071999549865723| 7.456037788622703|
|[8.07199954986572...| 7.671999931335449| 8.044975472387856|
|[7.67199993133544...| 7.822000026702881| 7.439411133723283|
|[7.82200002670288...| 7.921999931335449| 7.881595469002741|
|[7.92199993133544...| 7.621999740600586|7.9414511159771894|
|[7.62199974060058...| 7.572000026702881| 7.410601912027726|
|[7.57200002670288...| 7.771999835968018| 7.548476265991729|
|[7.77199983596801...| 8.321999549865723| 8.012633764594954|
|[8.32199954986572...| 8.522000312805176| 8.842752253440985|
|[8.52200031280517...| 8.121999740600586| 8.903403742802858|
|[8.12199974060058...| 9.272000312805176| 8.270018432243756|
|[9.27200031280517...| 9

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator_rmse = RegressionEvaluator(labelCol="temp_next_hour", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="temp_next_hour", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 value (R2): {r2}")

Root Mean Squared Error (RMSE): 0.75994450628356
R^2 value (R2): 0.9937380609996154


# Creating Temperature Forecast

In [27]:
new_york_current = new_york_current.limit(25)




In [28]:
# dropping unnecessary columns
new_york_current = new_york_current.drop('time', 'relative_humidity_2m', 'precipitation', 'rain', 'snowfall', 'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m', 'wind_direction_100m', 'wind_gusts_10m')
new_york_current.show(30)

+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|hour|    temperature_2m|season|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|2024-11-12 00:00:00|2024|   11| 12|   0|15.321999549865723|     3| 16.37200164794922| 19.62200164794922|13.972000122070312| 12.42199993133545| 17.172001123428345| 18.743429592677526|  17.925846906808708|  15.732000312805177|13.321999549865723|
|2024-11-12 01:00:00

In [29]:
# vectorizing historical data
new_york_hourly_data = assembler.transform(new_york_hourly).select('features', 'temp_next_hour')




In [33]:
# training model on data up to current day
new_york_forecast_model = lr.fit(new_york_hourly_data)




In [37]:
new_york_forecast_model.save('s3://de-upskill-weatherforecasting/MachineLearning/new_york_forecast_model')




In [40]:
# vectorizing data for future purposes
new_york_current_data = assembler.transform(new_york_current).select('features', 'temp_next_hour')




In [41]:
# applying model
forecast_predictions = new_york_forecast_model.transform(new_york_current_data)




In [42]:
forecast_predictions.show(25)

+--------------------+------------------+------------------+
|            features|    temp_next_hour|        prediction|
+--------------------+------------------+------------------+
|[15.3219995498657...|13.321999549865723|14.813546725537385|
|[13.3219995498657...|11.821999549865723|12.386376427404917|
|[11.8219995498657...|10.871999740600586|11.108732972484642|
|[10.8719997406005...|10.871999740600586|10.401233650634868|
|[10.8719997406005...|10.821999549865723|10.879220527127277|
|[10.8219995498657...|11.121999740600586|10.828509520568243|
|[11.1219997406005...|12.022000312805176|11.344149267802118|
|[12.0220003128051...|11.821999549865723| 12.60070751229994|
|[11.8219995498657...|10.222000122070312|11.959533343218332|
|[10.2220001220703...| 9.272000312805176|  9.75783037897768|
|[9.27200031280517...| 9.371999740600586|  9.24732749605228|
|[9.37199974060058...| 9.222000122070312| 9.921963238992166|
|[9.22200012207031...| 10.67199993133545| 9.756496018137188|
|[10.6719999313354...|10

In [43]:
# saving predicted temperature
temp_pred = forecast_predictions.tail(1)[0][2]
temp_pred

6.515879509397492


In [44]:
# saving latest timestamp
latest_ts = new_york_current.tail(1)[0][0]
latest_ts

datetime.datetime(2024, 11, 13, 0, 0)


In [45]:
# filling in blank temperature with prediction
new_york_current = new_york_current.fillna(temp_pred)
new_york_current.show(25)

+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|hour|    temperature_2m|season|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|2024-11-12 00:00:00|2024|   11| 12|   0|15.321999549865723|     3| 16.37200164794922| 19.62200164794922|13.972000122070312| 12.42199993133545| 17.172001123428345| 18.743429592677526|  17.925846906808708|  15.732000312805177|13.321999549865723|
|2024-11-12 01:00:00

In [46]:
# adding hour to latest timestamp for next row
latest_ts = latest_ts + timedelta(hours=1)




In [47]:
# finding season
# Winter(0) = 12, 1, 2
# Spring(1) = 3, 4, 5
# Summer(2) = 6, 7, 8
# Fall(3) = 9, 10, 11
lts_season = 0

if latest_ts.month in [3,4,5]:
    lts_season = 1
elif latest_ts.month in [6,7,8]:
    lts_season = 2
elif latest_ts.month in [9,10,11]:
    lts_season = 3





In [48]:
# creating new row
new_row = Row(timestamp=latest_ts, year=latest_ts.year, month=latest_ts.month, day=latest_ts.day, hour=latest_ts.hour, temperature_2m=temp_pred, season=lts_season)
new_row_df = spark.createDataFrame([new_row])
new_row_df.show(1)

+-------------------+----+-----+---+----+-----------------+------+
|          timestamp|year|month|day|hour|   temperature_2m|season|
+-------------------+----+-----+---+----+-----------------+------+
|2024-11-13 01:00:00|2024|   11| 13|   1|6.515879509397492|     3|
+-------------------+----+-----+---+----+-----------------+------+


In [49]:
# filling missing cols with nulls
for c in (set(new_york_current.columns) - set(new_row_df.columns)):
    new_row_df = new_row_df.withColumn(c, lit(None))




In [50]:
new_row_df.show(1)

+-------------------+----+-----+---+----+-----------------+------+--------------------+------------+------------+-------------------+-------------+-------------+-------------------+--------------+--------------------+
|          timestamp|year|month|day|hour|   temperature_2m|season|rolling_temp_avg12hr|temp_lag_3hr|temp_lag_1hr|rolling_temp_avg3hr|temp_lag_1day|temp_lag_12hr|rolling_temp_avg6hr|temp_next_hour|rolling_temp_avg1day|
+-------------------+----+-----+---+----+-----------------+------+--------------------+------------+------------+-------------------+-------------+-------------+-------------------+--------------+--------------------+
|2024-11-13 01:00:00|2024|   11| 13|   1|6.515879509397492|     3|                null|        null|        null|               null|         null|         null|               null|          null|                null|
+-------------------+----+-----+---+----+-----------------+------+--------------------+------------+------------+---------------

In [51]:
# adding new row to new_york_current
new_york_current = new_york_current.union(new_row_df)




In [52]:
new_york_current.show(1)

+-------------------+----+-----+---+----+------------------+------+-----------------+-----------------+------------------+-----------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|hour|    temperature_2m|season|     temp_lag_1hr|     temp_lag_3hr|     temp_lag_12hr|    temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+----+------------------+------+-----------------+-----------------+------------------+-----------------+-------------------+-------------------+--------------------+--------------------+------------------+
|2024-11-12 00:00:00|2024|   11| 12|   0|15.321999549865723|     3|16.37200164794922|19.62200164794922|13.972000122070312|12.42199993133545| 17.172001123428345| 18.743429592677526|  17.925846906808708|  15.732000312805177|13.321999549865723|
+-------------------+----+-----+

In [56]:
# reapplying lags on new row
window_spec = Window.orderBy('timestamp')

new_york_current = new_york_current.withColumn('temp_lag_1hr',when(col('temp_lag_1hr').isNull(),lag('temperature_2m', 1).over(window_spec)).otherwise(col('temp_lag_1hr')))
new_york_current = new_york_current.withColumn('temp_lag_3hr',when(col('temp_lag_3hr').isNull(),lag('temperature_2m', 3).over(window_spec)).otherwise(col('temp_lag_3hr')))
new_york_current = new_york_current.withColumn('temp_lag_12hr',when(col('temp_lag_12hr').isNull(),lag('temperature_2m', 12).over(window_spec)).otherwise(col('temp_lag_12hr')))
new_york_current = new_york_current.withColumn('temp_lag_1day',when(col('temp_lag_1day').isNull(),lag('temperature_2m', 24).over(window_spec)).otherwise(col('temp_lag_1day')))




In [203]:
new_york_current.show(26)

+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|hour|    temperature_2m|season|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|2024-11-12 00:00:00|2024|   11| 12|   0|15.321999549865723|     3| 16.37200164794922| 19.62200164794922|13.972000122070312| 12.42199993133545| 17.172001123428345| 18.743429592677526|  17.925846906808708|  15.732000312805177|13.321999549865723|
|2024-11-12 01:00:00

In [65]:
# reapplying rolling avgs on new row
new_york_current = new_york_current.withColumn('rolling_temp_avg3hr',when(col('rolling_temp_avg3hr').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-3,0))).otherwise(col('rolling_temp_avg3hr')))
new_york_current = new_york_current.withColumn('rolling_temp_avg6hr',when(col('rolling_temp_avg6hr').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-6,0))).otherwise(col('rolling_temp_avg6hr')))
new_york_current = new_york_current.withColumn('rolling_temp_avg12hr',when(col('rolling_temp_avg12hr').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-12,0))).otherwise(col('rolling_temp_avg12hr')))
new_york_current = new_york_current.withColumn('rolling_temp_avg1day',when(col('rolling_temp_avg1day').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-24,0))).otherwise(col('rolling_temp_avg1day')))

TypeError: 'str' object is not callable


In [208]:
new_york_current.show(26)

+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|hour|    temperature_2m|season|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|2024-11-12 00:00:00|2024|   11| 12|   0|15.321999549865723|     3| 16.37200164794922| 19.62200164794922|13.972000122070312| 12.42199993133545| 17.172001123428345| 18.743429592677526|  17.925846906808708|  15.732000312805177|13.321999549865723|
|2024-11-12 01:00:00

In [71]:
month_to_season = {
    12: 0, 1: 0, 2: 0,  # Winter
    3: 1, 4: 1, 5: 1,  # Spring
    6: 2, 7: 2, 8: 2,  # Summer
    9: 3, 10: 3, 11: 3  # Fall
}

lag_window = Window.orderBy('timestamp')





In [69]:
for i in range(24):
    new_york_current_data = assembler.transform(new_york_current).select('features', 'temp_next_hour')
    forecast_predictions = new_york_forecast_model.transform(new_york_current_data)

    temp_pred = forecast_predictions.tail(1)[0][2]

    # getting latest timestamp and adding an hour
    latest_ts = new_york_current.tail(1)[0][0] + timedelta(hours=1)

    new_york_current = new_york_current.fillna(temp_pred)
    # Winter(0) = 12, 1, 2
    # Spring(1) = 3, 4, 5
    # Summer(2) = 6, 7, 8
    # Fall(3) = 9, 10, 11
    lts_season = month_to_season[latest_ts.month]

    # generating new row
    new_row = Row(timestamp=latest_ts,
                  year=latest_ts.year, 
                  month=latest_ts.month, 
                  day=latest_ts.day, hour=latest_ts.hour, 
                  temperature_2m=temp_pred, 
                  season=lts_season,
                  temp_lag_1hr=None,
                  temp_lag_3hr=None,
                  temp_lag_12hr=None,
                  temp_lag_1day=None,
                  rolling_temp_avg3hr=None,
                  rolling_temp_avg6hr=None,
                  rolling_temp_avg12hr=None,
                  rolling_temp_avg1day=None
                 )
    new_row_df = spark.createDataFrame([new_row])

    # adding row to df
    new_york_current = new_york_current.union(new_row_df)

    # reapplying lags on new row

    new_york_current = new_york_current.withColumn('temp_lag_1hr',when(col('temp_lag_1hr').isNull(),lag('temperature_2m', 1).over(Window.orderBy('timestamp'))).otherwise(col('temp_lag_1hr')))
    new_york_current = new_york_current.withColumn('temp_lag_3hr',when(col('temp_lag_3hr').isNull(),lag('temperature_2m', 3).over(Window.orderBy('timestamp'))).otherwise(col('temp_lag_3hr')))
    new_york_current = new_york_current.withColumn('temp_lag_12hr',when(col('temp_lag_12hr').isNull(),lag('temperature_2m', 12).over(Window.orderBy('timestamp'))).otherwise(col('temp_lag_12hr')))
    new_york_current = new_york_current.withColumn('temp_lag_1day',when(col('temp_lag_1day').isNull(),lag('temperature_2m', 24).over(Window.orderBy('timestamp'))).otherwise(col('temp_lag_1day')))

    # reapplying rolling avgs on new row
    new_york_current = new_york_current.withColumn('rolling_temp_avg3hr',when(col('rolling_temp_avg3hr').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-3,0))).otherwise(col('rolling_temp_avg3hr')))
    new_york_current = new_york_current.withColumn('rolling_temp_avg6hr',when(col('rolling_temp_avg6hr').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-6,0))).otherwise(col('rolling_temp_avg6hr')))
    new_york_current = new_york_current.withColumn('rolling_temp_avg12hr',when(col('rolling_temp_avg12hr').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-12,0))).otherwise(col('rolling_temp_avg12hr')))
    new_york_current = new_york_current.withColumn('rolling_temp_avg1day',when(col('rolling_temp_avg1day').isNull(),avg('temperature_2m').over(Window.orderBy('timestamp').rowsBetween(-24,0))).otherwise(col('rolling_temp_avg1day')))




In [70]:
new_york_current.show(100)

+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|          timestamp|year|month|day|hour|    temperature_2m|season|      temp_lag_1hr|      temp_lag_3hr|     temp_lag_12hr|     temp_lag_1day|rolling_temp_avg3hr|rolling_temp_avg6hr|rolling_temp_avg12hr|rolling_temp_avg1day|    temp_next_hour|
+-------------------+----+-----+---+----+------------------+------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+
|2024-11-12 00:00:00|2024|   11| 12|   0|15.321999549865723|     3| 16.37200164794922| 19.62200164794922|13.972000122070312| 12.42199993133545| 17.172001123428345| 18.743429592677526|  17.925846906808708|  15.732000312805177|13.321999549865723|
|2024-11-12 01:00:00