We construct rolling-window features over two horizons (the previous 24 hours and the previous week) for three temperature quantities: the maximum temperature deviation (maxTemp), temperature HDD, and temperature CDD. We also remove features that aren't present in the weather forecast data (as of Sept 18). We drop them for two reasons: the dropped features aren't high up in the feature importance of a previously trained XGBoost model and/or are highly correlated with temperature, and we want to be able to train on historical data and then make predictions on forecast data.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Import required libraries
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [2]:
# Load the 2021-2025 NWPP regional weather history and discard the
# degree-day / aggregate-temperature columns straight away.
unused_columns = [
    'electric_cdd',
    'electric_hdd',
    'gas_cdd',
    'gas_hdd',
    'population_cdd',
    'population_hdd',
    'temperature_avg',
    'temperature_max',
    'temperature_min',
    'weighted_cdd',
    'weighted_hdd',
]
DFX = pd.read_csv("../data/AllWeath_regNWPP_2021-2025.csv").drop(columns=unused_columns)

# NOTE: no timezone shift is applied; timestamps are used exactly as parsed.
# A disabled earlier draft subtracted a per-station number of hours from
# 'datetime' (intended as a UTC -> local conversion), after verifying that the
# station names in the data matched the keys of the offset table:
#   7 h: Tacoma, Eugene, Portland, NWPP, Vancouver, Yakima, Pendleton, Medford,
#        Reno, Klamath Falls, Spokane, Seattle
#   6 h: Boise, Billings, Salt Lake City, Great Falls
# (a second variant of that draft used 7 h for every station).
DFX['datetime'] = pd.to_datetime(DFX['datetime'])

# Keep only rows whose temperature is >= 0; rows with a negative or missing
# temperature are removed by this comparison.
# NOTE(review): presumably meant to filter out invalid readings - confirm,
# since it also removes any genuine sub-zero observations.
DFX = DFX.loc[DFX['temperature'] >= 0]

In [3]:
# Preview the first rows of the loaded frame to sanity-check the columns.
DFX.head()

Unnamed: 0,station_name,datetime,cloudCover,dewpoint,heatIndex,relativeHumidity,temperature,windChill,windDirection,windSpeed
0,Billings,2021-08-31 17:00:00,0.0,43.0,83.0,22.0,86.0,86.0,50.0,7.0
1,Billings,2021-08-31 18:00:00,19.0,45.0,81.0,26.0,83.0,83.0,300.0,5.0
2,Billings,2021-08-31 19:00:00,75.0,43.0,78.0,29.0,78.0,78.0,0.0,0.0
3,Billings,2021-08-31 20:00:00,50.0,40.0,75.0,28.0,75.0,75.0,320.0,21.0
4,Billings,2021-08-31 21:00:00,0.0,36.0,75.0,24.0,75.0,75.0,10.0,18.0


In [7]:
# Spot-check: every Billings row whose timestamp falls on 2025-09-15.
DFX.loc[
    (DFX['station_name'] == 'Billings')
    & (DFX['datetime'].dt.date == pd.Timestamp('2025-09-15').date())
]

Unnamed: 0,station_name,datetime,cloudCover,dewpoint,heatIndex,relativeHumidity,temperature,windChill,windDirection,windSpeed
35406,Billings,2025-09-15 00:00:00,100.0,42.0,72.0,34.0,72.0,72.0,230.0,11.0
35407,Billings,2025-09-15 01:00:00,100.0,44.0,68.0,42.0,68.0,68.0,290.0,8.0
35408,Billings,2025-09-15 02:00:00,100.0,46.0,66.0,49.0,66.0,66.0,280.0,13.0
35409,Billings,2025-09-15 03:00:00,75.0,45.0,65.0,48.0,65.0,65.0,260.0,15.0
35410,Billings,2025-09-15 04:00:00,75.0,44.0,64.0,48.0,64.0,64.0,230.0,8.0
35411,Billings,2025-09-15 05:00:00,75.0,45.0,61.0,56.0,61.0,61.0,200.0,6.0
35412,Billings,2025-09-15 06:00:00,25.0,45.0,60.0,57.0,60.0,60.0,230.0,8.0
35413,Billings,2025-09-15 07:00:00,25.0,44.0,59.0,58.0,59.0,59.0,260.0,10.0
35414,Billings,2025-09-15 08:00:00,50.0,44.0,61.0,54.0,61.0,61.0,270.0,6.0
35415,Billings,2025-09-15 09:00:00,50.0,43.0,65.0,45.0,65.0,65.0,350.0,6.0


In [3]:
# Index rows by (station_name, datetime): the feature-engineering cell groups
# on index level 0 and uses index level 1 as the time axis for rolling windows.
DFX = DFX.set_index(['station_name', 'datetime'])

In [4]:
# Order rows chronologically within each station; the time-based rolling
# windows below require monotonically increasing timestamps per group.
DFX = DFX.sort_values(['station_name', 'datetime'])

# Hourly deviation of temperature from the 65°F base:
#   temp_cdd  - amount above 65 (0 when at or below)
#   temp_hdd  - amount below 65 (0 when at or above)
#   temp_diff - absolute distance from 65 in either direction
DFX['temp_diff'] = DFX['temperature'] - 65
DFX['temp_cdd'] = DFX['temp_diff'].clip(lower=0)
DFX['temp_hdd'] = DFX['temp_diff'].clip(upper=0).abs()
DFX['temp_diff'] = DFX['temp_diff'].abs()

# Rolling statistics over time-based windows, computed separately per station.
# closed='left' leaves the current row out of its own window, so each feature
# only summarises strictly earlier observations.
# The 24-hour window is spelled '24h': the uppercase 'H' offset alias is
# deprecated in recent pandas releases and emits a FutureWarning.
timestamps = DFX.index.get_level_values(1)
rolling_features = [
    # (output column,    source column, window, statistic)
    ('24hrMaxTempDelt', 'temp_diff', '24h', 'max'),
    ('1WMaxTempDelt', 'temp_diff', '7D', 'max'),
    ('24hrTempCDD', 'temp_cdd', '24h', 'sum'),
    ('24hrTempHDD', 'temp_hdd', '24h', 'sum'),
    ('1WTempCDD', 'temp_cdd', '7D', 'sum'),
    ('1WTempHDD', 'temp_hdd', '7D', 'sum'),
]
for feature_name, source_column, window, statistic in rolling_features:
    windowed = DFX.groupby(level=0)[source_column].rolling(window, on=timestamps, closed='left')
    # The grouped result carries an extra leading station level; drop it so
    # the values align with DFX's (station_name, datetime) index on assignment.
    DFX[feature_name] = getattr(windowed, statistic)().reset_index(level=0, drop=True)

# Drop the first 168 rows (24 * 7) of each station, i.e. the rows whose weekly
# window has not filled up yet.
# NOTE(review): 168 rows equal one week only if every station has exactly one
# row per hour with no gaps - confirm, given that rows are filtered on load.
DFX = DFX.groupby(level=0).apply(lambda x: x.iloc[168:]).reset_index(level=0, drop=True)


  DFX['24hrMaxTempDelt'] = DFX.groupby(level=0)['temp_diff'].rolling('24H', on=DFX.index.get_level_values(1), closed='left').max().reset_index(level=0, drop=True)
  DFX['24hrTempCDD'] = DFX.groupby(level=0)['temp_cdd'].rolling('24H', on=DFX.index.get_level_values(1), closed='left').sum().reset_index(level=0, drop=True)
  DFX['24hrTempHDD'] = DFX.groupby(level=0)['temp_hdd'].rolling('24H', on=DFX.index.get_level_values(1), closed='left').sum().reset_index(level=0, drop=True)


In [5]:
# Remove the weather columns that are not kept as features together with the
# intermediate temperature-delta helpers, turn the (station_name, datetime)
# index back into ordinary columns, and fix the final column order.
helper_columns = ['heatIndex', 'relativeHumidity', 'windChill', 'temp_diff', 'temp_cdd', 'temp_hdd']
output_columns = [
    'station_name',
    'datetime',
    'cloudCover',
    'dewpoint',
    'temperature',
    'windDirection',
    'windSpeed',
    '24hrMaxTempDelt',
    '1WMaxTempDelt',
    '24hrTempCDD',
    '24hrTempHDD',
    '1WTempCDD',
    '1WTempHDD',
]
DFX = DFX.drop(columns=helper_columns).reset_index()
DFX = DFX[output_columns]

In [6]:
# Preview the first 20 rows of the engineered feature table.
DFX.head(20)

Unnamed: 0,station_name,datetime,cloudCover,dewpoint,temperature,windDirection,windSpeed,24hrMaxTempDelt,1WMaxTempDelt,24hrTempCDD,24hrTempHDD,1WTempCDD,1WTempHDD
0,Billings,2021-09-08 00:00:00,19.0,36.0,83.0,70.0,12.0,21.0,24.0,179.0,55.0,816.0,663.0
1,Billings,2021-09-08 01:00:00,50.0,38.0,79.0,60.0,8.0,21.0,24.0,179.0,55.0,813.0,663.0
2,Billings,2021-09-08 02:00:00,19.0,38.0,75.0,60.0,8.0,21.0,24.0,176.0,55.0,809.0,663.0
3,Billings,2021-09-08 03:00:00,0.0,38.0,71.0,50.0,8.0,21.0,24.0,171.0,55.0,806.0,663.0
4,Billings,2021-09-08 04:00:00,0.0,35.0,69.0,30.0,8.0,21.0,24.0,164.0,55.0,802.0,663.0
5,Billings,2021-09-08 05:00:00,0.0,35.0,67.0,10.0,7.0,21.0,24.0,162.0,55.0,796.0,663.0
6,Billings,2021-09-08 06:00:00,0.0,33.0,67.0,20.0,5.0,21.0,24.0,161.0,55.0,794.0,663.0
7,Billings,2021-09-08 07:00:00,0.0,33.0,65.0,0.0,0.0,21.0,24.0,163.0,55.0,796.0,663.0
8,Billings,2021-09-08 08:00:00,0.0,33.0,61.0,340.0,3.0,21.0,24.0,163.0,54.0,796.0,662.0
9,Billings,2021-09-08 09:00:00,19.0,34.0,59.0,360.0,6.0,21.0,24.0,163.0,57.0,796.0,664.0


In [7]:
# Write the engineered feature table to disk; index=False leaves the row
# index out of the file.
output_path = "../data/tempFE_ALLWeath_regNWPP_2021-2025.csv"
DFX.to_csv(output_path, index=False)