In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Import Data

In [2]:
data = pd.read_csv("Temp_Data.csv")

In [3]:
data.head()

Unnamed: 0,DATE,Temp,relative_humidity,wind_speed,Pressure
0,1/1/2020,-0.44,86.0,18.5,100.15
1,1/2/2020,-0.09,79.0,14.0,100.47
2,1/3/2020,2.95,82.0,12.0,100.64
3,1/4/2020,-0.8,86.0,15.0,100.75
4,1/5/2020,-7.95,74.5,15.0,101.06


Target varible is Temp

## Feature engineering

Functions to create features that will be used by XGBoost

#### Lags

In [4]:
def create_lag_features(data, column = "Temp", lag_steps=1):
    
    for i in range(1,lag_steps+1):
        data[f'{column}_lag_{i}'] = data[column].shift(i)
    
    return data

In [5]:
for column in ["Temp", "relative_humidity", "wind_speed", "Pressure"]:
    for i in range(1,4,1):
        lagged_data = create_lag_features(data,column,i)

In [6]:
lagged_data.head()

Unnamed: 0,DATE,Temp,relative_humidity,wind_speed,Pressure,Temp_lag_1,Temp_lag_2,Temp_lag_3,relative_humidity_lag_1,relative_humidity_lag_2,relative_humidity_lag_3,wind_speed_lag_1,wind_speed_lag_2,wind_speed_lag_3,Pressure_lag_1,Pressure_lag_2,Pressure_lag_3
0,1/1/2020,-0.44,86.0,18.5,100.15,,,,,,,,,,,,
1,1/2/2020,-0.09,79.0,14.0,100.47,-0.44,,,86.0,,,18.5,,,100.15,,
2,1/3/2020,2.95,82.0,12.0,100.64,-0.09,-0.44,,79.0,86.0,,14.0,18.5,,100.47,100.15,
3,1/4/2020,-0.8,86.0,15.0,100.75,2.95,-0.09,-0.44,82.0,79.0,86.0,12.0,14.0,18.5,100.64,100.47,100.15
4,1/5/2020,-7.95,74.5,15.0,101.06,-0.8,2.95,-0.09,86.0,82.0,79.0,15.0,12.0,14.0,100.75,100.64,100.47


#### Rolling Mean

In [7]:
def create_rolling_mean(data, column = "Temp", window_size = 7):

    data[f"{column}_rolling_mean_{window_size}"] = data[column].rolling(window = window_size).mean()
    return data

In [8]:
for column in ["Temp", "relative_humidity", "wind_speed", "Pressure"]:
    for i in range(7,22,7):
        rolled_data = create_rolling_mean(lagged_data,column,i)

In [9]:
rolled_data.head(21)

Unnamed: 0,DATE,Temp,relative_humidity,wind_speed,Pressure,Temp_lag_1,Temp_lag_2,Temp_lag_3,relative_humidity_lag_1,relative_humidity_lag_2,relative_humidity_lag_3,wind_speed_lag_1,wind_speed_lag_2,wind_speed_lag_3,Pressure_lag_1,Pressure_lag_2,Pressure_lag_3,Temp_rolling_mean_7,Temp_rolling_mean_14,Temp_rolling_mean_21,relative_humidity_rolling_mean_7,relative_humidity_rolling_mean_14,relative_humidity_rolling_mean_21,wind_speed_rolling_mean_7,wind_speed_rolling_mean_14,wind_speed_rolling_mean_21,Pressure_rolling_mean_7,Pressure_rolling_mean_14,Pressure_rolling_mean_21
0,1/1/2020,-0.44,86.0,18.5,100.15,,,,,,,,,,,,,,,,,,,,,,,,
1,1/2/2020,-0.09,79.0,14.0,100.47,-0.44,,,86.0,,,18.5,,,100.15,,,,,,,,,,,,,,
2,1/3/2020,2.95,82.0,12.0,100.64,-0.09,-0.44,,79.0,86.0,,14.0,18.5,,100.47,100.15,,,,,,,,,,,,,
3,1/4/2020,-0.8,86.0,15.0,100.75,2.95,-0.09,-0.44,82.0,79.0,86.0,12.0,14.0,18.5,100.64,100.47,100.15,,,,,,,,,,,,
4,1/5/2020,-7.95,74.5,15.0,101.06,-0.8,2.95,-0.09,86.0,82.0,79.0,15.0,12.0,14.0,100.75,100.64,100.47,,,,,,,,,,,,
5,1/6/2020,-7.9,80.0,13.5,100.99,-7.95,-0.8,2.95,74.5,86.0,82.0,15.0,15.0,12.0,101.06,100.75,100.64,,,,,,,,,,,,
6,1/7/2020,-3.7,74.0,10.5,101.19,-7.9,-7.95,-0.8,80.0,74.5,86.0,13.5,15.0,15.0,100.99,101.06,100.75,-2.561429,,,80.214286,,,14.071429,,,100.75,,
7,1/8/2020,-8.15,78.0,24.5,101.87,-3.7,-7.9,-7.95,74.0,80.0,74.5,10.5,13.5,15.0,101.19,100.99,101.06,-3.662857,,,79.071429,,,14.928571,,,100.995714,,
8,1/9/2020,-14.4,67.0,15.0,103.51,-8.15,-3.7,-7.9,78.0,74.0,80.0,24.5,10.5,13.5,101.87,101.19,100.99,-5.707143,,,77.357143,,,15.071429,,,101.43,,
9,1/10/2020,-3.24,86.5,15.0,102.38,-14.4,-8.15,-3.7,67.0,78.0,74.0,15.0,24.5,10.5,103.51,101.87,101.19,-6.591429,,,78.0,,,15.5,,,101.678571,,


#### Fourier Transformation

In [10]:
from scipy.fft import fft

In [11]:

def apply_fourier_transform(data,column):
    values = data[column].values
    f = fft(values)
    data[f"{column}_fft"] = np.abs(f)
    return data

In [12]:
for column in ["Temp", "relative_humidity", "wind_speed", "Pressure"]:
    fourier_data = apply_fourier_transform(rolled_data,column)

## Training

In [14]:
train_size = int(len(fourier_data) * 0.8)
train_data, test_data = fourier_data[21:train_size], fourier_data[train_size:]

In [22]:
X_train = train_data.drop(["Temp"],axis = 1).set_index("DATE")
y_train = train_data[["DATE","Temp"]].set_index("DATE")

In [27]:
X_test = test_data.drop(["Temp"],axis = 1).set_index("DATE")
y_test = test_data[["DATE","Temp"]].set_index("DATE")

In [15]:
train_data.head()

Unnamed: 0,DATE,Temp,relative_humidity,wind_speed,Pressure,Temp_lag_1,Temp_lag_2,Temp_lag_3,relative_humidity_lag_1,relative_humidity_lag_2,relative_humidity_lag_3,wind_speed_lag_1,wind_speed_lag_2,wind_speed_lag_3,Pressure_lag_1,Pressure_lag_2,Pressure_lag_3,Temp_rolling_mean_7,Temp_rolling_mean_14,Temp_rolling_mean_21,relative_humidity_rolling_mean_7,relative_humidity_rolling_mean_14,relative_humidity_rolling_mean_21,wind_speed_rolling_mean_7,wind_speed_rolling_mean_14,wind_speed_rolling_mean_21,Pressure_rolling_mean_7,Pressure_rolling_mean_14,Pressure_rolling_mean_21,Temp_fft,relative_humidity_fft,wind_speed_fft,Pressure_fft
21,1/22/2020,-3.3,71.0,21.5,102.48,-12.45,-14.9,-12.25,70.0,67.0,77.5,15.5,18.0,18.5,102.86,102.7,101.21,-12.5,-9.195,-7.350952,73.0,79.071429,79.071429,19.285714,16.5,15.97619,102.548571,102.485714,101.989048,309.288579,243.25257,159.706275,30.861174
22,1/23/2020,-1.65,74.0,9.5,102.6,-3.3,-12.45,-14.9,71.0,70.0,67.0,21.5,15.5,18.0,102.48,102.86,102.7,-11.378571,-8.284286,-7.425238,71.857143,79.571429,78.833333,17.428571,16.107143,15.761905,102.584286,102.420714,102.090476,226.56165,306.223998,55.700913,19.780606
23,1/24/2020,-1.4,76.0,17.0,102.69,-1.65,-3.3,-12.45,74.0,71.0,70.0,9.5,21.5,15.5,102.6,102.48,102.86,-8.928571,-8.152857,-7.632381,73.142857,78.821429,78.547619,17.071429,16.25,16.0,102.388571,102.442857,102.188095,48.433586,708.088214,39.087506,54.892608
24,1/25/2020,-1.1,85.5,22.5,101.49,-1.4,-1.65,-3.3,76.0,74.0,71.0,17.0,9.5,21.5,102.69,102.6,102.48,-6.721429,-8.442143,-7.646667,74.428571,78.071429,78.52381,17.5,16.642857,16.357143,102.29,102.436429,102.223333,382.734213,212.914487,116.181539,23.940134
25,1/26/2020,0.64,95.5,13.0,100.33,-1.1,-1.4,-1.65,85.5,76.0,74.0,22.5,17.0,9.5,101.49,102.69,102.6,-4.88,-7.910714,-7.237619,77.0,79.035714,79.52381,16.714286,16.285714,16.261905,102.164286,102.315,102.188571,321.694799,397.196824,91.449985,41.356523


In [21]:
fig = px.imshow(train_data.drop(["DATE"], axis=1).corr().round(2), color_continuous_scale='RdBu_r', text_auto=True, aspect="auto",
                                width= 900,
                                height = 850,
                                range_color=[-1,1])
fig.update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
param_grid = {
    'learning_rate':[0.01,0.1,0.2],
    'max_depth':[3,5,7,10],
    'subsample':[0.8,0.9,1.0]
}

In [19]:
from xgboost import XGBRegressor

In [28]:
grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=3)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [29]:
xgb_model = XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train)

In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

predictions = xgb_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)

rmse = np.sqrt(mean_squared_error(y_test, predictions))

In [31]:
rmse

3.803062939881526

In [37]:
y_test["prediction"] = predictions

In [38]:
y_test.head()

Unnamed: 0_level_0,Temp,prediction
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2/23/2021,0.95,-2.195663
2/24/2021,-0.7,0.089377
2/25/2021,-7.6,-4.492254
2/26/2021,-7.4,-12.859139
2/27/2021,-4.69,-4.600336


In [45]:
y_test.keys()

Index(['Temp', 'prediction'], dtype='object')

In [57]:
df = pd.melt(y_test.reset_index(), id_vars="DATE",value_vars=list(y_test.keys()), value_name="Temperature")

In [59]:
df.head()

Unnamed: 0,DATE,variable,Temperature
0,2/23/2021,Temp,0.95
1,2/24/2021,Temp,-0.7
2,2/25/2021,Temp,-7.6
3,2/26/2021,Temp,-7.4
4,2/27/2021,Temp,-4.69


In [60]:
fig = px.line(df, x = "DATE", y = "Temperature", color = "variable")
fig.show()

In [None]:
from xgboost import

In [None]:
data['seasonal_feature'] = data['timestamp'].apply(lambda x: seasonal_pattern(x))