# Multi-Station Supervised models

## General Imports

In [1]:
import os
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.dummy import DummyRegressor
import altair as alt
from sklearn.model_selection import cross_val_score
from skopt.space import Integer
from skopt.utils import use_named_args
import numpy as np
from skopt import gp_minimize
from skopt.plots import plot_convergence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Small dataset (2022 only) 
### Get cleaned data from pickle file 

In [2]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data_cleaning','cleanweathersmall.pkl')

In [3]:
df = pd.read_pickle(cln_pkl_loc)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3765246 entries, 0 to 3765245
Data columns (total 9 columns):
 #   Column   Dtype         
---  ------   -----         
 0   station  object        
 1   time     datetime64[ns]
 2   temp     float64       
 3   dwpt     float64       
 4   rhum     float64       
 5   prcp     float64       
 6   wdir     float64       
 7   wspd     float64       
 8   pres     float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 258.5+ MB


### Basic data cleaning to build necessary features

In [6]:
pivoted_df = df.pivot(index='time', columns='station', values=['temp', 'dwpt','rhum','prcp','wdir','wspd','pres'])
pivoted_df.columns = ['_'.join(col) for col in pivoted_df.columns.values]
pivoted_df

Unnamed: 0_level_0,temp_0CNUO,temp_0CO7B,temp_0FV1F,temp_1J1PJ,temp_1JWST,temp_20QWH,temp_21O3U,temp_2W8UZ,temp_3S56J,temp_4DUJO,...,pres_W5F5F,pres_X9FED,pres_XM44W,pres_Y59TE,pres_Z7ZOG,pres_ZFZUV,pres_ZNWZW,pres_ZUQJS,pres_ZWC6W,pres_ZYITU
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,-25.1,5.0,1.2,-0.9,-6.5,-28.7,-29.0,4.6,-26.5,1.8,...,,,1013.5,,,1023.8,,,1010.8,
2022-01-01 01:00:00,-25.7,5.1,0.9,-0.6,-7.5,-29.5,-28.9,4.6,-27.3,1.6,...,,,1014.6,,,1024.2,,,1011.8,
2022-01-01 02:00:00,-26.2,4.2,1.5,-0.8,-8.6,-30.2,-30.1,4.7,-27.9,0.9,...,,,1015.1,,,1024.9,,,1012.7,
2022-01-01 03:00:00,-26.1,5.5,4.1,-0.9,-9.4,-29.9,-30.1,4.6,-28.1,1.0,...,,,1016.0,,,1025.4,,,1013.6,
2022-01-01 04:00:00,-26.5,5.9,3.1,-1.2,-10.1,-30.4,-31.4,4.9,-28.6,0.5,...,,,1017.3,,,1025.4,,,1013.9,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-22 19:00:00,12.4,17.8,13.7,11.5,10.2,13.5,12.5,15.2,10.8,12.4,...,1022.1,1025.2,1022.6,1025.0,1025.3,1024.5,1025.4,1022.6,1017.8,1016.7
2022-09-22 20:00:00,13.0,17.0,14.3,12.0,10.4,14.0,12.5,14.8,11.4,11.6,...,1020.3,1024.4,1022.4,1024.0,1024.0,1023.1,1025.0,1022.2,1017.9,1016.0
2022-09-22 21:00:00,13.2,16.6,13.9,12.4,10.5,14.3,12.4,15.1,11.7,10.8,...,1019.6,1024.2,1022.2,1023.0,1023.7,1022.0,1024.0,1022.4,1018.1,1017.4
2022-09-22 22:00:00,11.9,16.2,12.4,12.5,9.9,13.5,12.7,14.4,11.8,9.9,...,1018.5,1023.3,1021.9,1023.0,1023.1,1021.1,1023.0,1022.3,1018.4,1017.8


### Our target is Ann Arbor which is station __"KARB0"__, so pulling those features out. And we want to predict the weather 24 hours in the future, so need to duplicate and shift the features while doing some more basic cleaning 

In [8]:
ann_arbor_cols = [col for col in pivoted_df.columns if "KARB0" in col]
ann_arbor_df = pivoted_df[ann_arbor_cols].copy()
for col in ann_arbor_df.columns:
    ann_arbor_df[f'24 hr~{col}'] = ann_arbor_df[col].shift(-24)
ann_arbor_df = ann_arbor_df.rename_axis(None, axis = 0)
ann_arbor_df.head(5)

Unnamed: 0,temp_KARB0,dwpt_KARB0,rhum_KARB0,prcp_KARB0,wdir_KARB0,wspd_KARB0,pres_KARB0,24 hr~temp_KARB0,24 hr~dwpt_KARB0,24 hr~rhum_KARB0,24 hr~prcp_KARB0,24 hr~wdir_KARB0,24 hr~wspd_KARB0,24 hr~pres_KARB0
2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,11.0,1005.0,-2.2,-3.9,88.0,0.6,50.0,16.6,1010.0
2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4,-2.8,-5.6,81.0,0.7,30.0,16.6,1010.2
2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3,-3.3,-5.6,84.0,0.4,30.0,24.1,1009.9
2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9,-3.9,-6.1,85.0,0.3,20.0,18.4,1010.3
2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9,-4.4,-6.1,88.0,0.3,30.0,16.6,1010.1


### We need to merge the new features with the main dataframe so we have not only the Ann Arbor measurements, but also all measurements from surrounding stations.

In [29]:
pred_df = pd.merge(pivoted_df,ann_arbor_df, left_index=True, right_index=True)
pred_df = pred_df[pred_df['24 hr~temp_KARB0'].notna()]
print(pred_df.shape)
s=pred_df.isna().sum(axis=0).sort_values(ascending=False)
pd.cut(s, 10)


(6336, 4263)


prcp_6N2T2          (4599.9, 5111.0]
rhum_6N2T2          (4599.9, 5111.0]
dwpt_6N2T2          (4599.9, 5111.0]
wdir_6N2T2          (4599.9, 5111.0]
wspd_6N2T2          (4599.9, 5111.0]
                          ...       
rhum_KDRM0           (-5.111, 511.1]
rhum_KDTL0           (-5.111, 511.1]
rhum_KDUH0           (-5.111, 511.1]
rhum_KDVP0           (-5.111, 511.1]
24 hr~pres_KARB0     (-5.111, 511.1]
Length: 4263, dtype: category
Categories (10, interval[float64, right]): [(-5.111, 511.1] < (511.1, 1022.2] < (1022.2, 1533.3] < (1533.3, 2044.4] ... (3066.6, 3577.7] < (3577.7, 4088.8] < (4088.8, 4599.9] < (4599.9, 5111.0]]

### There are a lot of features with excessive amounts of null values to get rid of. Dropping any with more than 500 missing values still leaves a sufficient number of features for predicting

In [30]:
to_drop = []
for col in pred_df.columns:
    num = pred_df[col].isna().sum()
    if num > 500:
        # print(f"{col} has {num} missing values")
        to_drop.append(col)
pred_df.drop(columns=to_drop,inplace=True)
pred_df.dropna(inplace=True)
pred_df.shape

(4729, 4030)

### Now our target will be the '-24hr~temp_KARB0' column, and our features to use in our first prediction model will be all of the measurements at every surrounding station 24 hours prior to our target.
This cell will run 5 fold cross-validate on our 3 chosen regression models (Extra Trees Regressor, Lasso Regressor, and Tweedie Regressor). This will show how the average accuracy scores compare across these models on this data set.

In [34]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
X_cols = [col for col in pred_df.columns if "~" not in col]
X = pred_df[X_cols]
y = pred_df['24 hr~temp_KARB0']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=696)
xt_reg = ExtraTreesRegressor(random_state=696,n_jobs=-1)
lasso_reg = linear_model.Lasso(alpha=0.1,max_iter=1500)
tw_reg = linear_model.TweedieRegressor(max_iter=250)
dummy_reg = DummyRegressor(strategy="median")
models = {'Extra Trees Regressor':xt_reg,
          'Lasso Regressor':lasso_reg,
          'Tweedie Regressor':tw_reg,
          'Dummy Regressor':dummy_reg}
for key, value in models.items():
    value = make_pipeline(StandardScaler(), value)
    cv_results = cross_validate(value, X_train, y_train, cv=5,n_jobs=-1)
    print(key)
    print("Mean accuracy score: ", end="")
    print(round(cv_results['test_score'].mean(),3), end="")
    print(", best accuracy score: ", end="")
    print(round(cv_results['test_score'].max(),3), end="")
    print(", with std dev of: ", end="")
    print(round(cv_results['test_score'].std(),3))
    print("Mean training time: ", end="")
    print(round(cv_results['score_time'].mean(),3))
    print(f"Score on hold out set: {round(value.fit(X_train, y_train).score(X_test, y_test),3)}")
    print("**************")

Extra Trees Regressor
Mean accuracy score: 0.973, best accuracy score: 0.977, with std dev of: 0.002
Mean training time: 0.119
Score on hold out set: 0.982
**************
Lasso Regressor
Mean accuracy score: 0.942, best accuracy score: 0.944, with std dev of: 0.002
Mean training time: 0.05


  model = cd_fast.enet_coordinate_descent(


Score on hold out set: 0.95
**************
Tweedie Regressor
Mean accuracy score: 0.95, best accuracy score: 0.953, with std dev of: 0.003
Mean training time: 0.046
Score on hold out set: 0.959
**************
Dummy Regressor
Mean accuracy score: -0.071, best accuracy score: -0.031, with std dev of: 0.027
Mean training time: 0.049
Score on hold out set: -0.073
**************


### Examine the feature importances in the best performing model (Extra Trees Regressor)

In [35]:
feature_importance_df = pd.DataFrame([X.columns, xt_reg.feature_importances_]).transpose()
feature_importance_df.columns = ['Feature', 'Importance']
feature_importance_df.sort_values('Importance',ascending=False,inplace=True)
feature_importance_df.head(10)

Unnamed: 0,Feature,Importance
371,temp_KISW0,0.04909
142,temp_72741,0.03691
486,temp_KRHI0,0.034794
269,temp_KCWA0,0.03135
405,temp_KMFI0,0.030644
392,temp_KLVN0,0.028024
128,temp_72643,0.027615
460,temp_KPCZ0,0.027418
271,temp_KD250,0.021453
260,temp_KCLI0,0.021165


In [36]:
import altair as alt
alt.Chart(feature_importance_df[:5]).mark_bar().encode(
    x=alt.X('Importance:Q', axis=alt.Axis(format="%", tickSize=0, labelFontSize=12)),
    y=alt.Y(
        'Feature:N', sort=list(feature_importance_df[:5].Feature), title="",
        axis=alt.Axis(tickSize=0, labelFontSize=12, labelPadding=10)),
).properties(
    height=200
)

### Hyper-parameter tuning the Extra Trees Regressor
5 fold cross validate looking for the optimized estimators, depth, sample split, and sample leaf parameters. Evaluating the 'best' based on the mean squared error achieved.

In [42]:
# %%timeit -r 1 -n 1
space  = [Integer(100,200, name='n_estimators'),
          Integer(1, 50, name='max_depth'),
          Integer(2, 100, name='min_samples_split'),
          Integer(1, 100, name='min_samples_leaf')]

@use_named_args(space)
def objective(**params):
    xt_reg.set_params(**params)

    return -np.mean(cross_val_score(xt_reg, X_train, y_train, cv=5, n_jobs=-1))

res_gp = gp_minimize(objective, space, n_calls=15, random_state=696)

print(f"Best score: {res_gp.fun}")
print("Best parameters:")
print(f" - n-estimators= {res_gp.x[0]}")
print(f" - max_depth= {res_gp.x[1]}")
print(f" - min_samples_split= {res_gp.x[2]}")
print(f" - min_samples_leaf=  {res_gp.x[3]}")

Best score: -0.9731533162527466
Best parameters:
 - n-estimators= 101
 - max_depth= 50
 - min_samples_split= 2
 - min_samples_leaf=  1
9min 17s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Visualize the convergence for the above hypertuning

In [44]:
plot_convergence(res_gp)

NameError: name 'res_gp' is not defined

### Final re-run of the Extra Trees Model with the tuned hyperparameters
There is a very minor improvement on the accuracy score, with some pretty significant increases in training time.

In [45]:
xt_reg = ExtraTreesRegressor(n_estimators=res_gp.x[0], 
                             max_depth=res_gp.x[1], 
                             min_samples_split=res_gp.x[2], 
                             min_samples_leaf=res_gp.x[3], 
                             random_state=696,n_jobs=-1
                            )
models = {'Extra Trees Regressor':xt_reg}
for key, value in models.items():
    cv_results = cross_validate(value, X_train, y_train, cv=5,n_jobs=-1)
    print(key)
    print("Mean accuracy score: ", end="")
    print(round(cv_results['test_score'].mean(),3), end="")
    print(", best accuracy score: ", end="")
    print(round(cv_results['test_score'].max(),3), end="")
    print(", with std dev of: ", end="")
    print(round(cv_results['test_score'].std(),3))
    print("Mean training time: ", end="")
    print(round(cv_results['score_time'].mean(),3))
    print(f"Score on hold out set: {round(value.fit(X_train, y_train).score(X_test, y_test),3)}")
    print("**************")

Extra Trees Regressor
Mean accuracy score: 0.973, best accuracy score: 0.977, with std dev of: 0.002
Mean training time: 0.154
Score on hold out set: 0.982
**************
