In [1]:
import pandas as pd
from sklearn.metrics import root_mean_squared_error
import joblib

In [2]:
X_train = pd.read_parquet('../data/output/X_train.parquet')
y_train = pd.read_parquet('../data/output/y_train.parquet')

pipeline = joblib.load('../output/model/baseline_linear_model.joblib')

In [3]:
y_train_pred = pipeline.predict(X_train)
rmse_train = root_mean_squared_error(y_train, y_train_pred)

print(f"Training RMSE: {rmse_train}")

Training RMSE: 3.3135078225075048


In [4]:
df_results = X_train[['event_id', 'price_code', 'opponent', 'event_date', 'calculate_date']].copy()
df_results['y_true'] = y_train
df_results['y_pred'] = y_train_pred
df_results.head(50)

Unnamed: 0,event_id,price_code,opponent,event_date,calculate_date,y_true,y_pred
0,11005B3CED352DAF,A,Ducks,2022-02-19,2022-01-20,-3.0,-9.359383
1,11005B3CED352DAF,J,Ducks,2022-02-19,2022-01-20,4.0,-6.374559
2,11005B3CED352DAF,K,Ducks,2022-02-19,2022-01-20,12.0,-8.986852
3,11005B3CED352DAF,O,Ducks,2022-02-19,2022-01-20,-3.0,-0.746644
4,11005B3CED352DAF,0,Ducks,2022-02-19,2022-01-20,0.0,-1.377481
5,11005B3CED352DAF,V,Ducks,2022-02-19,2022-01-20,0.0,-0.364522
6,11005B3CED352DAF,6,Ducks,2022-02-19,2022-01-20,0.0,-5.705637
7,11005B3CED352DAF,8,Ducks,2022-02-19,2022-01-20,19.0,-15.143325
8,11005B3CED352DAF,M,Ducks,2022-02-19,2022-01-20,-1.0,-3.403435
9,11005B3CED352DAF,2,Ducks,2022-02-19,2022-01-20,-4.0,-13.317364


In [5]:
# Get the feature names from the column transformer
ct = pipeline.named_steps['preprocessor']
feature_names = ct.get_feature_names_out()

# Get the coefficients from the Ridge model
coefficients = pipeline.named_steps['model'].coef_.ravel()

# Create a DataFrame with feature names and coefficients
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Display the DataFrame sorted by absolute value of Coefficient
feature_importances.reindex(feature_importances.Coefficient.sort_values(ascending=False).index)

Unnamed: 0,Feature,Coefficient
10,standardscaler__prp_forwardtix,1.914254
7,standardscaler__pcp_forwardtix,1.586373
20,standardscaler__pio_forwardrev,1.058774
12,standardscaler__prg_currentprice,0.925972
17,standardscaler__pmt_forwardrev,0.808934
...,...,...
18,standardscaler__pio_currentprice,-0.922560
11,standardscaler__prp_forwardrev,-1.212185
19,standardscaler__pio_forwardtix,-1.243604
8,standardscaler__pcp_forwardrev,-1.325960


In [6]:
opponent_features = feature_importances[feature_importances['Feature'].str.startswith('onehotencoder__opponent_')]

# Display the DataFrame sorted by Importance
opponent_features.sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
100,onehotencoder__opponent_Leafs,0.403595
102,onehotencoder__opponent_Oilers,0.314949
93,onehotencoder__opponent_Ducks,0.232301
111,onehotencoder__opponent_Stars,0.212729
97,onehotencoder__opponent_Islanders,0.204895
99,onehotencoder__opponent_Kings,0.130145
90,onehotencoder__opponent_Capitals,0.117241
107,onehotencoder__opponent_Sabres,0.103512
94,onehotencoder__opponent_Flames,0.096417
87,onehotencoder__opponent_Blues,0.085554
