# Ridge Regression

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge

In [2]:
# Load the pre-processed training and test splits from CSV files
# located relative to the notebook's working directory.
TRAIN_CSV = "processed_training.csv"
TEST_CSV = "processed_test.csv"

data_train = pd.read_csv(TRAIN_CSV)
data_test = pd.read_csv(TEST_CSV)

### Data Preparation

In [3]:
from sklearn.utils import shuffle

# Fixed seed so the row shuffle is reproducible across kernel restarts.
SHUFFLE_SEED = 42
# Columns removed before modelling, and categorical columns that get one-hot encoded.
DROPPED_COLS = ['venue', 'start_hour']
CATEGORICAL_COLS = ['team1_name', 'team2_name', 'season_type']

# Restrict the test frame to the training frame's columns, in the same order.
train_cols = data_train.columns.tolist()
data_test = data_test[train_cols].copy()

# Non-inplace drops: the frames read from disk are not mutated behind the reader's back.
data_train = data_train.drop(columns=DROPPED_COLS)
data_test = data_test.drop(columns=DROPPED_COLS)

# One-hot encode the categorical columns; each frame is encoded independently,
# so the resulting dummy columns depend on the categories present in that frame.
data_train = pd.get_dummies(data_train, columns=CATEGORICAL_COLS)
data_test = pd.get_dummies(data_test, columns=CATEGORICAL_COLS)

# ensure the train and test data have the same columns
# (both printed sets should be empty; anything listed is a category seen in only one split)
train_cols = set(data_train.columns)
test_cols = set(data_test.columns)

print(train_cols - test_cols)
print(test_cols - train_cols)

# Enforce the agreement instead of only reporting it: give the test frame exactly
# the training columns in the training order. Dummy columns missing from the test
# split are filled with 0, and test-only columns are discarded, so every column of
# X_test lines up with the coefficient fitted for the same column of X_train.
data_test = data_test.reindex(columns=data_train.columns, fill_value=0)

# Shuffle the training rows (seeded for reproducibility).
data_train = shuffle(data_train, random_state=SHUFFLE_SEED)

# Split target and features. dtype=float keeps the feature matrices numeric even
# when get_dummies emits boolean dummy columns (which would otherwise produce an
# object array when mixed with float columns).
Y_train = data_train['attendance'].to_numpy()
X_train = data_train.drop(columns=['attendance']).to_numpy(dtype=float)

Y_test = data_test['attendance'].to_numpy()
X_test = data_test.drop(columns=['attendance']).to_numpy(dtype=float)

# print shape of data
print("X_train: ", X_train.shape)
print("Y_train", Y_train.shape)
print("X_test", X_test.shape)
print("Y_test", Y_test.shape)

set()
set()
X_train:  (12270, 273)
Y_train (12270,)
X_test (2445, 273)
Y_test (2445,)


### KFold Cross Validation (with 5 folds)

In [4]:
# use 5-folds cross validation to find the best alpha
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

# shuffle=True without a random_state draws a different fold assignment on every
# run, which can change the selected alpha; a fixed seed makes the search repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(X_train)

5

In [5]:
# Candidate regularisation strengths for the grid search:
# 20 values evenly spaced on a log10 scale from 1e-3 to 1e3.
alphas = np.logspace(start=-3, stop=3, num=20)
print(alphas)

[1.00000000e-03 2.06913808e-03 4.28133240e-03 8.85866790e-03
 1.83298071e-02 3.79269019e-02 7.84759970e-02 1.62377674e-01
 3.35981829e-01 6.95192796e-01 1.43844989e+00 2.97635144e+00
 6.15848211e+00 1.27427499e+01 2.63665090e+01 5.45559478e+01
 1.12883789e+02 2.33572147e+02 4.83293024e+02 1.00000000e+03]


In [6]:
# Exhaustive search over the candidate alphas, using the K-fold splitter defined
# above and scoring each candidate by negative RMSE (higher is better).
model = Ridge()
grid = GridSearchCV(
    estimator=model,
    param_grid={"alpha": alphas},
    cv=kf,
    scoring="neg_root_mean_squared_error",
)
grid.fit(X_train, Y_train)

print("best alpha: ", grid.best_estimator_.alpha)


best alpha:  2.976351441631316


#### Train the model with the best alpha

In [7]:
# Alpha selected by the cross-validated grid search.
best_alpha = grid.best_estimator_.alpha

# Fit a fresh Ridge model on the full training set with that alpha.
best_model = Ridge(alpha=best_alpha).fit(X_train, Y_train)
best_model

Ridge(alpha=2.976351441631316)

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# In-sample error of the fitted model on the training data.
train_predict = best_model.predict(X_train)

train_mse = mean_squared_error(Y_train, train_predict)
train_rmse = np.sqrt(train_mse)  # RMSE is the square root of the MSE computed above
train_mae = mean_absolute_error(Y_train, train_predict)
train_mape = mean_absolute_percentage_error(Y_train, train_predict)  # a fraction, not a percentage

print("Train MSE: ", train_mse)
print("Train RMSE: ", train_rmse)
print("Train MAE: ", train_mae)
print("Train MAPE: ", train_mape)

Train MSE:  26453178.283040363
Train RMSE:  5143.265332747316
Train MAE:  3986.715834617138
Train MAPE:  0.1605140671248119


In [9]:
# Pair each fitted coefficient with its feature name (every column except the
# target), then show the 10 largest (most positive) coefficients.
feature_names = data_train.columns.drop('attendance')
coef = pd.Series(best_model.coef_, index=feature_names)
coef_descending = coef.sort_values(ascending=False)
print("top 10 features with the highest coefficients:\n", coef_descending.head(10))

top 10 features with the highest coefficients:
 team2_name_LAD                 7248.621969
team2_name_STL                 5542.191659
team2_name_LAA                 5473.253064
day_Saturday                   4799.031389
team2_name_SFG                 4683.151973
team1_name_BOS                 4233.115932
team2_name_NYY                 4228.676330
home_team_avg_att_last_year    3899.217351
team1_name_CHC                 3814.660223
team1_name_NYY                 3713.577660
dtype: float64


In [10]:
# The 10 smallest (most negative) coefficients: the tail of the descending sort,
# so the most negative value is printed last.
lowest_coef = coef.sort_values(ascending=False).tail(10)
print("top 10 features with the lowest coefficients:\n", lowest_coef)

top 10 features with the lowest coefficients:
 team2_name_BAL      -3200.742975
team1_name_COL      -3370.459045
team2_name_OAK      -3423.039754
team2_name_PIT      -4498.787818
team2_pre_loss      -4831.469553
team2_name_MIA      -5096.218400
team2_pre_win_pct   -5476.854859
team2_name_CLE      -6421.014448
team2_name_TBR      -6670.370595
team1_pre_win_pct   -7367.872378
dtype: float64


In [11]:
# Rank features by coefficient magnitude, regardless of sign, and show the 10 largest.
# Note: the printed values are absolute values, so the sign of each coefficient is not shown.
coef_magnitude = coef.abs()
largest_magnitude = coef_magnitude.sort_values(ascending=False).head(10)
print("top 10 features with the highest coefficients with absolute value:\n", largest_magnitude)

top 10 features with the highest coefficients with absolute value:
 team1_pre_win_pct    7367.872378
team2_name_LAD       7248.621969
team2_name_TBR       6670.370595
team2_name_CLE       6421.014448
team2_name_STL       5542.191659
team2_pre_win_pct    5476.854859
team2_name_LAA       5473.253064
team2_name_MIA       5096.218400
team2_pre_loss       4831.469553
day_Saturday         4799.031389
dtype: float64


#### Evaluate the model on the test data

In [12]:
# Out-of-sample error of the fitted model on the held-out test data.
test_predict = best_model.predict(X_test)

test_mse = mean_squared_error(Y_test, test_predict)
for label, value in (
    ("Test MSE: ", test_mse),
    ("Test RMSE: ", np.sqrt(test_mse)),  # RMSE derived from the MSE computed once above
    ("Test MAE: ", mean_absolute_error(Y_test, test_predict)),
    ("Test MAPE: ", mean_absolute_percentage_error(Y_test, test_predict)),
):
    print(label, value)

Test MSE:  41186911.19626421
Test RMSE:  6417.702953258604
Test MAE:  4965.704296100361
Test MAPE:  0.22846001764434362
