# Lasso

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso

In [2]:
data_train = pd.read_csv("processed_training.csv")
data_test = pd.read_csv("processed_test.csv")

### Data Preparation

##### ensure the column orders are the same in train and test

In [3]:
train_cols = data_train.columns.tolist()
data_test = data_test[train_cols].copy()

##### drop unnecessary columns, these columns won't be useful in analysis and prediction

In [4]:
data_train.drop(['venue', 'start_hour'], axis=1, inplace=True)
data_test.drop(['venue', 'start_hour'], axis=1, inplace=True)

##### get dummies for team names and season type

In [5]:
data_train = pd.get_dummies(data_train, columns=['team1_name', 'team2_name', 'season_type'])
data_test = pd.get_dummies(data_test, columns=['team1_name', 'team2_name', 'season_type'])

In [6]:
# ensure the train and test data have the same columns

train_cols = set(data_train.columns)
test_cols = set(data_test.columns)

print(train_cols - test_cols)
print(test_cols - train_cols)

set()
set()


##### shuffle the training data

In [7]:
from sklearn.utils import shuffle

data_train = shuffle(data_train)

##### prepare numpy arrays for training and testing

In [9]:
Y_train = data_train['attendance'].to_numpy()
X_train = data_train.drop(['attendance'], axis=1).to_numpy()

Y_test = data_test['attendance'].to_numpy()
X_test = data_test.drop(['attendance'], axis=1).to_numpy()

# print shape of data
print("X_train: ", X_train.shape)
print("Y_train", Y_train.shape)
print("X_test", X_test.shape)
print("Y_test", Y_test.shape)

X_train:  (12270, 273)
Y_train (12270,)
X_test (2445, 273)
Y_test (2445,)


### KFold Cross Validation (with 5 folds)

In [15]:
# use 5-folds cross validation to find the best alpha
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=5, shuffle=True)
kf.get_n_splits(X_train)

5

In [16]:
# use grid search to find the best alpha
alphas = np.logspace(-3, 3, 20)
print(alphas)

[1.00000000e-03 2.06913808e-03 4.28133240e-03 8.85866790e-03
 1.83298071e-02 3.79269019e-02 7.84759970e-02 1.62377674e-01
 3.35981829e-01 6.95192796e-01 1.43844989e+00 2.97635144e+00
 6.15848211e+00 1.27427499e+01 2.63665090e+01 5.45559478e+01
 1.12883789e+02 2.33572147e+02 4.83293024e+02 1.00000000e+03]


In [17]:
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas), cv=kf, scoring='neg_root_mean_squared_error')
grid.fit(X_train, Y_train)

print("best alpha: ", grid.best_estimator_.alpha)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

best alpha:  1.438449888287663


#### Train the model with the best alpha

In [18]:
best_alpha = grid.best_estimator_.alpha

# train the model with the best alpha
best_model = Lasso(alpha=best_alpha)
best_model.fit(X_train, Y_train)

Lasso(alpha=1.438449888287663)

In [33]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

train_predict = best_model.predict(X_train)
print("Train MSE: ", mean_squared_error(Y_train, train_predict))
print("Train RMSE: ", np.sqrt(mean_squared_error(Y_train, train_predict)))
print("Train MAE: ", mean_absolute_error(Y_train, train_predict))
print("Train MAPE: ", mean_absolute_percentage_error(Y_train, train_predict))

Train MSE:  26513520.457153033
Train RMSE:  5149.128125921226
Train MAE:  3991.115002779432
Train MAPE:  0.16083612709503323


In [41]:
# print top 10 features with the highest coefficients
coef = pd.Series(best_model.coef_, index = data_train.drop(['attendance'], axis=1).columns)
print("top 10 features with the highest coefficients:\n", coef.sort_values(ascending=False)[:10])

top 10 features with the highest coefficients:
 team2_name_LAD                 7305.482297
day_Saturday                   6431.053088
team2_name_STL                 5629.571741
team2_name_LAA                 5453.787896
season_type_post               5371.029199
team2_name_SFG                 4520.970190
team2_name_NYY                 4327.694048
day_Friday                     4034.774946
home_team_avg_att_last_year    3945.660824
team1_name_BOS                 3735.893541
dtype: float64


In [42]:
# also print the top 10 features with the lowest coefficients
print("top 10 features with the lowest coefficients:\n", coef.sort_values(ascending=False)[-10:])

top 10 features with the lowest coefficients:
 team1_name_COL      -2801.880070
team1_name_TOR      -2858.161705
team2_name_OAK      -3123.084038
team2_name_PIT      -3931.846731
team2_pre_loss      -4760.459016
team2_name_MIA      -4885.717895
team2_pre_win_pct   -5431.893576
team2_name_CLE      -5612.353136
team2_name_TBR      -6081.879244
team1_pre_win_pct   -7384.390911
dtype: float64


In [43]:
# print top 10 features with the highest coefficients with absolute value
print("top 10 features with the highest coefficients with absolute value:\n", coef.abs().sort_values(ascending=False)[:10])

top 10 features with the highest coefficients with absolute value:
 team1_pre_win_pct    7384.390911
team2_name_LAD       7305.482297
day_Saturday         6431.053088
team2_name_TBR       6081.879244
team2_name_STL       5629.571741
team2_name_CLE       5612.353136
team2_name_LAA       5453.787896
team2_pre_win_pct    5431.893576
season_type_post     5371.029199
team2_name_MIA       4885.717895
dtype: float64


#### test on test data

In [44]:
test_predict = best_model.predict(X_test)
print("Test MSE: ", mean_squared_error(Y_test, test_predict))
print("Test RMSE: ", np.sqrt(mean_squared_error(Y_test, test_predict)))
print("Test MAE: ", mean_absolute_error(Y_test, test_predict))
print("Test MAPE: ", mean_absolute_percentage_error(Y_test, test_predict))

Test MSE:  40177567.7410704
Test RMSE:  6338.577738031648
Test MAE:  4907.864543480457
Test MAPE:  0.22711703126517055
