# SVM

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR, LinearSVR

## Read dataset

In [2]:
# Load the pre-split training and test tables from their CSV files.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('trainset_w_lle.csv', 'testset_w_lle.csv')
)
# Preview the first rows of the training table.
data_train.head()

Unnamed: 0,attendance,is_federal_holiday,venue,on_grass,temperature,wind_speed,team1_name,team1_pre_win,team1_pre_loss,team1_pre_win_pct,...,start_hour,start_hour_label_afternoon,start_hour_label_evening,start_hour_label_night,start_hour_label_noon,game_page_url,start_time,previous_5_to_10MA,lle1,lle2
0,35055,0,Wrigley Field,1,-2.786672,-0.09309,STL,-1.62645,-1.654514,0.0,...,19,0,1,0,0,https://www.baseball-reference.com/boxes/CHN/C...,2015-04-05 19:17:00,32742.135802,0.007683,0.000329
1,45030,0,Comerica Park,1,-2.031671,-0.489072,MIN,-1.62645,-1.654514,0.0,...,13,0,0,0,1,https://www.baseball-reference.com/boxes/DET/D...,2015-04-06 13:08:00,36014.925926,0.008566,0.006095
2,45909,0,Safeco Field,1,-1.842921,-1.281036,LAA,-1.62645,-1.654514,0.0,...,13,0,0,0,1,https://www.baseball-reference.com/boxes/SEA/S...,2015-04-06 13:12:00,25485.604938,0.003695,0.009393
3,53518,0,Dodger Stadium,1,-0.804795,-1.281036,SDP,-1.62645,-1.654514,0.0,...,13,0,0,0,1,https://www.baseball-reference.com/boxes/LAN/L...,2015-04-06 13:12:00,46695.518519,0.007517,0.00231
4,48469,0,Yankee Stadium III,1,-1.842921,-0.489072,TOR,-1.62645,-1.654514,0.0,...,13,0,0,0,1,https://www.baseball-reference.com/boxes/NYA/N...,2015-04-06 13:13:00,42520.3,0.006798,-0.005155


In [3]:
# Show the column names of the training table as loaded from the CSV.
data_train.columns

Index(['attendance', 'is_federal_holiday', 'venue', 'on_grass', 'temperature',
       'wind_speed', 'team1_name', 'team1_pre_win', 'team1_pre_loss',
       'team1_pre_win_pct', 'team1_streak', 'team2_name', 'team2_pre_win',
       'team2_pre_loss', 'team2_pre_win_pct', 'team2_streak', 'salary-500-800',
       'salary-800-1500', 'salary-1500', 'day_Friday', 'day_Monday',
       'day_Saturday', 'day_Sunday', 'day_Thursday', 'day_Tuesday',
       'day_Wednesday', 'month_April', 'month_August', 'month_July',
       'month_June', 'month_March', 'month_May', 'month_November',
       'month_October', 'month_September', 'weather_Cloudy', 'weather_Drizzle',
       'weather_In Dome', 'weather_Overcast', 'weather_Rain', 'weather_Sunny',
       'season_type', 'season', 'home_team_avg_att_last_year', 'start_hour',
       'start_hour_label_afternoon', 'start_hour_label_evening',
       'start_hour_label_night', 'start_hour_label_noon', 'game_page_url',
       'start_time', 'previous_5_to_10MA', 'lle1', 'lle2'],
      dtype='object')

In [4]:
from sklearn.preprocessing import StandardScaler

# Columns to rescale to zero mean and unit variance.
number_col = ['previous_5_to_10MA']

# Learn the mean/variance from the training rows only and scale them in the same step ...
xscaler = StandardScaler()
data_train[number_col] = xscaler.fit_transform(data_train[number_col])
# ... then apply those same training statistics to the test rows.
data_test[number_col] = xscaler.transform(data_test[number_col])

In [5]:
# Columns removed from both tables before modelling.
dropped_cols = ['venue', 'start_hour', 'start_time', 'game_page_url']
for frame in (data_train, data_test):
    # In place, so the names data_train / data_test keep referring to the trimmed tables.
    frame.drop(columns=dropped_cols, inplace=True)

In [6]:
# One-hot encode the categorical columns, using one shared column list for both tables.
categorical_cols = ['team1_name', 'team2_name', 'season_type']
data_train, data_test = (
    pd.get_dummies(frame, columns=categorical_cols)
    for frame in (data_train, data_test)
)

In [7]:
# Give the test table exactly the training table's columns, in the same order.
# get_dummies was applied to each table separately, so a category that occurs only in
# the training rows has no indicator column in the test table. Selecting with
# `data_test[train_cols]` raises KeyError in that case; reindex instead creates the
# missing column filled with 0 ("category absent") and discards any indicator column
# that exists only in the test table.
train_cols = data_train.columns.tolist()
data_test = data_test.reindex(columns=train_cols, fill_value=0)

In [8]:
# Sanity check: both printed sets are empty when the two tables have the same column names.
train_names, test_names = set(data_train.columns), set(data_test.columns)
print(train_names.difference(test_names))
print(test_names.difference(train_names))

set()
set()


In [9]:
# Show the training table's column names after the drop and one-hot encoding steps.
data_train.columns

Index(['attendance', 'is_federal_holiday', 'on_grass', 'temperature',
       'wind_speed', 'team1_pre_win', 'team1_pre_loss', 'team1_pre_win_pct',
       'team1_streak', 'team2_pre_win',
       ...
       'team2_name_SDP', 'team2_name_SEA', 'team2_name_SFG', 'team2_name_STL',
       'team2_name_TBR', 'team2_name_TEX', 'team2_name_TOR', 'team2_name_WSN',
       'season_type_post', 'season_type_regular'],
      dtype='object', length=109)

In [10]:
from sklearn.utils import shuffle

# Shuffle the training rows with a fixed seed so the notebook is reproducible.
# The grid search below uses cv=5, which (per scikit-learn's documented behaviour for
# regressors) splits the rows into consecutive, un-shuffled folds; the row order set
# here therefore decides fold membership and every cross-validation score.
data_train = shuffle(data_train, random_state=42)

In [11]:
# Separate the target from each table. pop() returns the 'attendance' column and removes
# it from the frame in place, so data_train / data_test hold only features afterwards.
y_train = data_train.pop('attendance').to_numpy()
y_test = data_test.pop('attendance').to_numpy()

# Feature matrices as plain NumPy arrays for scikit-learn.
x_train, x_test = data_train.to_numpy(), data_test.to_numpy()

print(x_train.shape)

(12270, 108)


## Hyperparameter grid search with 5-fold cross-validation

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Metrics recorded for every candidate (scikit-learn scorers are "higher is better",
# hence the neg_ prefix on the error metrics).
scoring = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_mean_absolute_percentage_error',
]
# Grid actually searched: three kernels x two values of C.
parameters = {
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [10, 100],
}
# A wider grid, left commented out:
# 'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),'C' : [5, 10, 30, 50, 100],'degree' : [3,8],'coef0' : [0.01,10,0.5],'gamma' : ('auto','scale')
# NOTE(review): in the recorded results rbf and sigmoid score almost identically to each
# other and far worse than linear; the target is raw attendance, so scaling the target or
# trying much larger C may be worth checking - confirm.
model = SVR()
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=scoring,
    # With several metrics, refit names the one used to pick and re-fit the best candidate.
    refit='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
clf.fit(x_train, y_train)

In [14]:
# Report the winning candidate of the grid search: the fitted estimator and its parameters.
for label, value in (
    ("Best estimators:", clf.best_estimator_),
    ("Best parameters:", clf.best_params_),
):
    print(label, value)

Best estimators: SVR(C=100, kernel='linear')
Best parameters: {'C': 100, 'kernel': 'linear'}


In [15]:
# Tabulate the grid-search results, one row per parameter combination; kept in df_res
# because the table is written to disk when the model is saved.
df_res = pd.DataFrame(clf.cv_results_)
df_res.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,...,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_neg_mean_absolute_percentage_error,split1_test_neg_mean_absolute_percentage_error,split2_test_neg_mean_absolute_percentage_error,split3_test_neg_mean_absolute_percentage_error,split4_test_neg_mean_absolute_percentage_error,mean_test_neg_mean_absolute_percentage_error,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error
0,16.601182,1.066084,3.335805,0.191175,10,linear,"{'C': 10, 'kernel': 'linear'}",-29048060.0,-31645060.0,-28435990.0,...,52.900955,2,-0.158632,-0.16566,-0.168191,-0.16832,-0.160639,-0.164288,0.003967,2
1,18.492278,1.353573,10.443836,1.246033,10,rbf,"{'C': 10, 'kernel': 'rbf'}",-105389100.0,-106916600.0,-109153000.0,...,147.700078,5,-0.383198,-0.391826,-0.413587,-0.425935,-0.393378,-0.401585,0.015733,5
2,20.273559,1.391635,5.162617,0.771104,10,sigmoid,"{'C': 10, 'kernel': 'sigmoid'}",-105390000.0,-106917600.0,-109153900.0,...,147.700866,6,-0.3832,-0.391827,-0.413588,-0.425937,-0.393379,-0.401586,0.015733,6
3,19.083862,0.651374,3.888237,0.22708,100,linear,"{'C': 100, 'kernel': 'linear'}",-27040300.0,-29454120.0,-26450550.0,...,59.052084,1,-0.150293,-0.157791,-0.157468,-0.158323,-0.151605,-0.155096,0.003422,1
4,20.260055,1.015853,11.50592,1.256951,100,rbf,"{'C': 100, 'kernel': 'rbf'}",-105378500.0,-106906100.0,-109142200.0,...,147.691208,3,-0.383184,-0.391809,-0.41357,-0.425918,-0.393356,-0.401567,0.015733,3


## Train with best params

In [16]:
# Take the final model straight from the grid search instead of hard-coding its
# hyper-parameters. Because the search was created with refit='neg_root_mean_squared_error',
# clf.best_estimator_ is already an SVR re-fitted on the full (x_train, y_train) with the
# best parameters, so this cannot drift from the search result and avoids fitting twice.
svm = clf.best_estimator_

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Error metrics of the fitted model on the data it was trained on.
train_predict = svm.predict(x_train)
train_mse = mean_squared_error(y_train, train_predict)
print("Train MSE:", train_mse)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 (scheduled for removal in 1.6), while np.sqrt works on
# every version.
print("Train RMSE:", np.sqrt(train_mse))
print("Train MAE:", mean_absolute_error(y_train, train_predict))
print("Train MAPE:", mean_absolute_percentage_error(y_train, train_predict))

Train MSE: 27358230.630252305
Train RMSE: 5230.509595656269
Train MAE: 3912.7696164768167
Train MAPE: 0.1529642073171672


In [20]:
# Pair each learned weight with its feature name (data_train holds only feature columns
# at this point), then list the ten largest weights.
coef = pd.Series(svm.coef_[0], index=data_train.columns)
largest_first = coef.sort_values(ascending=False)
print("top 10 features with the highest coefficients:\n", largest_first.head(10))

top 10 features with the highest coefficients:
 day_Saturday                   4631.137542
previous_5_to_10MA             3883.169283
home_team_avg_att_last_year    2984.492213
team1_name_CHC                 2795.861546
team1_name_NYY                 2600.000000
team2_pre_win                  2175.519903
team1_name_LAD                 2130.073310
day_Friday                     2095.701465
team2_name_LAD                 2073.679533
team2_name_LAA                 2055.681456
dtype: float64


In [21]:
# Same descending ordering as for the highest weights; its last ten entries are the most
# negative coefficients.
largest_first = coef.sort_values(ascending=False)
print("top 10 features with the lowest coefficients:\n", largest_first.tail(10))

top 10 features with the lowest coefficients:
 team2_name_PIT        -1734.674188
season_type_regular   -1853.855051
team2_name_BAL        -1935.163302
day_Wednesday         -2009.660221
team1_pre_win_pct     -2010.727581
month_September       -2036.526928
team2_name_MIA        -2072.689347
team2_name_OAK        -2100.000000
day_Tuesday           -2387.366766
day_Monday            -2641.585925
dtype: float64


In [22]:
# Rank features by weight magnitude regardless of sign and show the ten largest.
magnitude_ranked = coef.abs().sort_values(ascending=False)
print("top 10 features with the highest coefficients with absolute value:\n", magnitude_ranked.head(10))

top 10 features with the highest coefficients with absolute value:
 day_Saturday                   4631.137542
previous_5_to_10MA             3883.169283
home_team_avg_att_last_year    2984.492213
team1_name_CHC                 2795.861546
day_Monday                     2641.585925
team1_name_NYY                 2600.000000
day_Tuesday                    2387.366766
team2_pre_win                  2175.519903
team1_name_LAD                 2130.073310
team2_name_OAK                 2100.000000
dtype: float64


## Testing set

In [23]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Error metrics of the fitted model on the held-out test set.
y_pred = svm.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 (scheduled for removal in 1.6), while np.sqrt works on
# every version.
print("Test RMSE:", np.sqrt(test_mse))
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_pred))

Test MSE: 42543736.29166186
Test RMSE: 6522.555963091605
Test MAE: 5027.778102072924
Test MAPE: 0.23650583636970252


## Save model

In [24]:
import os
import pickle
from datetime import datetime

# A timestamp in the file names keeps the artefacts of different runs apart.
# NOTE(review): the name `time` shadows the stdlib module of the same name if it is ever
# imported in this notebook; kept unchanged here so existing references still work.
time = datetime.today().strftime('%Y%m%d_%H%M%S')

# Create the output directory on first use instead of failing with FileNotFoundError.
os.makedirs('./model', exist_ok=True)

# Persist the fitted model; the context manager closes the file handle even if dump()
# raises, rather than leaving it open until garbage collection.
filename = f'./model/svm_model_{time}.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svm, model_file)

# Store the grid-search results table next to the model under the same timestamp.
cv_filename = f'./model/svm_model_{time}_cv.csv'
df_res.to_csv(cv_filename, index=False)

# To reload later (only unpickle files you trust - pickle can execute arbitrary code):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)