## SVM Classifier

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings


In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season < 2016) & (df.season >= 2007)]
test_data = df.loc[df.season >= 2016]

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



### GridSearchCV

In [5]:
# Support vector classifier
model = SVC()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train, y_train)




In [6]:
preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))

model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 622.9696426391602 seconds
              precision    recall  f1-score   support

   home_loss       0.62      0.45      0.52      1935
    home_win       0.67      0.79      0.72      2648

    accuracy                           0.65      4583
   macro avg       0.64      0.62      0.62      4583
weighted avg       0.65      0.65      0.64      4583

val score: 0.6685950413223141
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6605797101449274
test score 0.6513200960069824


### SVM with RFECV

In [5]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [6]:
# Support vector classifier
model = SVC()
model.fit(X_train_rcv, y_train_rcv)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1)

In [7]:
start_time = time.time()
grid.fit(X_train_rcv, y_train_rcv)

preds = grid.predict(X_test_rcv)
test_score = grid.score(X_test_rcv, y_test_rcv)
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rcv, preds, target_names=target_names))

model = grid.best_estimator_
y_fit = model.predict(X_test_rcv)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 373.44867300987244 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.47      0.54      1935
    home_win       0.67      0.81      0.74      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.64      0.64      4583
weighted avg       0.66      0.67      0.65      4583

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6858369098712447
test score 0.6652847479816714


### SVM with Univariate Feature Selection

In [17]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train


X_test_uni = test_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_test_uni = y_test


In [18]:
svc = SVC()
svc.fit(X_train_uni, y_train_uni)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1)

In [19]:
start_time = time.time()
grid.fit(X_train_uni, y_train_uni)

preds = grid.predict(X_test_uni)
test_score = grid.score(X_test_uni, y_test_uni)

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_uni, preds, target_names=target_names))

model = grid.best_estimator_
y_fit = model.predict(X_test_uni)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 271.92281222343445 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.50      0.56      1935
    home_win       0.68      0.79      0.73      2648

    accuracy                           0.66      4583
   macro avg       0.66      0.64      0.64      4583
weighted avg       0.66      0.66      0.66      4583

val score: 0.7004132231404959
{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6857971014492754
test score 0.6637573641719398


### SVM with SelectFromModel(LassoCV)

In [20]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

In [21]:
pipe_model = Pipeline([
  ('feature_selection', SelectFromModel(LassoCV())),
  ('classification', SVC(C=1, gamma=0.0001, kernel='rbf'))
])
pipe_model.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LassoCV())),
                ('classification', SVC(C=1, gamma=0.0001))])

In [22]:
# Predict
preds = pipe_model.predict(X_test)
test_score = pipe_model.score(X_test, y_test)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 558.7029147148132 seconds
              precision    recall  f1-score   support

   home loss       0.61      0.46      0.53      1935
    home win       0.67      0.79      0.72      2648

    accuracy                           0.65      4583
   macro avg       0.64      0.63      0.63      4583
weighted avg       0.65      0.65      0.64      4583

test score 0.6506655029456688


#### Use lasso_data

In [23]:
X_train_lasso = train_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train


X_test_lasso = test_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_lasso = y_test

In [24]:
svc = SVC()
svc.fit(X_train_lasso, y_train_lasso)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1)

In [25]:
start_time = time.time()
grid.fit(X_train_lasso, y_train_lasso)

preds = grid.predict(X_test_lasso)
test_score = grid.score(X_test_lasso, y_test_lasso)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))

model = grid.best_estimator_
y_fit = model.predict(X_test_lasso)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 190.10071921348572 seconds
              precision    recall  f1-score   support

   home loss       0.61      0.46      0.53      1935
    home win       0.67      0.79      0.72      2648

    accuracy                           0.65      4583
   macro avg       0.64      0.62      0.62      4583
weighted avg       0.64      0.65      0.64      4583

val score: 0.684297520661157
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6702898550724637
test score 0.6489199214488326
