## SVM Classifier

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings


In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



### GridSearchCV

In [5]:
# Support vector classifier
model = SVC()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train, y_train)
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_val)
test_score = grid.score(X_val, y_val)



In [6]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 607.7583913803101 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.45      0.53      1011
    home_win       0.68      0.82      0.74      1409

    accuracy                           0.67      2420
   macro avg       0.66      0.64      0.64      2420
weighted avg       0.67      0.67      0.66      2420

val score: 0.6685950413223141
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6605797101449274
test score 0.6685950413223141


### RandomizedSearchCV

In [7]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
              }

clf = RandomizedSearchCV(
    SVC(kernel="rbf", class_weight="balanced"), param_grid, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)

In [8]:
start_time = time.time()
clf = clf.fit(X_train, y_train)
val_score = clf.score(X_val, y_val)

preds = clf.predict(X_val)
test_score = clf.score(X_val, y_val)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

y_fit = clf.predict(X_val)

print("test score", test_score)

Συνολικός χρόνος fit και predict: 239.0733585357666 seconds
              precision    recall  f1-score   support

   home loss       0.58      0.64      0.61      1011
    home win       0.72      0.66      0.69      1409

    accuracy                           0.65      2420
   macro avg       0.65      0.65      0.65      2420
weighted avg       0.66      0.65      0.66      2420

val score: 0.6545454545454545
test score 0.6545454545454545


In [9]:
print(clf.best_params_)
print("best score:", clf.best_score_)

{'gamma': 0.0001, 'C': 1}
best score: 0.6456521739130435


### SVM with Recursive Feature Elimination (RFE)

In [10]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [11]:
X_train_rfe = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_train_rfe = y_train

X_val_rfe = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_val_rfe = y_val

In [12]:
svc = SVC()
svc.fit(X_train_rfe, y_train_rfe)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [13]:
start_time = time.time()
grid.fit(X_train_rfe, y_train_rfe)
val_score = grid.score(X_val_rfe, y_val_rfe)

preds = grid.predict(X_val_rfe)
test_score = grid.score(X_val_rfe, y_val_rfe)

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rfe, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_rfe)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 321.3501079082489 seconds
              precision    recall  f1-score   support

   home_loss       0.66      0.48      0.55      1011
    home_win       0.69      0.82      0.75      1409

    accuracy                           0.68      2420
   macro avg       0.67      0.65      0.65      2420
weighted avg       0.67      0.68      0.67      2420

val score: 0.6776859504132231
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6643478260869565
test score 0.6776859504132231


### SVM with RFECV

In [50]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_val_rcv = valid_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_val_rcv = y_val

In [51]:
# Support vector classifier
model = SVC()
model.fit(X_train_rcv, y_train_rcv)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [52]:
start_time = time.time()
grid.fit(X_train_rcv, y_train_rcv)
val_score = grid.score(X_val_rcv, y_val_rcv)

preds = grid.predict(X_val_rcv)
test_score = grid.score(X_val_rcv, y_val_rcv)
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rcv, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_rcv)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 162.3340916633606 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.47      0.55      1011
    home_win       0.69      0.84      0.75      1409

    accuracy                           0.68      2420
   macro avg       0.68      0.65      0.65      2420
weighted avg       0.68      0.68      0.67      2420

val score: 0.6830578512396694
{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6852173913043479
test score 0.6830578512396694


### SVM with Univariate Feature Selection

In [16]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train

X_val_uni = valid_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_val_uni = y_val

In [17]:
svc = SVC()
svc.fit(X_train_uni, y_train_uni)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [18]:
start_time = time.time()
grid.fit(X_train_uni, y_train_uni)
val_score = grid.score(X_val_uni, y_val_uni)

preds = grid.predict(X_val_uni)
test_score = grid.score(X_val_uni, y_val_uni)

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_uni, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_uni)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 279.16617608070374 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.54      0.60      1011
    home_win       0.71      0.82      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.7004132231404959
{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6857971014492754
test score 0.7004132231404959


### SVM with ExtraTreesClassifier

In [19]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train

X_val_extra = valid_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_val_extra = y_val

In [29]:
svc = SVC()
svc.fit(X_train_extra, y_train_extra)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [30]:
start_time = time.time()
grid.fit(X_train_extra, y_train_extra)
val_score = grid.score(X_val_extra, y_val_extra)

preds = grid.predict(X_val_extra)
test_score = grid.score(X_val_extra, y_val_extra)

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_extra, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_extra)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 221.49125576019287 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.48      0.55      1011
    home_win       0.69      0.82      0.75      1409

    accuracy                           0.68      2420
   macro avg       0.67      0.65      0.65      2420
weighted avg       0.67      0.68      0.67      2420

val score: 0.6764462809917355
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6627536231884058
test score 0.6764462809917355


### SVM with PCA

In [31]:
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [32]:
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_val_standard = scaler.transform(X_val)
X_test_standard = scaler.transform(X_test)

In [33]:
n_components = 30
pca = PCA(n_components=n_components).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

#### GridSearch

In [34]:
svc = SVC()
svc.fit(X_train_pca, y_train)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [35]:
start_time = time.time()
grid.fit(X_train_pca, y_train)
val_score = grid.score(X_val_pca, y_val)

preds = grid.predict(X_val_pca)
test_score = grid.score(X_val_pca, y_val)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_pca)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 104.88466715812683 seconds
              precision    recall  f1-score   support

   home loss       0.61      0.67      0.64      1011
    home win       0.75      0.69      0.72      1409

    accuracy                           0.68      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.68      0.68      2420

val score: 0.6822314049586777
{'gamma': 0.0001, 'C': 10}
best score: 0.6707246376811595
test score 0.6822314049586777


#### Randomized Search

In [27]:
# Train a SVM classification model

print("Fitting the classifier to the training set")

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
              }

clf = RandomizedSearchCV(
    SVC(kernel="rbf", class_weight="balanced"), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    


Fitting the classifier to the training set


In [28]:
start_time = time.time()
clf = clf.fit(X_train_pca, y_train)
val_score = clf.score(X_val_pca, y_val)

preds = clf.predict(X_val_pca)
test_score = clf.score(X_val_pca, y_val)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

y_fit = clf.predict(X_val_pca)

print(clf.best_params_)
print("best score:", clf.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 79.63408780097961 seconds
              precision    recall  f1-score   support

   home loss       0.61      0.67      0.64      1011
    home win       0.74      0.69      0.72      1409

    accuracy                           0.68      2420
   macro avg       0.67      0.68      0.68      2420
weighted avg       0.69      0.68      0.68      2420

val score: 0.6805785123966942
{'gamma': 0.0001, 'C': 1}
best score: 0.67
test score 0.6805785123966942


### SVM with SelectFromModel(LassoCV)

In [36]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

In [37]:
pipe_model = Pipeline([
  ('feature_selection', SelectFromModel(LassoCV())),
  ('classification', SVC(C=1, gamma=0.0001, kernel='rbf'))
])
pipe_model.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LassoCV())),
                ('classification', SVC(C=1, gamma=0.0001))])

In [38]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 196.7856888771057 seconds
              precision    recall  f1-score   support

   home loss       0.72      0.43      0.54      1011
    home win       0.68      0.88      0.77      1409

    accuracy                           0.69      2420
   macro avg       0.70      0.66      0.66      2420
weighted avg       0.70      0.69      0.68      2420

test score 0.6942148760330579


#### Use lasso_data

In [39]:
X_train_lasso = train_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train

X_val_lasso = valid_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_val_lasso = y_val

In [40]:
svc = SVC()
svc.fit(X_train_lasso, y_train_lasso)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [41]:
start_time = time.time()
grid.fit(X_train_lasso, y_train_lasso)
val_score = grid.score(X_val_lasso, y_val_lasso)

preds = grid.predict(X_val_lasso)
test_score = grid.score(X_val_lasso, y_val_lasso)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_lasso, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_lasso)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 76.47707533836365 seconds
              precision    recall  f1-score   support

   home loss       0.52      0.37      0.43      1011
    home win       0.62      0.75      0.68      1409

    accuracy                           0.59      2420
   macro avg       0.57      0.56      0.56      2420
weighted avg       0.58      0.59      0.58      2420

val score: 0.5913223140495868
{'gamma': 0.001, 'C': 100}
best score: 0.6065217391304348
test score 0.5913223140495868


### SVM with Sequential Feature Selection

#### Forward

In [42]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train

X_val_for_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_val_for_sfs = y_val


In [43]:
svc = SVC()
svc.fit(X_train_for_sfs, y_train_for_sfs)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [46]:
start_time = time.time()
grid.fit(X_train_for_sfs, y_train_for_sfs)
val_score = grid.score(X_val_for_sfs, y_val_for_sfs)

preds = grid.predict(X_val_for_sfs)
test_score = grid.score(X_val_for_sfs, y_val_for_sfs)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_for_sfs, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_for_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 290.5786244869232 seconds
              precision    recall  f1-score   support

   home loss       0.65      0.46      0.54      1011
    home win       0.68      0.82      0.74      1409

    accuracy                           0.67      2420
   macro avg       0.66      0.64      0.64      2420
weighted avg       0.67      0.67      0.66      2420

val score: 0.6702479338842975
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6676811594202898
test score 0.6702479338842975


#### Backwards

In [47]:
X_train_back_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_train_back_sfs = y_train

X_val_back_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_val_back_sfs = y_val

In [48]:
svc = SVC()
svc.fit(X_train_back_sfs, y_train_back_sfs)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [49]:
start_time = time.time()
grid.fit(X_train_back_sfs, y_train_back_sfs)
val_score = grid.score(X_val_back_sfs, y_val_back_sfs)

preds = grid.predict(X_val_back_sfs)
test_score = grid.score(X_val_back_sfs, y_val_back_sfs)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_back_sfs, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_back_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 364.98614287376404 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.51      0.57      1011
    home win       0.70      0.81      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.66      0.66      2420
weighted avg       0.68      0.69      0.68      2420

val score: 0.6859504132231405
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6681159420289855
test score 0.6859504132231405
