## SVM with GridSearchCV

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings


In [3]:
warnings.filterwarnings('ignore')


In [4]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [5]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

X1 = df.drop(["game_date_est", "season", "game_id", "home_team", "visitor_team", "home_team_id", "visitor_team_id",
              "home_team_wins", "conference", "conference_visitor"], axis=1)
y1 = df["home_team_wins"]

# Split our data
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [7]:
# Support vector classifier
model = SVC()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train, y_train)
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)



In [11]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 931.3766624927521 seconds
              precision    recall  f1-score   support

   home_loss       0.73      0.68      0.70      2274
    home_win       0.79      0.82      0.81      3288

    accuracy                           0.76      5562
   macro avg       0.76      0.75      0.75      5562
weighted avg       0.76      0.76      0.76      5562

val score: 0.7771120431396046
{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.7549466839613901
test score 0.7642934196332255


## SVM with Recursive Feature Elimination (RFE)

In [21]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [22]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [23]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

X1 = df.drop(["game_date_est", "season", "game_id", "home_team", "visitor_team", "home_team_id", "visitor_team_id",
              "home_team_wins", "conference", "conference_visitor"], axis=1)
y1 = df["home_team_wins"]

# Split our data
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [38]:
svc = SVC(C=10, gamma=0.0001, kernel='rbf')

In [39]:
class MyPipeline(Pipeline):
    @property
    def coef_(self):
        return self._final_estimator.coef_
    @property
    def feature_importances_(self):
        return self._final_estimator.feature_importances_

In [41]:
pipe_model = MyPipeline([
    ('scaler', StandardScaler()),
    ('selector', RFECV(svc)),
    ('estimator', svc)
])

In [43]:
min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(svc, step=1, cv=5, scoring='accuracy', min_features_to_select=min_features_to_select)

rfecv.fit(X_train, y_train)


ValueError: when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [26]:
# Support Vector Machine
svc = SVC(feature_importances_)
svc.fit(X_train, y_train)

NameError: name 'feature_importances_' is not defined

In [25]:
rfe = RFE(svc)
rfe.fit(X_train, y_train)
rfe.get_support()

ValueError: when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [24]:
# Create the RFE object and compute a cross-validated score.
# The "accuracy" scoring shows the proportion of correct classifications

min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=StratifiedKFold(5),
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
)
rfecv.fit(X_train, y_train)

In [None]:
print("Optimal number of features : %d" % rfecv.n_features_)
print("Selected Features: %s" % rfecv.support_)

In [13]:
from sklearn.model_selection import GridSearchCV
param_grid = {'svc__kernel': ['linear', 'rbf'],
              'svc__C': [0.1, 1, 10, 100],
              'svc__gamma': [0.00025, 0.0005, 0.001]}
grid = GridSearchCV(rfecv, param_grid, cv=5)

%time grid.fit(X_train, y_train)
print(grid.best_params_)

ValueError: when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [27]:
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)

In [28]:
start_time = time.time()

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home_loss       1.00      1.00      1.00      2274
    home_win       1.00      1.00      1.00      3288

    accuracy                           1.00      5562
   macro avg       1.00      1.00      1.00      5562
weighted avg       1.00      1.00      1.00      5562

val score: 1.0
{'pca__n_components': 20, 'svc__C': 0.1, 'svc__gamma': 0.00025, 'svc__kernel': 'linear'}
best score: 1.0
test score 0.9998202085580726


## SVM with PCA

In [18]:
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
n_components = 30
pca = PCA(n_components=n_components).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

### Randomized Search

In [21]:
# Train a SVM classification model

print("Fitting the classifier to the training set")

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
              }

clf = RandomizedSearchCV(
    SVC(kernel="rbf", class_weight="balanced"), param_grid, n_iter=10)
    


Fitting the classifier to the training set


In [22]:
start_time = time.time()
clf = clf.fit(X_train_pca, y_train)
val_score = clf.score(X_val_pca, y_val)

preds = clf.predict(X_test_pca)
test_score = clf.score(X_test_pca, y_test)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

y_fit = clf.predict(X_test_pca)

print("test score", test_score)


Συνολικός χρόνος fit και predict: 262.88806414604187 seconds
              precision    recall  f1-score   support

   home loss       0.58      0.70      0.63      2312
    home win       0.75      0.63      0.69      3250

    accuracy                           0.66      5562
   macro avg       0.66      0.67      0.66      5562
weighted avg       0.68      0.66      0.66      5562

val score: 0.42061114439784303
test score 0.660913340524991


In [23]:
print(clf.best_params_)
print("best score:", clf.best_score_)
# print("test score", test_score)

{'gamma': 0.001, 'C': 10}
best score: 0.6687679755627804


### GridSearchCV

In [25]:
# Train a SVM classification model

print("Fitting the classifier to the training set")

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
              }

clf = GridSearchCV(
    SVC(kernel="rbf", class_weight="balanced"), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    


Fitting the classifier to the training set


In [26]:
start_time = time.time()
clf = clf.fit(X_train_pca, y_train)
val_score = clf.score(X_val_pca, y_val)

preds = clf.predict(X_test_pca)
test_score = clf.score(X_test_pca, y_test)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

y_fit = clf.predict(X_test_pca)

print("test score", test_score)

Συνολικός χρόνος fit και predict: 210.2338945865631 seconds
              precision    recall  f1-score   support

   home loss       0.58      0.69      0.63      2312
    home win       0.75      0.65      0.69      3250

    accuracy                           0.67      5562
   macro avg       0.66      0.67      0.66      5562
weighted avg       0.68      0.67      0.67      5562

val score: 0.42061114439784303
test score 0.6652283351312478


In [27]:
print(clf.best_params_)
print("best score:", clf.best_score_)

{'C': 0.1, 'gamma': 0.01}
best score: 0.6726655773236976
