## SVM with GridSearchCV

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings


In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('data5.csv')

In [4]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

X1 = df.drop(["game_date_est", "season", "game_id", "home_team", "visitor_team", "home_team_id", "visitor_team_id",
              "home_team_wins", "conference", "conference_visitor"], axis=1)
y1 = df["home_team_wins"]

# Split our data
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [9]:
# Support vector classifier
model = SVC()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train, y_train)
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)



In [10]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names-target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 610.7536923885345 seconds
              precision    recall  f1-score   support

           0       0.61      0.49      0.54      2501
           1       0.69      0.78      0.73      3591

    accuracy                           0.66      6092
   macro avg       0.65      0.63      0.64      6092
weighted avg       0.65      0.66      0.65      6092

val score: 0.6438730853391685
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
best score: 0.6537155275653073
test score 0.6605384110308602


## Pipeline & Randomized PCA

In [5]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA as RandomizedPCA
from sklearn.pipeline import make_pipeline

In [6]:
pca = RandomizedPCA(whiten=True)
svc = SVC(class_weight='balanced')
model = make_pipeline(pca, svc)

In [7]:
model.fit(X_train, y_train)

Pipeline(steps=[('pca', PCA(whiten=True)),
                ('svc', SVC(class_weight='balanced'))])

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = {'pca__n_components': [5, 20],
              'svc__kernel': ['linear', 'rbf'],
              'svc__C': [0.1, 1, 10, 100],
              'svc__gamma': [0.00025, 0.0005, 0.001]}
grid = GridSearchCV(model, param_grid)

%time grid.fit(X_train, y_train)
print(grid.best_params_)

CPU times: user 37min 56s, sys: 1min 7s, total: 39min 3s
Wall time: 37min 43s
{'pca__n_components': 20, 'svc__C': 100, 'svc__gamma': 0.0005, 'svc__kernel': 'rbf'}


In [9]:
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)

In [11]:
start_time = time.time()

target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.00010919570922851562 seconds
              precision    recall  f1-score   support

   home_loss       0.58      0.67      0.62      2513
    home_win       0.74      0.66      0.70      3579

    accuracy                           0.67      6092
   macro avg       0.66      0.67      0.66      6092
weighted avg       0.68      0.67      0.67      6092

val score: 0.6570021881838074
{'pca__n_components': 20, 'svc__C': 100, 'svc__gamma': 0.0005, 'svc__kernel': 'rbf'}
best score: 0.6779306192655181
test score 0.6657912015758372
