In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
# import roc_auc_score
from sklearn.metrics import roc_auc_score


import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

In [2]:
# Merge the per-model prediction files into one table keyed by `uniqueId`,
# then split it into the feature matrix `X` and the label frame `y`.

# One CSV per upstream model; the file stem doubles as the suffix that is
# appended to that model's own columns.
all_csvs = ['log_reg', 'Random_Forests', 'SVC-GridSearch', 'SVC']

# Columns that are not model-specific: they keep their plain name in every
# file and are de-duplicated after the merge.
shared_cols = ['uniqueId', 'player_id', 'gamePk', 'date', 'odds_under',
               'odds_over', 'num_shots', 'answer', 'target']

frames = []
for model_name in all_csvs:
    new_df = pd.read_csv("./{}.csv".format(model_name), sep=";")
    new_df['date'] = pd.to_datetime(new_df['date'])
    # Row key: date, player, game and target joined with "-".
    new_df['uniqueId'] = (
        new_df['date'].astype(str) + "-"
        + new_df['player_id'].astype(str) + "-"
        + new_df['gamePk'].astype(str) + "-"
        + new_df['target'].astype(str)
    )
    # The key starts with the date string, so this sort orders rows by date
    # first; the positional train/test split later relies on that order.
    new_df = new_df.sort_values(by=['uniqueId'])
    new_df.columns = [
        str(col) if str(col) in shared_cols else str(col) + '_{}'.format(model_name)
        for col in new_df.columns
    ]
    frames.append(new_df.set_index('uniqueId'))

# Single column-wise concat aligned on uniqueId (outer join). Collecting the
# frames first avoids seeding the result with a placeholder row and
# re-copying the growing frame on every iteration.
df = pd.concat(frames, axis=1)
# Drop rows with any missing value, which includes ids absent from a file.
df = df.dropna()
# Shared columns now appear once per file; keep the first copy of each.
df = df.loc[:, ~df.columns.duplicated()]
# The outer join can upcast integer columns to float when ids do not line up,
# so restore the integer dtypes explicitly.
df = df.astype({'player_id': int, 'gamePk': int, 'num_shots': int, 'answer': int})

# Features: everything except the label, the odds, the shot count, the
# identifiers and the date.
X = df.drop(['answer', 'odds_under', 'odds_over', 'num_shots', 'player_id', 'gamePk', 'date'], axis=1)
y = pd.DataFrame({"answer": df['answer']})

In [3]:
# Positional hold-out instead of a shuffled split: the first 3500 rows train
# the model and the remaining rows evaluate it.
# Disabled alternative:
#   train_test_split(X, y, test_size = 0.2, random_state = 22)
n_train = 3500
X_train, X_test = X.iloc[:n_train], X.iloc[n_train:]
y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

SVM Classifier

In [4]:
# Fit the SVM on the scaled training rows, export per-row predictions for the
# hold-out rows, and print the evaluation metrics.

# Alternative setting kept for reference ("Optimized"): C=1, gamma=0.001
# random_state pins the internal shuffling used for the probability
# estimates, so predict_proba is reproducible between runs.
clf = svm.SVC(C=0.525, gamma=0.0525, probability=True, random_state=22) # Generalized

clf.fit(X_train, y_train.values.ravel())
pred_clf = clf.predict(X_test)
pred_clf_prob = clf.predict_proba(X_test)

# Hold-out rows of the merged table (same positional cut as the split above),
# copied so the new columns do not touch `df`.
save_df = df.iloc[3500:][['player_id', 'gamePk', 'date', 'odds_under', 'odds_over', 'num_shots', 'answer', 'target']].copy()
save_df["pred"] = pred_clf
# predict_proba columns follow clf.classes_ (sorted labels), so with 0/1
# labels column 0 is class 0 and column 1 is class 1.
# NOTE(review): assumes label 0 means "under" and 1 means "over" - confirm.
save_df["proba_under"] = pred_clf_prob[:, 0]
save_df["proba_over"] = pred_clf_prob[:, 1]

# Forward slash: works on every OS and avoids the invalid "\m" escape.
save_df.to_csv('./my_g.csv', index=False, sep=";")

print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))
print(accuracy_score(y_test, pred_clf))

# SVC has no feature_importances_ attribute, so none is printed here.

# ROC AUC is a ranking metric: score it with the class-1 probability rather
# than the hard 0/1 predictions, which would collapse the curve to one point.
print(roc_auc_score(y_test, pred_clf_prob[:, 1]))

              precision    recall  f1-score   support

           0       0.56      0.62      0.59       645
           1       0.58      0.53      0.56       656

    accuracy                           0.57      1301
   macro avg       0.57      0.57      0.57      1301
weighted avg       0.57      0.57      0.57      1301

[[397 248]
 [307 349]]
0.5734050730207533


AttributeError: 'SVC' object has no attribute 'feature_importances_'

In [None]:
# Small grid search over C and gamma for an SVC, scored by
# cross-validated accuracy on the training rows.
param_grid = dict(
    C=[0.575, 0.55, 0.525],
    gamma=[0.0555, 0.055, 0.0525],
)

clf2 = svm.SVC()
rf_Grid = GridSearchCV(
    estimator=clf2,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=7,
    verbose=2,
)
rf_Grid.fit(X_train, y_train.to_numpy().ravel())
pred_clf2 = rf_Grid.predict(X_test)

# Winning parameter combination, then its accuracy on the hold-out rows.
print(rf_Grid.best_params_)
print(rf_Grid.score(X_test, y_test))


Fitting 5 folds for each of 9 candidates, totalling 45 fits
{'C': 0.525, 'gamma': 0.0525}
0.5734050730207533
