In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

In [9]:
# Names of the per-model prediction files (./<name>.csv) that are stacked into
# one feature table. Each name also becomes the suffix of that model's columns.
all_csvs = ['log_reg', 'Random_Forests', 'SVC-GridSearch', 'SVC']

# Columns dropped from every per-model file before it is used as features.
unused_columns = ['date', 'player_id', 'gamePk', 'num_shots', 'pred_over', 'pred_under', 'target']

model_frames = []
for csv in all_csvs:
    new_df = pd.read_csv("./{}.csv".format(csv))
    # NOTE(review): the previous version parsed `date` and called
    # sort_values(by=['date']) without keeping the result, so the rows were never
    # actually reordered. The files are joined positionally (by row index) below,
    # so the on-disk order is kept as is; if a chronological order is required,
    # sort every file by a shared key before this step.
    new_df = new_df.drop(unused_columns, axis=1)

    # Normalise the two probabilities so they sum to 1. The denominator is
    # computed once, up front: dividing proba_under by a sum that already
    # contains the normalised proba_over would give a wrong value.
    proba_total = new_df["proba_over"] + new_df["proba_under"]
    new_df["proba_over"] = new_df["proba_over"] / proba_total
    new_df["proba_under"] = new_df["proba_under"] / proba_total
    new_df["pred"] = (new_df["proba_over"] > new_df["proba_under"]).astype(int)

    # Suffix every column with the model name so the frames can sit side by side.
    new_df.columns = [str(col) + '_{}'.format(csv) for col in new_df.columns]
    model_frames.append(new_df)

# Concatenate once (column-wise, aligned on the row index) instead of growing the
# frame inside the loop, then discard rows that are incomplete for any model.
df = pd.concat(model_frames, axis=1).dropna()

# The label and the bookmaker odds are repeated in every file; remove all the
# suffixed copies from the features and re-add a single un-suffixed odds pair.
label_and_odds_columns = [
    '{}_{}'.format(prefix, name)
    for prefix in ('answer', 'odds_under', 'odds_over')
    for name in all_csvs
]
X = df.drop(label_and_odds_columns, axis=1)
X["odds_over"] = df['odds_over_SVC']
X["odds_under"] = df['odds_under_SVC']
y = pd.DataFrame({"answer": df['answer_SVC']})

In [10]:
# Positional hold-out split (no shuffling): the first TRAIN_ROWS rows are used
# for fitting, every remaining row is kept for evaluation.
TRAIN_ROWS = 3500
X_train_raw, X_test_raw = X.iloc[:TRAIN_ROWS], X.iloc[TRAIN_ROWS:]
y_train_raw, y_test_raw = y.iloc[:TRAIN_ROWS], y.iloc[TRAIN_ROWS:]

# Standardise features using statistics learned from the training rows only,
# then apply the same transformation to both partitions.
sc = StandardScaler()
sc.fit(X_train_raw)
X_train = sc.transform(X_train_raw)
X_test = sc.transform(X_test_raw)

SVM Classifier

In [11]:
# RBF-kernel SVM stacked on top of the per-model predictions.
# random_state is fixed because probability=True fits an internal
# cross-validated calibration that is otherwise not reproducible between runs.
clf = svm.SVC(kernel='rbf', gamma=0.001, C=1, probability=True, random_state=22)

# Fit on the 1-D label column: passing the single-column DataFrame makes
# scikit-learn warn that a column vector was given where a 1-D array is expected.
clf.fit(X_train, y_train_raw["answer"])
pred_clf = clf.predict(X_test)
pred_clf_prob = clf.predict_proba(X_test)

# Collect the test features, the true label and this model's outputs in one
# table. pd.concat already returns a new frame, so the inputs are not modified.
save_df = pd.concat([X_test_raw, y_test_raw], axis=1)
save_df["pred_my"] = pred_clf
# predict_proba columns follow clf.classes_ (sorted labels): column 0 is the
# probability of label 0 ("under"), column 1 the probability of label 1 ("over").
save_df["pred_under_my"] = pred_clf_prob[:, 0]
save_df["pred_over_my"] = pred_clf_prob[:, 1]

# win_<model> is 1 when that model's prediction matches the true answer.
for csv in all_csvs + ["my"]:
    save_df["win_{}".format(csv)] = (save_df["pred_{}".format(csv)] == save_df["answer"]).astype(int)
# Baseline derived from the odds alone: predict 1 whenever odds_over <= odds_under.
save_df["win_odds"] = ((save_df["odds_over"] <= save_df["odds_under"]) == save_df["answer"]).astype(int)

# Optional export (relative path so the notebook is not tied to one machine):
#save_df.to_csv('my.csv', index=False, sep=";")

print(classification_report(y_test_raw, pred_clf))
print(confusion_matrix(y_test_raw, pred_clf))
print(accuracy_score(y_test_raw, pred_clf))

  return f(*args, **kwargs)
              precision    recall  f1-score   support

           0       0.57      0.63      0.60       638
           1       0.60      0.54      0.57       663

    accuracy                           0.58      1301
   macro avg       0.58      0.58      0.58      1301
weighted avg       0.58      0.58      0.58      1301

[[399 239]
 [304 359]]
0.5826287471176018


In [12]:
# Candidate hyper-parameters for the RBF SVM (4 x 4 = 16 combinations).
param_grid = { 'C': [0.1, 1, 10, 100],
                'gamma': [1, 0.1, 0.01, 0.001],
                'kernel': ['rbf']}

# gamma='auto' is only the starting value; the search overrides it per candidate.
clf2 = svm.SVC(gamma='auto')
# RandomizedSearchCV evaluates n_iter (default 10) sampled combinations, not the
# full grid, so random_state is fixed to make the sampled candidates - and
# therefore best_params_ - reproducible across runs.
rf_Grid = RandomizedSearchCV(estimator = clf2, param_distributions = param_grid, cv = 5, verbose=2, n_jobs=5,
                             random_state=22)
# Fit on the 1-D label column to avoid scikit-learn's column-vector warning.
rf_Grid.fit(X_train, y_train_raw["answer"])
print(rf_Grid.best_params_)
print(rf_Grid.score(X_test,y_test_raw))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
  return f(*args, **kwargs)
{'kernel': 'rbf', 'gamma': 0.01, 'C': 0.1}
0.5818601076095311
