In [57]:
# Hyper parameter tuning
file_name = "Rolling15Games.csv"
max_features = 12

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score
import itertools

# for easier reading np
np.set_printoptions(precision=3,suppress=True)
with open(f'../../data/{file_name}', 'r') as f: 
  temp = np.genfromtxt(f,delimiter=',', skip_header = 1)

In [3]:
#split feature matrix and label vector
X = temp[:, 1:]
y = temp[:, 0]

In [4]:
# split training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
print("Size of training feature matrix: "+str(X_train.shape))
print("Size of testing feature matrix: "+str(X_test.shape))
print("Size of training label vector: "+str(y_train.shape))
print("Size of testing label vector: "+str(y_test.shape))

Size of training feature matrix: (8161, 24)
Size of testing feature matrix: (3498, 24)
Size of training label vector: (8161,)
Size of testing label vector: (3498,)


In [42]:
# Scaling Data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
col_dict = {'FGM_HOME': 0, 'FGA_HOME': 1, 'FG3M_HOME': 2, 'FTM_HOME': 3, 'FTA_HOME': 4, 'OREB_HOME': 5, 'DREB_HOME': 6,
            'AST_HOME': 7, 'STL_HOME': 8, 'TOV_HOME': 9, 'PF_HOME': 10, 'PTS_HOME': 11, 'FGM_AWAY': 12, 'FGA_AWAY': 13,
            'FG3M_AWAY': 14, 'FTM_AWAY': 15, 'FTA_AWAY': 16, 'OREB_AWAY': 17, 'DREB_AWAY': 18, 'AST_AWAY': 19,
            'STL_AWAY': 20, 'TOV_AWAY': 21, 'PF_AWAY': 22, 'PTS_AWAY': 23}

In [6]:
X_train_pts = X_train[:, [ col_dict['FGA_HOME'], col_dict['DREB_HOME'], col_dict['PTS_HOME'], 
                          col_dict['FGA_AWAY'], col_dict['OREB_AWAY'], col_dict['PTS_AWAY']]]
X_test_pts = X_test[:, [col_dict['FGA_HOME'], col_dict['DREB_HOME'], col_dict['PTS_HOME'], 
                          col_dict['FGA_AWAY'], col_dict['OREB_AWAY'], col_dict['PTS_AWAY']]]

In [7]:
clf = LogisticRegression(fit_intercept=True).fit(X_train_pts, y_train)
y_pred = clf.predict(X_test_pts)
print('Accuracy of logistic regression classifier on test set: {:.3f}'.format(accuracy_score(y_pred, y_test)))
cmat = confusion_matrix(y_test, y_pred)
print(cmat)

Accuracy of logistic regression classifier on test set: 0.615
[[ 532  936]
 [ 412 1618]]


In [60]:
# Two Potential Features
max_acc = 0
max_comb = ()


for num_features in range(1,max_features+1):
    for comb in itertools.combinations(range(12), num_features):
        iter_indices = list(comb)
        iter_indices += [i+12 for i in iter_indices]
        X_train_pair = X_train[:,iter_indices]
        X_test_pair = X_test[:,iter_indices]
        clf = LogisticRegression(fit_intercept=True).fit(X_train_pair, y_train)
        y_pred = clf.predict(X_test_pair)
        iter_acc = accuracy_score(y_pred, y_test)
        
        if iter_acc > max_acc:
            max_comb = comb
            max_acc = iter_acc

In [61]:
print("Max Accuracy", max_acc)
print("Indices", max_comb)
print("Features",[list(col_dict.keys())[i] for i in max_comb])

Max Accuracy 0.6449399656946827
Indices (1, 4, 5, 6, 7, 8, 9, 11)
Features ['FGA_HOME', 'FTA_HOME', 'OREB_HOME', 'DREB_HOME', 'AST_HOME', 'STL_HOME', 'TOV_HOME', 'PTS_HOME']
