In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the games table into a 2-D float array; the CSV's first row is a
# header line, so it is skipped rather than parsed.
with open('./data/nba_games.csv', 'r') as csv_file:
    temp = np.genfromtxt(csv_file, delimiter=',', skip_header=1)

In [2]:
# Column 0 of the loaded array is the label vector; every remaining column
# belongs to the feature matrix.
y, X = temp[:, 0], temp[:, 1:]

In [3]:
# split training and testing data
from sklearn.model_selection import train_test_split
# 70/30 train/test split. The seed is pinned so that a fresh
# "Restart Kernel -> Run All" produces the same partition, and therefore the
# same downstream confusion matrix and accuracy, every time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0
)
# Sanity-check the shapes of the four resulting arrays.
print("Size of training feature matrix: "+str(X_train.shape))
print("Size of testing feature matrix: "+str(X_test.shape))
print("Size of training label vector: "+str(y_train.shape))
print("Size of testing label vector: "+str(y_test.shape))

Size of training feature matrix: (14305, 24)
Size of testing feature matrix: (6131, 24)
Size of training label vector: (14305,)
Size of testing label vector: (6131,)


In [4]:
# `helper` is a project module not shown in this notebook. Judging by the
# displayed result, getColDict() returns a dict mapping each feature-column
# name to its integer column index (0-23), which later cells use to index
# the columns of the feature matrices.
import helper as h
col_dict = h.getColDict()
col_dict  # bare last expression so the notebook displays the mapping

{'FGM_HOME': 0,
 'FGA_HOME': 1,
 'FG3M_HOME': 2,
 'FTM_HOME': 3,
 'FTA_HOME': 4,
 'OREB_HOME': 5,
 'DREB_HOME': 6,
 'AST_HOME': 7,
 'STL_HOME': 8,
 'TOV_HOME': 9,
 'PF_HOME': 10,
 'PTS_HOME': 11,
 'FGM_AWAY': 12,
 'FGA_AWAY': 13,
 'FG3M_AWAY': 14,
 'FTM_AWAY': 15,
 'FTA_AWAY': 16,
 'OREB_AWAY': 17,
 'DREB_AWAY': 18,
 'AST_AWAY': 19,
 'STL_AWAY': 20,
 'TOV_AWAY': 21,
 'PF_AWAY': 22,
 'PTS_AWAY': 23}

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# feature selection
# The selected columns are named exactly once so the training and testing
# matrices are always sliced with the same features in the same order;
# previously the list was duplicated and the two copies could drift apart.
pts_feature_names = ['FGA_HOME', 'DREB_HOME', 'PTS_HOME',
                     'FGA_AWAY', 'OREB_AWAY', 'PTS_AWAY']
# Translate column names to integer positions via the name -> index mapping.
pts_feature_idx = [col_dict[name] for name in pts_feature_names]

# Integer-list indexing on axis 1 keeps all rows and only the chosen columns.
X_train_pts = X_train[:, pts_feature_idx]
X_test_pts = X_test[:, pts_feature_idx]

In [6]:
# Peek at the selected training features (six columns per row).
X_train_pts

array([[ 84.90243902,  34.29268293,  98.82926829,  83.41463415,
         11.73170732, 106.82926829],
       [ 79.41463415,  31.34146341, 107.6097561 ,  81.56097561,
         10.65853659,  98.82926829],
       [ 87.2195122 ,  29.80487805, 102.07317073,  79.90243902,
         11.90243902,  96.6097561 ],
       ...,
       [ 79.36585366,  29.68292683,  94.58536585,  83.12195122,
         10.92682927,  97.29268293],
       [ 76.95121951,  28.31707317,  93.6097561 ,  79.90243902,
         11.90243902,  96.6097561 ],
       [ 83.14634146,  29.53658537, 102.02439024,  80.31707317,
         12.        ,  95.2195122 ]])

In [7]:
# Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

# fit() hands back the fitted estimator itself, so construction and training
# can be chained; random_state=0 keeps the tree's tie-breaking deterministic.
clf = DecisionTreeClassifier(random_state=0).fit(X_train_pts, y_train)

# Predicted labels for the held-out rows.
y_hat = clf.predict(X_test_pts)

In [8]:
# Determine DT performance
from sklearn.metrics import confusion_matrix

# Mean accuracy of the fitted tree on the held-out split.
acc = clf.score(X_test_pts, y_test)
# Confusion matrix: rows are true labels, columns are predicted labels.
Cmat = confusion_matrix(y_test, y_hat)

print("Confusion Matrix: ")
print(Cmat)
# Report accuracy as a percentage with two decimal places.
print(f"Accuracy: {acc * 100:.2f}%")

Confusion Matrix: 
[[1337 1180]
 [1366 2248]]
Accuracy: 58.47%
