In [54]:
import argparse
import copy
import csv
import glob
import os

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [52]:
def binData(statsDf, label, nbins=5):
    """Return a copy of ``statsDf`` with its feature columns discretised into bins.

    Every column except 'ID', 'Year', 'Player', 'Pos', 'Tm' and ``label`` is cut
    into ``nbins`` intervals with ``pd.cut`` and each value is replaced by the
    midpoint of the interval it falls in. The binned frame is intended as input
    for the decision-tree / random-forest runs. ``statsDf`` itself is not modified.

    Parameters
    ----------
    statsDf : pandas.DataFrame
        Frame to bin; the columns being binned must be accepted by ``pd.cut``.
    label : str
        Name of the target column, which is left as-is.
    nbins : int, default 5
        Number of bins handed to ``pd.cut``.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``statsDf`` with the feature columns replaced by bin midpoints.
    """
    binned = statsDf.copy(deep=True)
    passthrough = ['ID', 'Year', 'Player', 'Pos', 'Tm', label]
    # The list of columns to bin is fixed up front, before any column is overwritten.
    featureColumns = binned.drop(passthrough, axis=1, errors='ignore').columns
    for name in featureColumns:
        intervals = pd.cut(binned[name], nbins)
        binned[name] = intervals.apply(lambda interval: interval.mid)
    return binned

In [94]:
def runCVDtree(clf, X, y, k, scoring):
    """Cross-validate ``clf`` with ``k`` folds, print a summary and return the results.

    Two cross-validation passes are run: ``cross_val_predict`` produces pooled
    predictions, from which the predicted-class counts and the confusion matrix
    are printed, and ``cross_validate`` produces per-fold scores, whose means
    are printed.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_val_predict`` and ``cross_validate``.
    X, y : array-like
        Feature matrix and label vector.
    k : int or CV splitter
        Forwarded as ``cv`` to both cross-validation calls.
    scoring : str, list of str, or anything ``cross_validate`` accepts
        Metrics to evaluate. The mean of every ``test_*`` entry returned by
        ``cross_validate`` is reported, so the function is not limited to the
        precision/recall/F1 trio used elsewhere in this notebook.

    Returns
    -------
    dict
        ``'y_pred'`` (pooled predictions), ``'confusion_matrix'`` and
        ``'mean_scores'`` (metric name -> mean over folds, as ``float``).
    """
    # Pass 1: pooled predictions -> class counts and confusion matrix.
    y_pred = cross_val_predict(clf, X, y, cv=k)
    unique, counts = np.unique(y_pred, return_counts=True)
    conf_mat = confusion_matrix(y, y_pred)
    print(dict(zip(unique, counts)))
    print(conf_mat)

    # Pass 2: per-fold scores, averaged over the folds.
    scores = cross_validate(clf, X, y, cv=k, scoring=scoring)
    mean_scores = {
        key[len('test_'):]: float(np.mean(values))
        for key, values in scores.items()
        if key.startswith('test_')
    }

    # The three metrics this notebook normally uses keep their original labels and
    # order so the printed report is unchanged; any other metric follows, labelled
    # with its scorer name.
    display_names = {'precision_macro': 'Precision', 'recall_macro': 'Recall', 'f1': 'F1'}
    ordered = [m for m in display_names if m in mean_scores]
    ordered += [m for m in mean_scores if m not in display_names]

    header = 'Mean Scores Cross Validation'
    pieces = []
    for metric in ordered:
        pieces.append(header + '\n' + display_names.get(metric, metric) + ': ')
        pieces.append(mean_scores[metric])
        header = ''  # the header is only emitted once, fused with the first metric label
    if not pieces:
        pieces.append(header)
    pieces.append('\n\n')
    print(*pieces)

    return {'y_pred': y_pred, 'confusion_matrix': conf_mat, 'mean_scores': mean_scores}

In [28]:
def scaleData(playerDf, label):
    """Split a cleaned player frame into a standardised feature matrix and a label vector.

    Parameters
    ----------
    playerDf : pandas.DataFrame
        Cleaned data.
    label : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X_scaled, y)``: the result of ``StandardScaler().fit_transform`` on
        the float-cast feature columns, and the label values as a 1-D array.
    """
    # The boolean column mask yields a 2-D selection; it is flattened to 1-D here.
    labelBlock = playerDf.loc[:, playerDf.columns == label].values
    y = labelBlock.reshape((labelBlock.shape[0],))

    # Identifier columns and the label are not features.
    # NOTE(review): unlike the tree helpers in this notebook, 'Pos' is not dropped
    # here, so the float cast below needs it to be numeric or absent -- confirm.
    nonFeatures = ['ID', 'Year', 'Player', 'Tm', label]
    features = playerDf.drop(nonFeatures, axis=1, errors='ignore').astype('float')
    return StandardScaler().fit_transform(features), y

In [29]:
def performPca(scaledData, n_components=.95):
    """Fit a PCA on ``scaledData`` and return both the model and the projected data.

    Parameters
    ----------
    scaledData : array-like
        Scaled data matrix.
    n_components : int or float, default .95
        Forwarded to ``PCA``; the default is meant to keep 95% of the variance.

    Returns
    -------
    tuple
        ``(pca, principalDf)``: the fitted ``PCA`` object and a DataFrame of the
        transformed data whose columns are named 'PC-0', 'PC-1', ...
    """
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(scaledData)
    # One column label per retained component, numbered from zero.
    columnNames = [f'PC-{idx}' for idx in range(projected.shape[1])]
    return pca, pd.DataFrame(data=projected, columns=columnNames)

In [89]:
def runDTree(df, nDepthsToTry, label):
    """Cross-validate decision trees of increasing maximum depth on ``df``.

    Depths 5, 10, 15, ... are tried in steps of 5; ``nDepthsToTry`` is the
    exclusive upper bound of that sweep (25 gives depths 5 through 20). Each
    tree is evaluated by ``runCVDtree`` with 5-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Player data containing the ``label`` column.
    nDepthsToTry : int
        Exclusive upper bound for ``max_depth``.
    label : str
        Name of the target column.
    """
    metrics = ['precision_macro', 'recall_macro', 'f1']
    # Identifier columns and the label itself are excluded from the features.
    nonFeatures = ['ID', 'Year', 'Player', 'Pos', 'Tm', label]
    features = df.drop(nonFeatures, axis=1, errors='ignore').values
    targets = df[label].values
    for depth in range(5, nDepthsToTry, 5):
        print('Depth: ', depth)
        tree = DecisionTreeClassifier(
            random_state=0,
            max_depth=depth,
            max_features=None,
            splitter="best",
        )
        runCVDtree(tree, features, targets, 5, metrics)

In [45]:
def randomForest(df, nDeptsToTry, label, pca=False):
    """Cross-validate random forests of increasing maximum depth on ``df``.

    Depths 5, 10, 15, ... are tried in steps of 5; ``nDeptsToTry`` is the
    exclusive upper bound of that sweep. Each model is evaluated by
    ``runCVDtree`` with 5-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Player data containing the ``label`` column.
    nDeptsToTry : int
        Exclusive upper bound for ``max_depth`` (parameter name kept as-is for
        backward compatibility with existing callers).
    label : str
        Name of the target column.
    pca : bool, default False
        When true, the forest is preceded by standardisation and a PCA keeping
        95% of the variance (the same default as ``performPca``). Both steps are
        placed in a pipeline so they are re-fitted on the training part of every
        fold rather than on the full data set.
    """
    scoring = ['precision_macro', 'recall_macro', 'f1']
    X = df.drop(['ID', 'Year', 'Player', 'Pos', 'Tm', label], axis=1, errors='ignore').values
    y = df[label].values
    # The loop previously read an undefined name (`nDepthsToTry`) and built a
    # DecisionTreeClassifier, so it neither ran on a fresh kernel nor trained a forest.
    for d in range(5, nDeptsToTry, 5):
        print('Depth: ', d)
        clf = RandomForestClassifier(random_state=0, max_depth=d)
        if pca:
            clf = make_pipeline(StandardScaler(), PCA(n_components=.95), clf)
        runCVDtree(clf, X, y, 5, scoring)

In [79]:
# Directory holding the cleaned CSVs. It defaults to the original location and can be
# redirected with the PLAYER_DATA_DIR environment variable instead of editing paths.
DATA_DIR = os.environ.get('PLAYER_DATA_DIR', '~/MATH189/project/data')

# playerData is analysed with the 'MVP' label and playerDataAs with the 'AS' label below.
playerData = pd.read_csv(os.path.join(DATA_DIR, 'player_data_cleaned.csv'))
playerDataAs = pd.read_csv(os.path.join(DATA_DIR, 'player_data_cleaned_as.csv'))

In [93]:
# Decision-tree sweep on the raw MVP data; an upper bound of 25 gives depths 5, 10, 15, 20.
runDTree(df=playerData, nDepthsToTry=25, label='MVP')

Depth:  5
{0: 15780, 1: 37}
[[15751    27]
 [   29    10]]
Mean Scores Cross Validation
Precision:  0.6739955740517709 
Recall:  0.629501559248129 
F1:  0.28918336565395386 


Depth:  10
{0: 15765, 1: 52}
[[15738    40]
 [   27    12]]
Mean Scores Cross Validation
Precision:  0.6446748052586954 
Recall:  0.6523039010184467 
F1:  0.28179866110900587 


Depth:  15
{0: 15765, 1: 52}
[[15738    40]
 [   27    12]]
Mean Scores Cross Validation
Precision:  0.6446748052586954 
Recall:  0.6523039010184467 
F1:  0.28179866110900587 


Depth:  20
{0: 15765, 1: 52}
[[15738    40]
 [   27    12]]
Mean Scores Cross Validation
Precision:  0.6446748052586954 
Recall:  0.6523039010184467 
F1:  0.28179866110900587 




In [121]:
# Same depth sweep on the MVP data after cutting every feature into 55 bins.
playerDataBinned = binData(statsDf=playerData, label='MVP', nbins=55)
runDTree(df=playerDataBinned, nDepthsToTry=25, label='MVP')

Depth:  5
{0: 15788, 1: 29}
[[15762    16]
 [   26    13]]
Mean Scores Cross Validation
Precision:  0.748343280140849 
Recall:  0.6673501017069376 
F1:  0.39003496503496504 


Depth:  10
{0: 15780, 1: 37}
[[15753    25]
 [   27    12]]
Mean Scores Cross Validation
Precision:  0.6908112823910054 
Recall:  0.6545648803892554 
F1:  0.31836829836829833 


Depth:  15
{0: 15780, 1: 37}
[[15753    25]
 [   27    12]]
Mean Scores Cross Validation
Precision:  0.6908112823910054 
Recall:  0.6545648803892554 
F1:  0.31836829836829833 


Depth:  20
{0: 15780, 1: 37}
[[15753    25]
 [   27    12]]
Mean Scores Cross Validation
Precision:  0.6908112823910054 
Recall:  0.6545648803892554 
F1:  0.31836829836829833 




In [69]:
# Decision-tree sweep on the raw 'AS' data; an upper bound of 25 gives depths 5, 10, 15, 20.
runDTree(df=playerDataAs, nDepthsToTry=25, label='AS')

Depth:  5
{0: 15100, 1: 717}
[[14677   184]
 [  423   533]]
Mean Scores Cross Validation
Precision:  0.8583528258719552 
Recall:  0.7726079532311312 
F1:  0.636512489142325 


Depth:  10
{0: 14909, 1: 908}
[[14524   337]
 [  385   571]]
Mean Scores Cross Validation
Precision:  0.8079412462296091 
Recall:  0.787301515356663 
F1:  0.6154662068128688 


Depth:  15
{0: 14844, 1: 973}
[[14477   384]
 [  367   589]]
Mean Scores Cross Validation
Precision:  0.7968204734214923 
Recall:  0.7951443309246848 
F1:  0.6144362105446851 


Depth:  20
{0: 14803, 1: 1014}
[[14432   429]
 [  371   585]]
Mean Scores Cross Validation
Precision:  0.7797970310227786 
Recall:  0.7915358902833723 
F1:  0.5960667964750275 




In [124]:
# Same depth sweep on the 'AS' data after cutting every feature into 55 bins.
playerDataAsBinned = binData(statsDf=playerDataAs, label='AS', nbins=55)
runDTree(df=playerDataAsBinned, nDepthsToTry=25, label='AS')

Depth:  5
{0: 15138, 1: 679}
[[14695   166]
 [  443   513]]
Mean Scores Cross Validation
Precision:  0.8704497708797685 
Recall:  0.7627153707114912 
F1:  0.6288825957147874 


Depth:  10
{0: 14986, 1: 831}
[[14579   282]
 [  407   549]]
Mean Scores Cross Validation
Precision:  0.821495384111633 
Recall:  0.7776497728585132 
F1:  0.6152661977477925 


Depth:  15
{0: 14868, 1: 949}
[[14482   379]
 [  386   570]]
Mean Scores Cross Validation
Precision:  0.7913606382003092 
Recall:  0.7853564942065633 
F1:  0.5987061363408956 


Depth:  20
{0: 14831, 1: 986}
[[14458   403]
 [  373   583]]
Mean Scores Cross Validation
Precision:  0.7861061266119173 
Recall:  0.7913553078228676 
F1:  0.601263368573543 


