In [1]:
import pandas as pd
import glob
import argparse
import os
import csv
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# Load the cleaned player table from the project data directory.
allStats = pd.read_csv('~/MATH189/project/data/player_data_cleaned_as.csv')
# Quick sanity check: mean of the PTS column over the rows whose Year is 1997.
# A single .loc lookup (row mask, column label) replaces the chained indexing.
allStats.loc[allStats.Year == 1997, 'PTS'].mean()

522.5238095238095

In [103]:
def normalizeByYear(playerDf):
    """Return a copy of playerDf with each stat expressed relative to its yearly mean.

    The previous version computed the per-year mean of every stat column but
    discarded it, never touched the copy and returned None. This version
    finishes the job: every stat value is divided by the mean of that stat
    over all rows sharing the same Year.
    NOTE(review): ratio-to-yearly-mean is inferred from the function and
    variable names; confirm this is the intended normalization.

    Args:
        playerDf: pandas DataFrame with a 'Year' column. The columns 'ID',
            'Year', 'Player', 'Pos', 'Tm' and 'MVP' are left untouched when
            present; every other column is treated as a stat to normalize.

    Returns:
        A new DataFrame of the same shape; playerDf itself is not modified.
        Where a stat's yearly mean is 0 the normalized values are NaN
        (instead of +/-inf), because the ratio is undefined there.
    """
    normalizedDf = copy.deepcopy(playerDf)
    statColumns = playerDf.drop(
        ['ID', 'Year', 'Player', 'Pos', 'Tm', 'MVP'], axis=1, errors='ignore'
    ).columns
    for column in statColumns:
        # transform('mean') broadcasts each year's mean back onto that year's rows.
        avgInYear = playerDf.groupby('Year')[column].transform('mean')
        # Mask zero means so the division yields NaN rather than inf.
        normalizedDf[column] = playerDf[column] / avgInYear.replace(0, np.nan)
    return normalizedDf

In [105]:
def scaleData(playerDf, label):
    """Split a cleaned player table into a standardized feature matrix and a label array.

    Args:
        playerDf: cleaned data as a pandas DataFrame.
        label: name of the column to use as the target.

    Returns:
        (X_scaled, y) as numpy arrays. X_scaled is the StandardScaler output
        for the columns left after dropping 'ID', 'Year', 'Player', 'Tm' and
        the label column (missing ones are ignored), cast to float first.
        y is the ``.values`` of the label-column selection, so it is 2-D
        (rows x matching columns); callers flatten it themselves.
    """
    nonFeatureColumns = ['ID', 'Year', 'Player', 'Tm', label]
    # Select the label through a boolean column mask so the result stays 2-D.
    isLabelColumn = playerDf.columns == label
    y = playerDf.loc[:, isLabelColumn].values
    features = playerDf.drop(nonFeatureColumns, axis=1, errors='ignore').astype('float')
    return StandardScaler().fit_transform(features), y

In [106]:
def performPca(scaledData, n_components=.95):
    """Fit a PCA on the scaled data and return the model together with the projection.

    Args:
        scaledData: scaled data matrix.
        n_components: forwarded unchanged to sklearn's PCA; the default of .95
            starts off by trying to retain 95% of the variance.

    Returns:
        (pca, principalDf): the fitted sklearn PCA object and a DataFrame of
        the projected data whose columns are named 'PC-0', 'PC-1', ...
    """
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(scaledData)
    # One column name per retained component.
    columnNames = ['PC-' + str(idx) for idx in range(projected.shape[1])]
    return pca, pd.DataFrame(data=projected, columns=columnNames)

In [107]:
def runCV(clf, X, y, k, scoring):
    """Cross-validate clf on (X, y) with cv=k and print a summary.

    Prints, in order: the count of each predicted label, the confusion matrix
    of the pooled cross-validated predictions, and the mean precision, recall
    and F1 across folds. Nothing is returned.

    Args:
        clf: estimator handed to cross_val_predict and cross_validate.
        X: feature matrix.
        y: label vector.
        k: value passed as ``cv`` to both cross-validation calls.
        scoring: scorers for cross_validate. The result keys
            'test_precision_macro', 'test_recall_macro' and 'test_f1' are read
            below, so the scoring must produce them.
    """
    # Pooled predictions gathered across the whole cross-validation.
    pooledPred = cross_val_predict(clf, X, y, cv=k)
    predictedLabels, labelCounts = np.unique(pooledPred, return_counts=True)
    pooledConfusion = confusion_matrix(y, pooledPred)
    print(dict(zip(predictedLabels, labelCounts)))
    print(pooledConfusion)

    # Per-fold scores, averaged over the folds.
    foldScores = cross_validate(clf, X, y, cv=k, scoring=scoring)
    precision_mean, recall_mean, f1_mean = (
        foldScores[key].mean()
        for key in ('test_precision_macro', 'test_recall_macro', 'test_f1')
    )
    print('Mean Scores Cross Validation\nPrecision: ', precision_mean, '\nRecall: ', recall_mean, '\nF1: ', f1_mean, '\n\n')
#     print(scores)

In [108]:
def runSVM(X, y, kernel="rbf", C=1.0):
    """Train an SVC on a train split and report confusion-matrix counts on the test split.

    Args:
        X: feature matrix.
        y: label vector.
        kernel: kernel name forwarded to svm.SVC.
        C: regularization parameter forwarded to svm.SVC.

    Returns:
        (tn, fp, fn, tp) unpacked from the flattened confusion matrix of the
        test predictions; the unpacking requires that matrix to have exactly
        four cells.
    """
    # NOTE(review): extractTestAndTrainData is not defined in the visible part
    # of the notebook; it must already exist in the kernel when this runs.
    trainX, testX, trainY, testY = extractTestAndTrainData(X, y)
    model = svm.SVC(kernel=kernel, C=C).fit(trainX, trainY)
    flatCounts = confusion_matrix(testY, model.predict(testX)).ravel()
    tn, fp, fn, tp = flatCounts
    return tn, fp, fn, tp

In [109]:
# Linear-kernel SVM sweep over C, evaluated with 5-fold cross-validation on the
# PCA projection of the scaled player stats, using the 'AS' column as the label.

# Scorers requested from cross_validate; runCV reads the matching
# 'test_precision_macro', 'test_recall_macro' and 'test_f1' result keys.
scoring = ['precision_macro', 'recall_macro', 'f1']
data_scaled, y = scaleData(allStats, 'AS')
# scaleData returns the label as a 2-D array; flatten it to one entry per row.
y = y.reshape(y.shape[0],)
# Ask PCA for n_components=.999 (variance-fraction form, per performPca's comment).
pca, principalDf = performPca(data_scaled, .999)
# Powers of two from 2**-4 (0.0625) up to 2**3 (8.0).
for i in range(-4,4):
    C = 2.0**i
    print("C parameter of: " + str(C) + " and linear kernel for cross validation")
    clf = svm.SVC(kernel="linear", C=C, random_state=0)
    # runCV prints the predicted-label counts, confusion matrix and mean scores.
    runCV(clf,principalDf.values,y,k=5,scoring=scoring)

C parameter of: 0.0625 and linear kernel for cross validation
{0: 15060, 1: 757}
[[14703   158]
 [  357   599]]
Mean Scores Cross Validation
Precision:  0.884532551283417 
Recall:  0.807983187341476 
F1:  0.6992546695505473 


C parameter of: 0.125 and linear kernel for cross validation
{0: 15049, 1: 768}
[[14696   165]
 [  353   603]]
Mean Scores Cross Validation
Precision:  0.882389490746865 
Recall:  0.8098337725106205 
F1:  0.6994716603407471 


C parameter of: 0.25 and linear kernel for cross validation
{0: 15037, 1: 780}
[[14690   171]
 [  347   609]]
Mean Scores Cross Validation
Precision:  0.8810395226618223 
Recall:  0.8127596604042859 
F1:  0.7016427963108589 


C parameter of: 0.5 and linear kernel for cross validation
{0: 15037, 1: 780}
[[14689   172]
 [  348   608]]
Mean Scores Cross Validation
Precision:  0.8807215656297618 
Recall:  0.8122052023307493 
F1:  0.7005258123368749 


C parameter of: 1.0 and linear kernel for cross validation
{0: 15032, 1: 785}
[[14686   175]
