In [6]:
import pandas as pd
import glob
import argparse
import os
import csv
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [7]:
# Load the cleaned player statistics table.
allStats = pd.read_csv('~/MATH189/project/data/player_data_cleaned.csv')
# Independent deep copy of the loaded frame.
d = copy.deepcopy(allStats)
# Column averages over the rows whose Year is 1979. numeric_only=True limits the
# reduction to numeric columns; without it pandas >= 2.0 raises TypeError as soon
# as the frame holds a text column (Player / Pos / Tm look like text -- TODO confirm).
means = allStats[(allStats.Year == 1979)].mean(numeric_only=True)
# Remove identifier / label entries from the averages; errors='ignore' tolerates
# entries that are already absent from the result.
means = means.drop(['ID', 'Year', 'Player', 'Pos', 'Tm', 'MVP', 'Age'], axis=0, errors='ignore')
means.PTS

710.9142857142857

In [8]:
# def normalizeByYear(playerDf):
#     normalizedDf = copy.deepcopy(playerDf)
#     for year in playerDf.Year.unique():
#         for column in playerDf.drop(['ID', 'Year', 'Player', 'Pos', 'Tm', 'MVP'], axis=1, errors='ignore'):
#             avgInYear = playerDf[column][(playerDf.Year == year)].mean()
#             if avgInYear == 0:
#                 avgInYear = .01
#             normalizedDf[column][(normalizedDf.Year == year)] = normalizedDf[column][(normalizedDf.Year == year)].div(avgInYear)
#     return normalizedDf

In [9]:
# normalizedDf = normalizeByYear(allStats)

In [10]:
def scaleData(playerDf, label):
    """Split a player DataFrame into a standardized feature matrix and labels.

    Input:  playerDf -- pandas DataFrame of player data
            label    -- name of the column holding the target
    Output: (X_scaled, y) as numpy arrays. X_scaled is the StandardScaler
            output for every column except 'ID', 'Year', 'Player', 'Tm' and
            the label column (after casting to float). y is the label column
            selected with a column mask, so it keeps two dimensions.
    """
    nonFeatureCols = ['ID', 'Year', 'Player', 'Tm', label]
    # errors='ignore' lets the drop succeed when some of these columns are absent.
    features = playerDf.drop(columns=nonFeatureCols, errors='ignore')
    isLabelCol = playerDf.columns == label
    labels = playerDf.loc[:, isLabelCol].values
    features = features.astype('float')
    return StandardScaler().fit_transform(features), labels

In [11]:
def performPca(scaledData, n_components=.95):
    """Fit PCA on a scaled data matrix and project the data onto it.

    Inputs:  scaledData   -- scaled data matrix
             n_components -- forwarded unchanged to sklearn's PCA; the default
                             of .95 is meant to retain 95% of the variance
    Outputs: (pca, principalDf) -- the fitted sklearn PCA object and a
             DataFrame of the projected data with columns 'PC-0', 'PC-1', ...
    """
    model = PCA(n_components=n_components)
    projected = model.fit_transform(scaledData)
    # One column name per retained component.
    columnNames = ['PC-' + str(idx) for idx in range(projected.shape[1])]
    return model, pd.DataFrame(data=projected, columns=columnNames)

In [12]:
def runCV(clf, X, y, k, scoring):
    """Cross-validate clf on (X, y) with cv=k and print a summary.

    Prints, in order: the count of each predicted label from
    cross_val_predict, the confusion matrix of y against those predictions,
    and the mean of the 'test_precision_macro', 'test_recall_macro' and
    'test_f1' entries returned by cross_validate. `scoring` is forwarded to
    cross_validate and must therefore produce those three entries.
    Returns nothing.
    """
    # Predictions gathered across all folds
    pooledPred = cross_val_predict(clf, X, y, cv=k)
    predLabels, predCounts = np.unique(pooledPred, return_counts=True)
    confusion = confusion_matrix(y, pooledPred)
    print(dict(zip(predLabels, predCounts)))
    print(confusion)

    # Per-fold metrics, averaged over the folds
    foldScores = cross_validate(clf, X, y, cv=k, scoring=scoring)
    precision_mean, recall_mean, f1_mean = (
        foldScores[metricKey].mean()
        for metricKey in ('test_precision_macro', 'test_recall_macro', 'test_f1')
    )
    print('Mean Scores Cross Validation\nPrecision: ', precision_mean, '\nRecall: ', recall_mean, '\nF1: ', f1_mean, '\n\n')
#     print(scores)

In [13]:
def runSVM(X, y, kernel="rbf", C=1.0):
    """Fit an SVC on a train split and return the test-split confusion counts.

    Inputs:  X, y   -- feature matrix and labels, split by extractTestAndTrainData
                       (NOTE(review): that helper is not defined in the visible
                       part of this notebook -- confirm it exists before running)
             kernel -- SVC kernel name (default "rbf")
             C      -- SVC regularization parameter (default 1.0)
    Outputs: (tn, fp, fn, tp) unpacked from the flattened 2x2 confusion matrix.
    Raises:  ValueError from the unpacking when y does not contain exactly two
             distinct labels.
    """
    X_train, X_test, y_train, y_test = extractTestAndTrainData(X,y)
    clf = svm.SVC(kernel=kernel, C=C)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Pin the label set to the classes present in the full y. Without this,
    # confusion_matrix infers labels from y_test/y_pred only, and a test split
    # in which a single class appears yields a 1x1 matrix that cannot be
    # unpacked into four values.
    classLabels = np.unique(y)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=classLabels).ravel()
    return tn, fp, fn, tp

In [15]:
# Metrics requested from cross_validate; runCV reads back the matching test_* entries.
scoring = ['precision_macro', 'recall_macro', 'f1']

# Standardize the features and pull out the 'MVP' label column.
data_scaled, y = scaleData(allStats, 'MVP')
# Reshape the label array to one dimension with one entry per row.
y = y.reshape(len(y))

# Project onto principal components, requesting .999 as n_components.
pca, principalDf = performPca(data_scaled, .999)

# Sweep the SVC penalty over the powers of two 2**-4 ... 2**3 with a linear kernel.
for i in range(-4, 4):
    C = 2.0 ** i
    print(f"C parameter of: {C} and linear kernel for cross validation")
    clf = svm.SVC(kernel="linear", C=C, random_state=0)
    runCV(clf, principalDf.values, y, k=5, scoring=scoring)

C parameter of: 0.0625 and linear kernel for cross validation
{0: 15811, 1: 6}
[[15776     2]
 [   35     4]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Mean Scores Cross Validation
Precision:  0.748893195521851 
Recall:  0.5499366186008489 
F1:  0.16444444444444445 


C parameter of: 0.125 and linear kernel for cross validation
{0: 15805, 1: 12}
[[15775     3]
 [   30     9]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Mean Scores Cross Validation
Precision:  0.7790510530105703 
Recall:  0.6141906472084896 
F1:  0.31661227661227664 


C parameter of: 0.25 and linear kernel for cross validation
{0: 15803, 1: 14}
[[15775     3]
 [   28    11]]
Mean Scores Cross Validation
Precision:  0.8957808208178228 
Recall:  0.6391906472084896 
F1:  0.3937839937839938 


C parameter of: 0.5 and linear kernel for cross validation
{0: 15792, 1: 25}
[[15769     9]
 [   23    16]]
Mean Scores Cross Validation
Precision:  0.846176552960803 
Recall:  0.705071951668473 
F1:  0.5052813852813853 


C parameter of: 1.0 and linear kernel for cross validation
{0: 15785, 1: 32}
[[15765    13]
 [   20    19]]
Mean Scores Cross Validation
Precision:  0.8471154044243138 
Recall:  0.7442309131988891 
F1:  0.5524685840475314 


C parameter of: 2.0 and linear kernel for cross validation
{0: 15782, 1: 35}
[[15762    16]
 [   20    19]]
Mean Scores Cross Validation
Precision:  0.796877188911667 
Recall:  0.7441358461216644 
F1:  0.5250