# Machine Learning Final Project Group 12

### Imports

In [18]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from helpers import (
    load_and_process_nba_data, 
    load_and_process_nba_height_data,
    get_dataset_summary, 
    get_height_dataset_summary,
    print_dataset_summary
)
from preprocessing import (
    clean_data
)
from knn_classifier import (
    knn_classifier
)
from lasso_classifier import (
    lasso_classifier
)
from xgboost_classifier import (
    xgboost_classifier
)
from random_forest_classifier import random_forest_classifier
from gradient_boosting_classifier import gradient_boosting_classifier

### Preprocessing

In [None]:
# Path handed to the height-data loader below.
height2014_data_path = os.path.join('data')

print("\n\nProcessing NBA_Height.csv file...")
height_df = load_and_process_nba_height_data(height2014_data_path)
full_df = height_df
X, y = clean_data(full_df)

print(X.shape)
print(X.columns)

# Majority-class baseline: the accuracy obtained by always predicting the
# most frequent label. It is computed from the counts directly so it does not
# depend on there being exactly two classes, nor on the order in which
# value_counts() happens to return them.
class_counts = y.value_counts()
baseline = class_counts.max() / class_counts.sum()
baseline



Processing NBA_Height.csv file...
   ID   GAME_ID                   MATCHUP  ...  opp       shot_type         player
0   1  21400899  MAR 04, 2015 - CHA @ BKN  ...  NaN  2PT Field Goal  brian roberts
1   2  21400899  MAR 04, 2015 - CHA @ BKN  ...  NaN  3PT Field Goal  brian roberts
2   3  21400899  MAR 04, 2015 - CHA @ BKN  ...  NaN  2PT Field Goal  brian roberts

[3 rows x 34 columns]

 Checking for Null Values.
ID                       0
GAME_ID                  0
MATCHUP                  0
LOCATION                 0
W                        0
FINAL_MARGIN             0
SHOT_NUMBER              0
PERIOD                   0
GAME_CLOCK               0
SHOT_CLOCK               0
DRIBBLES                 0
TOUCH_TIME               0
SHOT_DIST                0
PTS_TYPE                 0
SHOT_RESULT              0
CLOSEST_DEFENDER         0
CLOSE_DEF_DIST           0
FGM                      0
PTS                      0
player_name              0
player_id                0
CLOSEST_DEFEND

0.5476934705414577

### KNN

#### Load Classifier

In [22]:
# %load "./knn_classifier.py"
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from cross_validate import (
    grid_search
)

def knn_classifier(X, y, n_neighbors=20):
    """Tune and evaluate a k-nearest-neighbours classifier.

    Holds out 20% of the rows as a test set (fixed seed), runs
    ``grid_search`` over ``n_neighbors`` on the remaining 80%, and prints the
    selected model's score on both splits.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : labels aligned with ``X``.
    n_neighbors : int, default 20
        Currently unused: the neighbour count is chosen by the grid search.
        Kept so that callers passing it keep working.

    Returns
    -------
    The model handed back by ``grid_search``, so callers can reuse it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining KNN Model\n')
    grid = {'n_neighbors': range(50, 1001, 50)}
    knn = KNeighborsClassifier(n_jobs=-1)
    (knn, results) = grid_search(knn, grid, X_train, y_train)

    # score() on a classifier is mean accuracy, not R2, so label it as such
    # (this also matches the wording used by the other classifier cells).
    print('\n\nTraining Accuracy: ', knn.score(X_train, y_train))
    print('\nTesting Accuracy: ', knn.score(X_test, y_test))

    print(results.columns)

    return knn

#### Run Classifier

In [23]:
# Tune and score the KNN model on the cleaned features X and labels y.
knn_classifier(X, y)


Training KNN Model

 Optimal Parameters: {'n_neighbors': 500}
 Optimal Valid R2 = 0.6104795929414313


Training R2:  0.6140620232857973

Testing R2:  0.6122352570663727
Index(['train R2', 'validation R2'], dtype='object')


### Lasso

In [24]:
# %load "./lasso_classifier.py"
from sklearn.linear_model import Lasso
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def lasso_classifier(X, y):
    """Fit a Lasso model with a small alpha search and print its diagnostics.

    Splits the data 80/20 with a fixed seed, lets ``grid_search`` pick
    ``alpha`` from the training portion, then prints the sorted coefficients
    and the selected model's score on both splits.

    Parameters
    ----------
    X : feature table; its ``columns`` are used to label the coefficients.
    y : numeric target aligned with ``X``.

    Returns
    -------
    None. All results are printed.
    """
    split = train_test_split(X, y, train_size=.8, test_size=.2, random_state=0)
    X_train, X_test, y_train, y_test = split

    print('\nTraining Lasso Model\n')
    alpha_grid = {'alpha': [0.1, 1, 10]}
    best_model, _ = grid_search(Lasso(), alpha_grid, X_train, y_train)

    # One weight per input column, smallest first.
    weights = pd.Series(best_model.coef_, index=X.columns).sort_values()
    print('\nLasso Coef Values: ', weights)

    train_score = best_model.score(X_train, y_train)
    print('\n\nLasso Training R2: ', train_score)
    test_score = best_model.score(X_test, y_test)
    print('\nLasso Testing R2: ', test_score)
    

In [25]:
# Labels are cast to int before fitting Lasso (presumably y is boolean; confirm against clean_data).
lasso_classifier(X, y.astype(int))


Training Lasso Model

 Optimal Parameters: {'alpha': 0.1}
 Optimal Valid R2 = -5.1306455981133446e-05

Lasso Coef Values:  FINAL_MARGIN                               0.0
SHOT_NUMBER                               -0.0
PERIOD                                    -0.0
SHOT_CLOCK                                 0.0
DRIBBLES                                  -0.0
TOUCH_TIME                                -0.0
SHOT_DIST                                 -0.0
PTS_TYPE                                  -0.0
CLOSEST_DEFENDER_PLAYER_ID                -0.0
CLOSE_DEF_DIST                            -0.0
player_id                                 -0.0
SHOOTER_height                             0.0
DEFENDER_height                            0.0
height_diff                                0.0
W_W                                        0.0
quarter_2                                 -0.0
quarter_3                                  0.0
quarter_4                                 -0.0
quarter_5                     

### Random Forest Classifier

In [26]:
# %load "./random_forest_classifier.py"
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def random_forest_classifier(X, y):
    """Tune and evaluate a random forest classifier.

    Holds out 20% of the rows as a test set (fixed seed), runs
    ``grid_search`` over ``max_depth`` and ``n_estimators`` on the remaining
    80%, and prints the selected model's accuracy on both splits.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : labels aligned with ``X``.

    Returns
    -------
    The model handed back by ``grid_search``, so callers can reuse it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining Random Forest Classifier Model\n')
    # Alternative grids left disabled in the notebook, kept for reference:
    #grid = {'max_depth':[1,2,3,10,20],'n_estimators':[1000, 2000, 4000]}
    #grid = {'max_depth':[7,8,9,10,11,12],'n_estimators':[1500, 2000, 3000]}
    grid = {'max_depth': [10, 11, 12, 13, 14], 'n_estimators': [1000, 1500, 2000]}
    rfc = RandomForestClassifier(n_jobs=-1, random_state=0)
    # The second element of the grid_search result is not used here.
    (rfc, _) = grid_search(rfc, grid, X_train, y_train)

    print('\n\nTraining Accuracy: ', rfc.score(X_train, y_train))
    print('\nTesting Accuracy: ', rfc.score(X_test, y_test))

    return rfc



In [27]:
# Tune and score the random forest on the cleaned features X and labels y.
random_forest_classifier(X, y)


Training Random Forest Classifier Model

 Optimal Parameters: {'max_depth': 12, 'n_estimators': 1000}
 Optimal Valid R2 = 0.6224964815416261


Training Accuracy:  0.6713513832708377

Testing Accuracy:  0.6224706716006614


### Gradient Boosting Tree

In [28]:
# %load "./gradient_boosting_classifier.py"
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from cross_validate import grid_search
import matplotlib.pyplot as plt
import time


def gradient_boosting_classifier(X, y):
    """Tune and evaluate a gradient boosting classifier, timing the search.

    Holds out 20% of the rows as a test set (fixed seed), runs
    ``grid_search`` over learning rate, tree count and depth on the remaining
    80%, prints the elapsed time, and prints the selected model's accuracy on
    both splits.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : labels aligned with ``X``.

    Returns
    -------
    The model handed back by ``grid_search``, so callers can reuse it.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # a system clock adjustment during the (multi-minute) search.
    start_time = time.perf_counter()
    print('\nTraining Gradient Boosting Classifier Model\n')
    grid = {
        'learning_rate': [0.01, 0.1, 1],
        'n_estimators': [30, 35, 40, 45, 50, 55, 60],
        'max_depth': [3, 4, 5, 6, 7],
    }
    gbt = GradientBoostingClassifier(random_state=0)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )
    # The second element of the grid_search result is not used here.
    (gbtCV, _) = grid_search(gbt, grid, X_train, y_train)

    end_time = time.perf_counter()
    print(f"Time taken: {round(end_time-start_time, 1)}")

    print('\n\nTraining Accuracy: ', gbtCV.score(X_train, y_train))
    print('\nTesting Accuracy: ', gbtCV.score(X_test, y_test))

    return gbtCV

In [57]:
# Tune and score the gradient boosting model on the cleaned features X and labels y.
gradient_boosting_classifier(X, y)


Training Gradient Boosting Classifier Model

 Optimal Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
 Optimal Valid R2 = 0.6224964815416261
Time taken: 366.3


Training Accuracy:  0.6315214502937789

Testing Accuracy:  0.6226675064955516


### XGBoost

In [35]:
# %load "./xgboost_classifier.py"
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def xgboost_classifier(X, y):
    """Tune and evaluate an XGBoost classifier.

    Holds out 20% of the rows as a test set (fixed seed), runs
    ``grid_search`` over depth, tree count and learning rate on the remaining
    80%, and prints the selected model's accuracy on both splits.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : labels aligned with ``X``.

    Returns
    -------
    The model handed back by ``grid_search``, so callers can reuse it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining XGBoost Classifier Model\n')
    grid = {
        'max_depth': [2, 3, 4, 5],
        'n_estimators': [2000, 3000, 4000],
        'learning_rate': [.01, .1, 1],
    }
    # Seed pinned explicitly, as the random forest and gradient boosting
    # cells do, so the run is reproducible by construction.
    xgbc = XGBClassifier(random_state=0)
    # The second element of the grid_search result is not used here.
    (xgbc, _) = grid_search(xgbc, grid, X_train, y_train)

    print('\n\nTraining Accuracy: ', xgbc.score(X_train, y_train))
    print('\nTesting Accuracy: ', xgbc.score(X_test, y_test))

    return xgbc


### Logistic Regression

In [55]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def logistic_regressor(X, y):
    """Fit and evaluate a logistic regression classifier (no grid search).

    Holds out 20% of the rows as a test set (fixed seed), fits a single
    ``LogisticRegression`` on the remaining 80%, and prints its accuracy on
    both splits.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : labels aligned with ``X``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    # The model is a logistic regression, so the banner says so.
    print('\nTraining Logistic Regression Classifier Model\n')
    # NOTE(review): fit_intercept=False fits no bias term; confirm that this
    # is intended for these features.
    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X_train, y_train)

    print('\n\nTraining Accuracy: ', lr.score(X_train, y_train))
    print('\nTesting Accuracy: ', lr.score(X_test, y_test))

    return lr

In [56]:
# Fit and score the logistic regression on the cleaned features X and labels y.
logistic_regressor(X,y)


Training Linear Regressor Classifier Model



Training Accuracy:  0.6172409381243418

Testing Accuracy:  0.6151484135107472


In [33]:
# %load "./feature_selection.py"
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

def pca(X, y, n_components=0.95, top_k=14):
    """Project ``X`` with PCA and print a report of the leading components.

    Parameters
    ----------
    X : DataFrame of features; its ``columns`` label the component loadings.
    y : unused. Accepted so the call signature matches the classifier helpers.
    n_components : passed straight to ``PCA`` (an integer count or a
        fraction of variance to retain).
    top_k : int or None, default 14
        How many of the strongest components to print in detail. Values
        larger than the number of fitted components are clamped; ``None``
        prints all of them.

    Returns
    -------
    The transformed feature array produced by ``fit_transform``.

    Notes
    -----
    The projection is fitted on every row of ``X`` that is passed in. If the
    result feeds a model that is scored on held-out rows, pass only the
    training rows here to avoid leaking test information into the projection.
    """
    print('\nApplying PCA...')
    # Local name differs from the function name so the function is not
    # shadowed inside its own body.
    model = PCA(n_components=n_components)
    X_PCA = model.fit_transform(X)

    variance_ratio = model.explained_variance_ratio_
    print(f"Explained variance ratio: {variance_ratio}")
    print(f"Number of components after PCA: {X_PCA.shape[1]}")

    components = pd.DataFrame(model.components_, columns=X.columns)
    print(components)

    # Strongest component first; the slice clamps to however many exist.
    order = variance_ratio.argsort()[::-1][:top_k]
    top_components = components.iloc[order]
    top_variance = variance_ratio[order]

    # Report the number actually shown rather than a fixed figure.
    print(f"Top {len(order)} Components (sorted by explained variance):")
    for i in range(len(order)):
        print("Component",  (i+1), " - Explained Variance: ", top_variance[i])
        print(top_components.iloc[i].sort_values(ascending=False))
        print()

    return X_PCA

In [34]:
# Project the features with PCA and keep the transformed array in X_pca.
X_pca = pca(X,y)
# Disabled experiment: train XGBoost on the PCA-projected features instead of X.
#xgboost_classifier(X_pca, y)


Applying PCA...
Explained variance ratio: [0.16601358 0.15073553 0.10593    0.09302109 0.07568075 0.07027472
 0.06096364 0.05931274 0.05303285 0.03836627 0.02342872 0.02215499
 0.01616311 0.01502764]
Number of components after PCA: 14
    FINAL_MARGIN  SHOT_NUMBER    PERIOD  SHOT_CLOCK  DRIBBLES  TOUCH_TIME  \
0       0.015251    -0.092325 -0.040179   -0.004682 -0.368083   -0.372387   
1      -0.028823    -0.220617 -0.197920    0.134278 -0.414552   -0.394083   
2      -0.019482     0.607537  0.648125   -0.023069 -0.126241   -0.116156   
3      -0.012467     0.146071  0.157332    0.041386 -0.166215   -0.165702   
4       0.822919     0.005469 -0.009750   -0.138932  0.018237    0.039066   
5       0.228269     0.024181  0.057725    0.679147 -0.126189   -0.201206   
6      -0.236698    -0.024908 -0.014474   -0.259909 -0.060559   -0.043672   
7       0.220044    -0.011186  0.000550   -0.375619  0.019714    0.038687   
8      -0.058063     0.056895 -0.028752    0.424767  0.311507    0.3233

In [36]:
# Tune and score the XGBoost model on the original (non-PCA) features X and labels y.
xgboost_classifier(X, y)


Training XGBoost Classifier Model

 Optimal Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 3000}
 Optimal Valid R2 = 0.622860629681026


Training Accuracy:  0.6311868276791953

Testing Accuracy:  0.6243602865916069
