# Machine Learning Final Project Group 12

### Imports

In [6]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from helpers import (
    load_and_process_nba_data, 
    load_and_process_nba_height_data,
    get_dataset_summary, 
    get_height_dataset_summary,
    print_dataset_summary
)
from preprocessing import (
    clean_data
)
from knn_classifier import (
    knn_classifier
)
from lasso_classifier import (
    lasso_classifier
)
from xgboost_classifier import (
    xgboost_classifier
)
from random_forest_classifier import random_forest_classifier
from gradient_boosting_classifier import gradient_boosting_classifier

### Preprocessing

In [7]:
# Location of the input data, relative to the notebook's working directory.
height2014_data_path = 'data'

print("\n\nProcessing NBA_Height.csv file...")

# `height_df` and `full_df` are two names for the same loaded frame;
# `full_df` is the one handed to the cleaning step.
full_df = height_df = load_and_process_nba_height_data(height2014_data_path)

# clean_data returns a pair; the cells below pass it to every model as (X, y).
X, y = clean_data(full_df)

print(X.shape)



Processing NBA_Height.csv file...
   ID   GAME_ID                   MATCHUP LOCATION  W  FINAL_MARGIN  \
0   1  21400899  MAR 04, 2015 - CHA @ BKN        A  W            24   
1   2  21400899  MAR 04, 2015 - CHA @ BKN        A  W            24   
2   3  21400899  MAR 04, 2015 - CHA @ BKN        A  W            24   

   SHOT_NUMBER  PERIOD          GAME_CLOCK  SHOT_CLOCK  ...   made  \
0            1       1  12/30/1899 1:09:00        10.8  ...   True   
1            2       1  12/30/1899 0:14:00         3.4  ...  False   
2            3       1  12/30/1899 0:00:00         0.0  ...  False   

   time_remaining  quarter  distance_bin defender_distance_bin height_diff  \
0         1:09:00        1       3-10 ft        Tight (0-2 ft)        -5.0   
1         0:14:00        1        23+ ft     Wide Open (6+ ft)        -7.0   
2         0:00:00        1      10-16 ft        Tight (0-2 ft)        -7.0   

   team  opp       shot_type         player  
0   NaN  NaN  2PT Field Goal  brian rob

### KNN

#### Load classifier

In [10]:
# %load "./knn_classifier.py"
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from cross_validate import (
    grid_search
)

def knn_classifier(X, y, n_neighbors=20):
    """Tune a k-nearest-neighbours classifier and report its accuracy.

    The data is split 80/20 with a fixed seed. The project's ``grid_search``
    helper is run on the training part over ``n_neighbors`` = 50, 100, ...,
    1000, and the model it hands back is scored on both parts.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Class labels aligned with ``X``.
        n_neighbors: Currently unused -- the candidate values come from the
            hard-coded grid below. Kept so existing callers keep working.

    Returns:
        The model returned by ``grid_search`` (presumably the refit best
        estimator -- confirm against ``cross_validate.grid_search``).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining KNN Model\n')
    grid = {'n_neighbors': range(50, 1001, 50)}
    # The second element (search results) was only needed by plotting code
    # that is no longer active, so it is discarded.
    knn, _ = grid_search(KNeighborsClassifier(), grid, X_train, y_train)

    # A classifier's score() is mean accuracy, not R^2, so label it as such.
    print('\n\nTraining Accuracy: ', knn.score(X_train, y_train))
    print('\nTesting Accuracy: ', knn.score(X_test, y_test))

    # The original ended with a bare `knn` expression, which does nothing
    # inside a function; return the model so callers can actually use it.
    return knn

#### Run Classifier

In [11]:
# Run the KNN grid search on the cleaned features/labels; scores are printed by the function.
knn_classifier(X, y)


Training KNN Model

 Optimal Parameters: {'n_neighbors': 500}
 Optimal Valid R2 = 0.6104795929414313


Training R2:  0.6140620232857973

Testing R2:  0.6122352570663727


### Lasso

In [13]:
# %load "./lasso_classifier.py"
from sklearn.linear_model import Lasso
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def lasso_classifier(X, y, alphas=(0.1, 1, 10)):
    """Fit a Lasso regression via grid search and report coefficients and R^2.

    Lasso is a regression model, so ``y`` must be numeric and the printed
    scores are genuine R^2 values (not accuracies).

    Args:
        X: Feature DataFrame; its ``columns`` label the printed coefficients.
        y: Numeric target aligned with ``X``.
        alphas: Candidate regularisation strengths for the grid search.
            Defaults to the original hard-coded grid. Pass smaller values
            (e.g. ``(1e-4, 1e-3, 1e-2)``) if every coefficient is shrunk to
            zero, as an L1 penalty that is too strong will do.

    Returns:
        The model returned by ``grid_search`` (presumably the refit best
        estimator -- confirm against ``cross_validate.grid_search``).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining Lasso Model\n')
    grid = {'alpha': list(alphas)}
    lasso, _ = grid_search(Lasso(), grid, X_train, y_train)

    # Pair each fitted coefficient with its feature name for readable output.
    coef = pd.Series(lasso.coef_, index=X.columns)
    print('\nLasso Coef Values: ', coef.sort_values())

    print('\n\nLasso Training R2: ', lasso.score(X_train, y_train))
    print('\nLasso Testing R2: ', lasso.score(X_test, y_test))

    return lasso
    

In [14]:
# Lasso is a regression model, so the labels are cast to integers before fitting
# (presumably y is the boolean `made` flag shown in the preview above -- confirm in clean_data).
lasso_classifier(X, y.astype(int))


Training Lasso Model

 Optimal Parameters: {'alpha': 0.1}
 Optimal Valid R2 = -5.1306455981133446e-05

Lasso Coef Values:  FINAL_MARGIN                               0.0
SHOT_NUMBER                               -0.0
PERIOD                                    -0.0
SHOT_CLOCK                                 0.0
DRIBBLES                                  -0.0
TOUCH_TIME                                -0.0
SHOT_DIST                                 -0.0
PTS_TYPE                                  -0.0
CLOSEST_DEFENDER_PLAYER_ID                -0.0
CLOSE_DEF_DIST                            -0.0
player_id                                 -0.0
SHOOTER_height                             0.0
DEFENDER_height                            0.0
height_diff                                0.0
W_W                                        0.0
quarter_2                                 -0.0
quarter_3                                  0.0
quarter_4                                 -0.0
quarter_5                     

### Random Forest Classifier

In [16]:
# %load "./random_forest_classifier.py"
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def random_forest_classifier(X, y):
    """Tune a random forest classifier with a grid search and report accuracy.

    The data is split 80/20 with a fixed seed. The project's ``grid_search``
    helper is run on the training part over ``max_depth`` and
    ``n_estimators``, and the model it hands back is scored on both parts.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Class labels aligned with ``X``.

    Returns:
        The model returned by ``grid_search`` (presumably the refit best
        estimator -- confirm against ``cross_validate.grid_search``).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining Random Forest Classifier Model\n')
    grid = {'max_depth': [1, 2, 3, 10, 20], 'n_estimators': [1000, 2000, 4000]}
    # Seed the forest as well as the split; otherwise bootstrap sampling and
    # feature sub-sampling make every run give slightly different numbers.
    # (Assumes grid_search keeps the estimator's parameters -- confirm.)
    rfc = RandomForestClassifier(random_state=0)
    rfc, _ = grid_search(rfc, grid, X_train, y_train)

    print('\n\nTraining Accuracy: ', rfc.score(X_train, y_train))
    print('\nTesting Accuracy: ', rfc.score(X_test, y_test))

    # The original ended with a bare `rfc` expression, which does nothing
    # inside a function; return the model so callers can actually use it.
    return rfc



In [17]:
# Run the random forest grid search (15 parameter combinations, 1000-4000 trees each).
random_forest_classifier(X, y)


Training Random Forest Classifier Model

 Optimal Parameters: {'max_depth': 10, 'n_estimators': 2000}
 Optimal Valid R2 = 0.6222504355014911


Training Accuracy:  0.641629021622526

Testing Accuracy:  0.6235335800330682


### Gradient Boosting Tree

In [21]:
# %load "./gradient_boosting_classifier.py"
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from cross_validate import grid_search
# Must be pyplot: the bare `matplotlib` package has no show(), and binding it
# to `plt` would also break every other cell that uses the notebook-global `plt`.
import matplotlib.pyplot as plt
import time


def gradient_boosting_classifier(X, y):
    """Tune a gradient boosting classifier with a grid search and report accuracy.

    The data is split 80/20 with a fixed seed. The project's ``grid_search``
    helper is run on the training part, and the model it hands back is scored
    on both parts. The elapsed time of the split plus search is printed.

    The grid holds 4 x 3 x 3 = 36 parameter combinations with up to 3000
    trees of depth up to 15 each, so expect a very long run.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Class labels aligned with ``X``.

    Returns:
        The model returned by ``grid_search`` (presumably the refit best
        estimator -- confirm against ``cross_validate.grid_search``).
    """
    start_time = time.time()
    print('\nTraining Gradient Boosting Classifier Model\n')
    grid = {
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'n_estimators': [1000, 2000, 3000],
        'max_depth': [5, 10, 15],
    }
    gbt = GradientBoostingClassifier()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )
    gbtCV, _ = grid_search(gbt, grid, X_train, y_train)

    end_time = time.time()
    print(f"Time taken: {round(end_time-start_time, 1)}")

    # A classifier's score() is mean accuracy, not R^2, so label it as such.
    print('\n\nTraining Accuracy: ', gbtCV.score(X_train, y_train))
    print('\nTesting Accuracy: ', gbtCV.score(X_test, y_test))

    # No figure is drawn here, so the former trailing plt.show() is dropped;
    # previously it raised AttributeError after the whole search had finished.
    return gbtCV

In [22]:
# NOTE: the recorded run of this cell was stopped by hand (see the KeyboardInterrupt below).
# The function's grid spans 4 x 3 x 3 = 36 parameter combinations with up to 3000 trees each.
gradient_boosting_classifier(X, y)


Training Gradient Boosting Classifier Model



KeyboardInterrupt: 

In [10]:
# %load "./xgboost_classifier.py"
from xgboost_classifier import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from cross_validate import (
    grid_search
)

def xgboost_classifier(X, y):
    """Tune an XGBoost classifier with a grid search and report accuracy.

    The data is split 80/20 with a fixed seed. The project's ``grid_search``
    helper is run on the training part over ``learning_rate`` (``max_depth``
    and ``n_estimators`` are fixed to one value each), and the model it hands
    back is scored on both parts.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Class labels aligned with ``X``.

    Returns:
        The model returned by ``grid_search`` (presumably the refit best
        estimator -- confirm against ``cross_validate.grid_search``).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=.8, test_size=.2, random_state=0
    )

    print('\nTraining XGBoost Classifier Model\n')
    grid = {'max_depth': [20], 'n_estimators': [1000], 'learning_rate': [.01, .1]}
    # NOTE(review): this cell imports XGBClassifier from the local
    # `xgboost_classifier` module rather than from the `xgboost` package --
    # presumably a slip; confirm which import is intended.
    xgbc, _ = grid_search(XGBClassifier(), grid, X_train, y_train)

    print('\n\nTraining Accuracy: ', xgbc.score(X_train, y_train))
    print('\nTesting Accuracy: ', xgbc.score(X_test, y_test))

    # The original ended with a bare `xgbc` expression, which does nothing
    # inside a function; return the model so callers can actually use it.
    return xgbc


In [11]:
# Run the XGBoost grid search on the cleaned features/labels; scores are printed by the function.
xgboost_classifier(X, y)


Training XGBoost Classifier Model

 Optimal Parameters: {'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 1000}
 Optimal Valid R2 = 0.6015924099717539


Training Accuracy:  0.9998326886927081

Testing Accuracy:  0.6038107235650736
