# Using Machine Learning to Find Similar NBA Players

In [1]:
# Import packages

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Import scikit-learn moduldes
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Import Data

In [2]:
# User path to data file
file = "/Users/mini/Documents/nba_data/data_files/player_data_1981-2017.csv"
data = pd.read_csv(file)

In [3]:
data.index

RangeIndex(start=0, stop=15234, step=1)

In [4]:
data.columns.values

array(['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP_tot',
       'FG_tot', 'FGA_tot', 'FG_perc', '3P_tot', '3PA_tot', '3P_perc',
       '2P_tot', '2PA_tot', '2P_perc', 'eFG_perc', 'FT_tot', 'FTA_tot',
       'FT_perc', 'ORB_tot', 'DRB_tot', 'TRB_tot', 'AST_tot', 'STL_tot',
       'BLK_tot', 'TOV_tot', 'PF_tot', 'PTS_tot', 'MP_per_G', 'FG_per_G',
       'FGA_per_G', '3P_per_G', '3PA_per_G', '2P_per_G', '2PA_per_G',
       'FT_per_G', 'FTA_per_G', 'ORB_per_G', 'DRB_per_G', 'TRB_per_G',
       'AST_per_G', 'STL_per_G', 'BLK_per_G', 'TOV_per_G', 'PF_per_G',
       'PTS_per_G', 'FG_per_36m', 'FGA_per_36m', '3P_per_36m',
       '3PA_per_36m', '2P_per_36m', '2PA_per_36m', 'FT_per_36m',
       'FTA_per_36m', 'ORB_per_36m', 'DRB_per_36m', 'TRB_per_36m',
       'AST_per_36m', 'STL_per_36m', 'BLK_per_36m', 'TOV_per_36m',
       'PF_per_36m', 'PTS_per_36m', 'PER', 'TS_perc', '3PAr', 'FTr',
       'ORB_perc', 'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc',
       'BLK_perc', 'TOV_perc', '

In [5]:
data.shape

(15234, 106)

## Data Cleaning

In [6]:
# Subset data to G >= 27, MP_tot >= 750
data = data[(data.G >= 27) & (data.MP_tot >= 750)]

In [7]:
tot_list = data.columns[8:30]

In [8]:
per_100p_list = data.columns[-20:-3]

In [9]:
drop_cols = []
for col in tot_list:
    drop_cols.append(col)
    
for col in per_100p_list:
    drop_cols.append(col)

In [10]:
# Drop unwanted columns
data.drop(drop_cols, axis = 1, inplace=True)

In [11]:
# Reset indices for querying later
data = data.reset_index()

In [12]:
len(data)

9872

In [13]:
list(data.columns)

['index',
 'Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 'WS',
 'WS_per_48',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP',
 'MP',
 'ORtg',
 'DRtg',
 'Rounded_Pos']

## Data Preprocessing

In [14]:
# Assign feature and target values
# Exclude things that are labels and not useful statistics
x = data.drop(['Season', 'Player', 'Pos', 'Tm', 'G', 'GS', 'Rounded_Pos'], axis = 1)
y = data['Pos']

In [15]:
x.columns.values

array(['index', 'Age', 'MP_tot', 'MP_per_G', 'FG_per_G', 'FGA_per_G',
       '3P_per_G', '3PA_per_G', '2P_per_G', '2PA_per_G', 'FT_per_G',
       'FTA_per_G', 'ORB_per_G', 'DRB_per_G', 'TRB_per_G', 'AST_per_G',
       'STL_per_G', 'BLK_per_G', 'TOV_per_G', 'PF_per_G', 'PTS_per_G',
       'FG_per_36m', 'FGA_per_36m', '3P_per_36m', '3PA_per_36m',
       '2P_per_36m', '2PA_per_36m', 'FT_per_36m', 'FTA_per_36m',
       'ORB_per_36m', 'DRB_per_36m', 'TRB_per_36m', 'AST_per_36m',
       'STL_per_36m', 'BLK_per_36m', 'TOV_per_36m', 'PF_per_36m',
       'PTS_per_36m', 'PER', 'TS_perc', '3PAr', 'FTr', 'ORB_perc',
       'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc', 'BLK_perc',
       'TOV_perc', 'USG_perc', 'OWS', 'DWS', 'WS', 'WS_per_48', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'MP', 'ORtg', 'DRtg'], dtype=object)

In [16]:
# Scale data for dimensionality reduction
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

## Dimension Reduction

In [17]:
pca = PCA(n_components = 2)
pca.fit(x_scaled)

x_pca = pca.transform(x_scaled)
print(pca.explained_variance_ratio_.sum())

0.563165173562


In [18]:
pca.explained_variance_ratio_

array([ 0.33363118,  0.22953399])

In [19]:
LDA = LinearDiscriminantAnalysis(n_components = 2, solver = 'eigen' , shrinkage = 'auto')

LDA_reduced_df = LDA.fit_transform(x_scaled, y)



In [20]:
LDA.score(x_scaled, y)

0.66997568881685576

## Search with KDTree

In [48]:
# Use KDTree to find nearest neighbors
tree = KDTree(x_scaled, leaf_size=40, metric='euclidean')
dist, ind = tree.query([x_scaled[5632]], k = 30)
print(ind)
print(dist)

[[5632 5915 5553 5460 4846 5182 6645 5121 8491 4869 5129 5730 1712 6404
  9018 8464 3055 7758 6774 7804 9000 6211 9084 8580 8773 8887 5056 6864
  6478 8199]]
[[ 0.          2.4217271   3.17640459  3.29161682  3.47161345  3.47753246
   3.56638524  3.57915223  3.6026154   3.64279319  3.65137503  3.67174047
   3.69002173  3.75228337  3.76896113  3.78835012  3.84860257  3.88219377
   3.91786142  3.91816122  3.92752207  3.94632954  4.00130952  4.02105461
   4.10328949  4.18792061  4.19275674  4.24449254  4.25133234  4.25596283]]


In [22]:
data[(data['Player'] == 'Tony Parker') & (data['Season'] == '2002-03')]

Unnamed: 0,index,Season,Player,Pos,Age,Tm,G,GS,MP_tot,MP_per_G,...,WS,WS_per_48,OBPM,DBPM,BPM,VORP,MP,ORtg,DRtg,Rounded_Pos
5632,8610,2002-03,Tony Parker,PG,20.0,SAS,82.0,82.0,2774.0,33.8,...,7.7,0.134,1.9,-1.7,0.3,1.6,2774.0,108.0,104.0,1


In [33]:
data.iloc[5632]

index                 8610
Season             2002-03
Player         Tony Parker
Pos                     PG
Age                     20
Tm                     SAS
G                       82
GS                      82
MP_tot                2774
MP_per_G              33.8
FG_per_G               5.9
FGA_per_G             12.7
3P_per_G                 1
3PA_per_G                3
2P_per_G               4.9
2PA_per_G              9.8
FT_per_G               2.7
FTA_per_G              3.5
ORB_per_G              0.4
DRB_per_G              2.2
TRB_per_G              2.6
AST_per_G              5.3
STL_per_G              0.9
BLK_per_G                0
TOV_per_G              2.4
PF_per_G               2.1
PTS_per_G             15.5
FG_per_36m             6.3
FGA_per_36m           13.5
3P_per_36m             1.1
                  ...     
AST_per_36m            5.6
STL_per_36m            0.9
BLK_per_36m            0.1
TOV_per_36m            2.6
PF_per_36m             2.3
PTS_per_36m           16.5
P

In [31]:
data.iloc[5553]

index                 8488
Season             2002-03
Player         Troy Hudson
Pos                     PG
Age                     26
Tm                     MIN
G                       79
GS                      74
MP_tot                2600
MP_per_G              32.9
FG_per_G               5.2
FGA_per_G             12.1
3P_per_G               1.2
3PA_per_G              3.4
2P_per_G               3.9
2PA_per_G              8.7
FT_per_G               2.6
FTA_per_G              2.9
ORB_per_G              0.5
DRB_per_G              1.8
TRB_per_G              2.3
AST_per_G              5.7
STL_per_G              0.8
BLK_per_G              0.1
TOV_per_G              2.3
PF_per_G                 2
PTS_per_G             14.2
FG_per_36m             5.7
FGA_per_36m           13.2
3P_per_36m             1.3
                  ...     
AST_per_36m            6.3
STL_per_36m            0.8
BLK_per_36m            0.1
TOV_per_36m            2.5
PF_per_36m             2.2
PTS_per_36m           15.5
P

In [47]:
data[['Player', 'Season']].iloc[9018]

Player    Goran Dragic
Season         2014-15
Name: 9018, dtype: object