# Using Machine Learning to Find Similar NBA Players

In [1]:
# Import packages

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Import scikit-learn moduldes
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Import Data

In [2]:
# User path to data file
file = "/Users/mini/Documents/nba_data/data_files/player_data_1981-2017.csv"
data = pd.read_csv(file)

In [4]:
data.columns.values

array(['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP_tot',
       'FG_tot', 'FGA_tot', 'FG_perc', '3P_tot', '3PA_tot', '3P_perc',
       '2P_tot', '2PA_tot', '2P_perc', 'eFG_perc', 'FT_tot', 'FTA_tot',
       'FT_perc', 'ORB_tot', 'DRB_tot', 'TRB_tot', 'AST_tot', 'STL_tot',
       'BLK_tot', 'TOV_tot', 'PF_tot', 'PTS_tot', 'MP_per_G', 'FG_per_G',
       'FGA_per_G', '3P_per_G', '3PA_per_G', '2P_per_G', '2PA_per_G',
       'FT_per_G', 'FTA_per_G', 'ORB_per_G', 'DRB_per_G', 'TRB_per_G',
       'AST_per_G', 'STL_per_G', 'BLK_per_G', 'TOV_per_G', 'PF_per_G',
       'PTS_per_G', 'FG_per_36m', 'FGA_per_36m', '3P_per_36m',
       '3PA_per_36m', '2P_per_36m', '2PA_per_36m', 'FT_per_36m',
       'FTA_per_36m', 'ORB_per_36m', 'DRB_per_36m', 'TRB_per_36m',
       'AST_per_36m', 'STL_per_36m', 'BLK_per_36m', 'TOV_per_36m',
       'PF_per_36m', 'PTS_per_36m', 'PER', 'TS_perc', '3PAr', 'FTr',
       'ORB_perc', 'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc',
       'BLK_perc', 'TOV_perc', '

In [5]:
data.shape

(15234, 106)

## Data Cleaning

In [6]:
# Subset data to G >= 27, MP_tot >= 750
data = data[(data.G >= 27) & (data.MP_tot >= 750)]

In [7]:
tot_list = data.columns[8:30]

In [8]:
per_100p_list = data.columns[-20:-3]

In [9]:
drop_cols = [] # Store columns to drop
for col in tot_list: # Drop season totals
    drop_cols.append(col)
    
for col in per_100p_list: # Drop per 100 possessions stats
    drop_cols.append(col)

The extra data was making the player query inaccurate from what I could tell.

In [10]:
# Drop unwanted columns
data.drop(drop_cols, axis = 1, inplace=True)

In [11]:
# Reset indices for querying later
data = data.reset_index()

## Data Preprocessing

In [13]:
# Assign feature and target values
# Exclude things that are labels and not useful statistics
x = data.drop(['Season', 'Player', 'Pos', 'Tm', 'G', 'GS', 'Rounded_Pos'], axis = 1)
y = data['Pos']

In [14]:
x.columns.values

array(['index', 'Age', 'MP_tot', 'MP_per_G', 'FG_per_G', 'FGA_per_G',
       '3P_per_G', '3PA_per_G', '2P_per_G', '2PA_per_G', 'FT_per_G',
       'FTA_per_G', 'ORB_per_G', 'DRB_per_G', 'TRB_per_G', 'AST_per_G',
       'STL_per_G', 'BLK_per_G', 'TOV_per_G', 'PF_per_G', 'PTS_per_G',
       'FG_per_36m', 'FGA_per_36m', '3P_per_36m', '3PA_per_36m',
       '2P_per_36m', '2PA_per_36m', 'FT_per_36m', 'FTA_per_36m',
       'ORB_per_36m', 'DRB_per_36m', 'TRB_per_36m', 'AST_per_36m',
       'STL_per_36m', 'BLK_per_36m', 'TOV_per_36m', 'PF_per_36m',
       'PTS_per_36m', 'PER', 'TS_perc', '3PAr', 'FTr', 'ORB_perc',
       'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc', 'BLK_perc',
       'TOV_perc', 'USG_perc', 'OWS', 'DWS', 'WS', 'WS_per_48', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'MP', 'ORtg', 'DRtg'], dtype=object)

In [15]:
# Scale data for dimensionality reduction
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

## Search with KDTree

In [16]:
# Use KDTree to find nearest neighbors
tree = KDTree(x_scaled, leaf_size=40, metric='euclidean')
dist, ind = tree.query([x_scaled[5632]], k = 30)
print(ind)
print(dist)

[[5632 5915 5553 5460 4846 5182 6645 5121 8491 4869 5129 5730 1712 6404
  9018 8464 3055 7758 6774 7804 9000 6211 9084 8580 8773 8887 5056 6864
  6478 8199]]
[[ 0.          2.4217271   3.17640459  3.29161682  3.47161345  3.47753246
   3.56638524  3.57915223  3.6026154   3.64279319  3.65137503  3.67174047
   3.69002173  3.75228337  3.76896113  3.78835012  3.84860257  3.88219377
   3.91786142  3.91816122  3.92752207  3.94632954  4.00130952  4.02105461
   4.10328949  4.18792061  4.19275674  4.24449254  4.25133234  4.25596283]]


In [17]:
ind

array([[5632, 5915, 5553, 5460, 4846, 5182, 6645, 5121, 8491, 4869, 5129,
        5730, 1712, 6404, 9018, 8464, 3055, 7758, 6774, 7804, 9000, 6211,
        9084, 8580, 8773, 8887, 5056, 6864, 6478, 8199]])

In [18]:
subset_recent_players = ind[0] > 7745

In [19]:
recent_players = ind[0][subset_recent_players]

In [20]:
recent_players

array([8491, 9018, 8464, 7758, 7804, 9000, 9084, 8580, 8773, 8887, 8199])

In [21]:
keep_players = recent_players[:3]

In [22]:
keep_players

array([8491, 9018, 8464])

In [23]:
data[['Season', 'Player']].iloc[7758]

Season          2010-11
Player    D.J. Augustin
Name: 7758, dtype: object

In [24]:
for i in keep_players:
    print(i)

8491
9018
8464


In [25]:
data.iloc[7758]['Player']

'D.J. Augustin'

* Sort numpy array in reverse order then take the first 3 entries for indices of 3 most similar players
* Also remember to take > 7745 for > 2010-11 season

In [26]:
# Queries KDTree for an array of 30 similar players 
def kdtree_search(input_data, player_index):
    tree = KDTree(input_data, leaf_size=40, metric='euclidean')
    dist, ind = tree.query([input_data[player_index]], k = 30)
    return ind # Return array of 30 similar players

In [27]:
# Get 3 similar players in array of indices
def get_three_players(ind):
    # Gets 3 recent, similar players in order of shortest distance
    subset_recent_players = ind[0] >= 7745 # Get players after 2010-11 season
    recent_players = ind[0][subset_recent_players]
    # Get first three players in array
    return recent_players[:3]

In [28]:
def get_player_indices(df, target_player, target_season):
    return df.index[(df['Player'] == target_player) & (df['Season'] == season)][0]

In [29]:
def similar_players(df, array):
    data = []
    for i in array:
        player_name = df['Player'].iloc[i]
        player_season = df['Season'].iloc[i]
        data.append(tuple((player_name, player_season)))
    # Return list of tuples including player name and season
    return data

In [30]:
def search_for_similar_players(df, target_player, target_season, storage_dict):
    idx = get_player_indices(df, target_player, target_season)
    array = kdtree_search(x_scaled, idx)
    similar_player_indices = get_three_players(array)
    # Store to storage_dict
    storage_dict[target_player] = similar_players(df, similar_player_indices)

## Gathering Similar Players for Two Championship Teams

In [31]:
"""
1. Create roster list of 2003 Spurs minus Tim Duncan.
2. Create another roster list of the 2011 Mavericks minus Dirk.
3. These are the major contributing players for the majority of the playoffs. 
"""

spurs_roster_list = ['Tony Parker', 'David Robinson', 'Stephen Jackson', 'Manu Ginobili', \
              'Malik Rose', 'Speedy Claxton', 'Bruce Bowen']

mavs_roster_list = ['Jason Terry', 'Shawn Marion', 'Tyson Chandler', 'J.J. Barea', \
                   'Jason Kidd', 'DeShawn Stevenson', 'Peja Stojakovic']

spurs_season = '2002-03'
mavs_season = '2010-11'

# Create an empty dictionary with roster list
similar_players_spurs = dict((el,None) for el in spurs_roster_list)
similar_players_mavs = dict((el, None) for el in mavs_roster_list)