In [22]:
# WHAT DO WE WANT FROM THIS NOTEBOOK:
# - See distributions for different metrics
# - See averages and deviations for different metrics
# - See distributions for different team subsets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Get NBA data
playerData = pd.read_csv('NBAPlayerData/Players.csv')
playerStats = pd.read_csv('NBAPlayerData/Seasons_Stats.csv')

playerStats.rename(columns={'G':"gamesPlayed"}, inplace=True)

# Get the players that play since 2000
playerStats = playerStats[playerStats['Year']>=2000]
# Only look at players who have played more than 20 games
playerStats = playerStats[playerStats["gamesPlayed"]>30]
# Only look at players who have played more than minutes
playerStats = playerStats[playerStats["MP"]>300]
playerData = playerData[playerData['Player'].isin(playerStats['Player'])]

# Merge the datasets
playerStats = playerStats.merge(playerData, on="Player")

# Make some of the columns we need
playerStats['PPG'] = playerStats['PTS']/playerStats['gamesPlayed']
playerStats['PPM'] = playerStats['PTS']/playerStats['MP']
playerStats['AST_per_TOV'] = [playerStats['AST'].iloc[i]/playerStats['TOV'].iloc[i] if playerStats['TOV'].iloc[i]!=0 else np.nan for i in range(len(playerStats))]
playerStats['MP_per_PF'] = [playerStats["PF"].iloc[i]/playerStats["MP"].iloc[i] if playerStats['MP'].iloc[i]!=0 else np.nan for i in range(len(playerStats))]
playerStats['FTA_per_FGA'] = [playerStats["FTA"].iloc[i]/playerStats["FGA"].iloc[i] if playerStats['FGA'].iloc[i]!=0 else np.nan for i in range(len(playerStats))]
playerStats['MP_per_3PA'] = [playerStats["MP"].iloc[i]/playerStats["3PA"].iloc[i] if playerStats['3PA'].iloc[i]!=0 else np.nan for i in range(len(playerStats))]
playerStats['PTS_per_FGA'] = [playerStats["PTS"].iloc[i]/playerStats["FGA"].iloc[i] if playerStats['FGA'].iloc[i]!=0 else np.nan for i in range(len(playerStats))]

# Rename some columns
playerStats.rename(columns={"weight":"WEIGHT", "height":"HEIGHT"}, inplace=True)

# Fix some team names
playerStats.replace("NJN", "BRK", inplace=True)
playerStats.replace("NOH", "NOP", inplace=True)
playerStats.replace("CHA", "CHO", inplace=True)
playerStats.replace("CHH", "CHO", inplace=True)
playerStats.replace("VAN", "MEM", inplace=True)
playerStats.replace("SEA", "OKC", inplace=True)
playerStats.replace("NOK", "NOP", inplace=True)

# Drop players that got traded that season
playerStats = playerStats[playerStats.Tm!='TOT']

# Create the position columns
playerStats["G"] = [1 if "G" in playerStats['Pos'].iloc[i] else 0 for i in range(len(playerStats))]
playerStats["F"] = [1 if "F" in playerStats['Pos'].iloc[i] else 0 for i in range(len(playerStats))]
playerStats["C"] = [1 if "C" in playerStats['Pos'].iloc[i] else 0 for i in range(len(playerStats))]

# Fill in missing values
playerStats["3P%"] = playerStats['3P%'].fillna(0)
playerStats["FT%"] = playerStats['FT%'].fillna(0)
playerStats["MP_per_3PA"] = playerStats['MP_per_3PA'].fillna(np.mean(playerStats['MP_per_3PA']))
impute_mean_cols = ['AST_per_TOV', 'MP_per_PF', 'FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA']
# for col in impute_mean_cols:
#     playerStats[col] = playerStats.fillna(np.mean(playerStats.loc[playerStats[col].isnull()==False]))[col]


# GET THE DRAFT DATA
data = pd.read_pickle("../Data/final_data1.df")
data.replace("New Jersey Nets", "Brooklyn Nets", inplace=True)
data.replace("New Orleans Hornets", "New Orleans Pelicans", inplace=True)
data.replace("Charlotte Bobcats", "Charlotte Hornets", inplace=True)
data.replace("LA Clippers", "Los Angeles Clippers", inplace=True)
data.replace("NJN", "BRK", inplace=True)
data.replace("NOH", "NOP", inplace=True)
data.replace("CHA", "CHO", inplace=True)
data.replace("CHH", "CHO", inplace=True)
data.replace("VAN", "MEM", inplace=True)
data.replace("SEA", "OKC", inplace=True)
data.replace("NOK", "NOP", inplace=True)

data.rename(columns={"Name":"Player", "Guard":"G", "Center":"C", "Forward":"F", 
                     "awardCount":"awards", "mock1":"m1", "mock2":"m2", "mock3":"m3", "mock4":"m4",
                     "mock5":"m5", "mock6":"m6"}, inplace=True)
data = data.drop_duplicates(subset=["Player"])
data["WM"] = [(data["EWA"].iloc[i]+data["WP"].iloc[i] + data["WS"].iloc[i])/3. for i in range(len(data))]
data.dropna(subset=["WM"], inplace=True)
data.reset_index(drop=True, inplace=True)
draftData = data.copy()

# Make sure all the indexs are legit

In [24]:
# Define the columns we want to cluster on 
simCols = ['FT%', '3P%', 'eFG%', 'ORB%', 'DRB%', 'AST%', 'TOV%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 'FTA',
              '3PA', 'PTS', 'PF', 'MP_per_PF', 'FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA',
              'C', 'F', 'G', 'PPM', 'PPG', 'HEIGHT', 'WEIGHT']

# Standard scale everything
from sklearn.preprocessing import StandardScaler
scaledData = playerStats.copy()
scaler = StandardScaler()
scaledData[simCols] = scaler.fit_transform(scaledData[simCols])

scaledDraft = draftData.copy()
scaler = StandardScaler()
scaledDraft[simCols] = scaler.fit_transform(scaledDraft[simCols])

In [25]:
# NOW WE WILL TRY PCA AND SEE WHAT HAPPENS
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
NBAdec = pca.fit_transform(scaledData[simCols])
pca = PCA(n_components=2)
draftDec = pca.fit_transform(scaledDraft[simCols])

In [58]:
# Now we look at some pairs between draft players and teams
from sklearn.metrics import pairwise
figsize = (6,5)
metrics = ['euclidean', "manhattan", "cosine"]

for metric_name in metrics:

    metric = pairwise.distance_metrics()[metric_name] # This is a function 
    distances = np.zeros(len(NBAdec)*len(draftDec))
    distanceIndex = 0 
    for team in scaledData['Tm'].unique():
        for year in scaledData["Year"].unique():
            teamYear_ids = scaledData[(scaledData["Tm"]==team) & (scaledData["Year"]==year)].index
            if len(teamYear_ids)>1:

                # Get a draft player from the same yaer
                draft_ids = scaledDraft[scaledDraft["Year"]==year].index
                for i in range(len(draft_ids)):
                    draftee = draftDec[draft_ids[i]]
                    teamMate_ids = teamYear_ids # COMPUTE IT OVER EVERYONE ON THE TEAM

                    # Compute the metric over the team subset
                    for teamMate_id in teamMate_ids:
                        teamMate = NBAdec[teamMate_id]
#                         dist = metric(draftee[simCols].to_numpy().reshape(1,-1),
#                                       teamMate[simCols].to_numpy().reshape(1,-1)).item()
                        dist = metric(draftee.reshape(1,-1), teamMate.reshape(1,-1))
                        distances[distanceIndex] = dist
                        distanceIndex += 1
        
        print(metric_name, team)
        
    # Normalize the results
    print("Normalizing ...")
    maxDist = np.max(distance)
    distance = [i/maxDist for i in distance]
    
    # PLOT ALL OF THE STUFF FOR EACH METRIC
    
    # Plot the distance means
    plt.figure(figsize=figsize)
    plt.bar(0, np.mean(distance))
    plt.xlabel("Label")
    plt.ylabel("Averages")
    plt.title("{} Averages".format(metric_name))
    plt.show()

    # Plot the standard devations
    plt.figure(figsize=figsize)
    plt.bar(0, np.std(distances))
    plt.xlabel("Label")
    plt.ylabel("Distance Metric STD")
    plt.title("{} Standard Deviations".format(metric_name))
    plt.show()

    # Plot the distributions
    plt.figure(figsize=figsize)
    sns.distplot(distances, hist=False, kde=True, kde_kws={'shade':True, 'linewidth':3}, label="0")
    plt.grid()
    plt.title("{} Normalized Distributions".format(metric_name))
    plt.show()

IndexError: index 6235 is out of bounds for axis 0 with size 6162

In [57]:
len(draftDec)

660

In [42]:
playerStats.iloc[:2].index.values[0]

array([1, 2])

In [52]:
scaledDraft.loc[634]

Pk                        2
Tm                      MEM
Player        stromileswift
Year                   2000
HEIGHT            0.0172724
                  ...      
didCombine            False
C                 -0.593683
F                 -0.746059
G                  -0.50473
WM                      1.7
Name: 634, Length: 70, dtype: object