In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [2]:
# Load the NBA match table (the version with redundant features already removed).
df_NBA_slim = pd.read_csv(filepath_or_buffer='NBA_numeric_slim.csv')
# Preview the first five rows as a quick sanity check on the load.
df_NBA_slim.head(n=5)

Unnamed: 0,season,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,...,away_orb_percent,away_opp_e_fg_percent,away_opp_tov_percent,away_opp_drb_percent,away_opp_ft_fga,away_attend,away_attend_g,home_score,away_score,spread
0,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0,120,117,3
1,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0,90,93,-3
2,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0,105,114,-9
3,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,28.4,0.482,15.1,71.7,0.247,723949.0,17657.0,110,107,3
4,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,31.2,0.461,13.5,73.0,0.207,905116.0,22076.0,68,95,-27


In [3]:
# First pass: cluster individual matches.
# Remove the season label and the three target columns by NAME rather than by
# position, so the selection cannot silently shift if the CSV's column order
# changes (a missing column raises a KeyError instead).
df_data = df_NBA_slim.drop(columns=['season', 'home_score', 'away_score', 'spread'])
df_data.head()

Unnamed: 0,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,tov_percent,...,away_x3p_ar,away_ts_percent,away_tov_percent,away_orb_percent,away_opp_e_fg_percent,away_opp_tov_percent,away_opp_drb_percent,away_opp_ft_fga,away_attend,away_attend_g
0,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,...,0.192,0.551,14.7,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0
1,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,...,0.128,0.497,13.3,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0
2,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,...,0.217,0.517,15.5,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0
3,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,...,0.142,0.535,13.8,28.4,0.482,15.1,71.7,0.247,723949.0,17657.0
4,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,...,0.164,0.519,13.3,31.2,0.461,13.5,73.0,0.207,905116.0,22076.0


In [4]:
# Standardize the match features (z-scores): learn the per-column statistics,
# then apply them to the same frame.
ss = StandardScaler()
df_data_ss = ss.fit(df_data).transform(df_data)

In [14]:
%%time
#Kmeans cluster the data and calculate silhouette score

for k in range(2, 10):
    km = KMeans(n_clusters=k, n_init='auto')
    km_matches = km.fit(df_data_ss)
    print(f"silhouette_score for {k} clusters =", silhouette_score(df_data_ss, km_matches.labels_))
    
    """silhouette_score for 2 clusters = 0.17104622453900786
silhouette_score for 3 clusters = 0.1229696958970217
silhouette_score for 4 clusters = 0.11447275418101997
silhouette_score for 5 clusters = 0.10059620469656401
silhouette_score for 6 clusters = 0.07884393907442486
silhouette_score for 7 clusters = 0.08429961550744153
silhouette_score for 8 clusters = 0.07473092505116341
silhouette_score for 9 clusters = 0.0708341087260425"""
    
#so, that's pretty awful.  KMeans is not good at data that isn't spherical or that varies in density.
#need to play with this more

silhouette_score for 2 clusters = 0.17104622453900786
silhouette_score for 3 clusters = 0.1229696958970217
silhouette_score for 4 clusters = 0.11447275418101997
silhouette_score for 5 clusters = 0.10059620469656401
silhouette_score for 6 clusters = 0.07884393907442486
silhouette_score for 7 clusters = 0.08429961550744153
silhouette_score for 8 clusters = 0.07473092505116341
silhouette_score for 9 clusters = 0.0708341087260425
CPU times: total: 32.1 s
Wall time: 17.8 s


In [6]:
# Next idea: cluster teams on their end-of-season statistics instead of clustering matches.
# That means dropping the away-team columns and the scores, and then removing the repeated
# team rows (each team appears once for every other team in order to form the matches).

col_names = df_NBA_slim.columns
col_names

Index(['season', 'age', 'w', 'sos', 'o_rtg', 'd_rtg', 'pace', 'f_tr', 'x3p_ar',
       'ts_percent', 'tov_percent', 'orb_percent', 'opp_e_fg_percent',
       'opp_tov_percent', 'opp_drb_percent', 'opp_ft_fga', 'attend',
       'attend_g', 'away_age', 'away_w', 'away_sos', 'away_o_rtg',
       'away_d_rtg', 'away_pace', 'away_f_tr', 'away_x3p_ar',
       'away_ts_percent', 'away_tov_percent', 'away_orb_percent',
       'away_opp_e_fg_percent', 'away_opp_tov_percent', 'away_opp_drb_percent',
       'away_opp_ft_fga', 'away_attend', 'away_attend_g', 'home_score',
       'away_score', 'spread'],
      dtype='object')

In [8]:
# Keep only the season label and the home team's statistics: discard every
# away-team column together with the score/spread columns.
just_home = df_NBA_slim.drop(
    [
        'away_age',
        'away_w',
        'away_sos',
        'away_o_rtg',
        'away_d_rtg',
        'away_pace',
        'away_f_tr',
        'away_x3p_ar',
        'away_ts_percent',
        'away_tov_percent',
        'away_orb_percent',
        'away_opp_e_fg_percent',
        'away_opp_tov_percent',
        'away_opp_drb_percent',
        'away_opp_ft_fga',
        'away_attend',
        'away_attend_g',
        'home_score',
        'away_score',
        'spread',
    ],
    axis='columns',
)
just_home

Unnamed: 0,season,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,tov_percent,orb_percent,opp_e_fg_percent,opp_tov_percent,opp_drb_percent,opp_ft_fga,attend,attend_g
0,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
1,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
2,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
3,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
4,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11481,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0
11482,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0
11483,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0
11484,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0


In [9]:
# The home-team rows repeat once per match; keep only the first occurrence of
# each distinct row (original index labels are preserved).
just_home_no_dup = just_home.drop_duplicates(keep='first')
just_home_no_dup

Unnamed: 0,season,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,tov_percent,orb_percent,opp_e_fg_percent,opp_tov_percent,opp_drb_percent,opp_ft_fga,attend,attend_g
0,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
24,2005,27.1,45.0,-0.52,107.5,106.6,93.3,0.357,0.192,0.551,14.7,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0
50,2005,24.9,18.0,-0.16,101.1,107.5,92.3,0.306,0.128,0.497,13.3,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0
75,2005,25.0,47.0,-0.41,101.4,100.3,92.4,0.311,0.217,0.517,15.5,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0
100,2005,26.5,42.0,-0.52,106.6,105.7,89.7,0.325,0.135,0.518,13.0,32.5,0.485,14.1,71.5,0.248,784249.0,19128.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11411,2020,27.0,31.0,0.46,110.2,112.2,98.9,0.230,0.395,0.566,13.0,21.9,0.543,13.6,78.4,0.225,520663.0,16796.0
11428,2020,27.6,32.0,0.46,112.4,113.5,100.5,0.262,0.318,0.572,11.2,20.1,0.542,11.7,79.2,0.193,550515.0,18351.0
11440,2020,26.6,53.0,-0.26,111.1,105.0,100.9,0.264,0.421,0.574,13.1,21.3,0.502,14.6,76.7,0.202,633456.0,19796.0
11454,2020,27.3,44.0,0.05,112.3,109.9,98.6,0.268,0.414,0.585,13.7,21.6,0.518,11.1,78.9,0.185,567486.0,18306.0


In [10]:
# Remove the season column so only the team statistics remain as features.
just_home_no_dup1 = just_home_no_dup.drop('season', axis=1)
just_home_no_dup1

Unnamed: 0,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,tov_percent,orb_percent,opp_e_fg_percent,opp_tov_percent,opp_drb_percent,opp_ft_fga,attend,attend_g
0,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0
24,27.1,45.0,-0.52,107.5,106.6,93.3,0.357,0.192,0.551,14.7,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0
50,24.9,18.0,-0.16,101.1,107.5,92.3,0.306,0.128,0.497,13.3,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0
75,25.0,47.0,-0.41,101.4,100.3,92.4,0.311,0.217,0.517,15.5,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0
100,26.5,42.0,-0.52,106.6,105.7,89.7,0.325,0.135,0.518,13.0,32.5,0.485,14.1,71.5,0.248,784249.0,19128.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11411,27.0,31.0,0.46,110.2,112.2,98.9,0.230,0.395,0.566,13.0,21.9,0.543,13.6,78.4,0.225,520663.0,16796.0
11428,27.6,32.0,0.46,112.4,113.5,100.5,0.262,0.318,0.572,11.2,20.1,0.542,11.7,79.2,0.193,550515.0,18351.0
11440,26.6,53.0,-0.26,111.1,105.0,100.9,0.264,0.421,0.574,13.1,21.3,0.502,14.6,76.7,0.202,633456.0,19796.0
11454,27.3,44.0,0.05,112.3,109.9,98.6,0.268,0.414,0.585,13.7,21.6,0.518,11.1,78.9,0.185,567486.0,18306.0


In [17]:
# z-score normalize the team-level features.
# Fit the dedicated team scaler `ss_teams` here; calling `ss.fit_transform`
# instead would leave `ss_teams` unused and overwrite the statistics that the
# match-level scaler `ss` learned from the match data.
ss_teams = StandardScaler()
just_home_ss = ss_teams.fit_transform(just_home_no_dup1)
just_home_ss.shape

(480, 17)

In [15]:
%%time
#try different k values for clustering team data

for k in range(2, 10):
    km = KMeans(n_clusters=k, n_init='auto')
    km_matches = km.fit(just_home_ss)
    print(f"silhouette_score for {k} clusters =", silhouette_score(just_home_ss, km_matches.labels_))



silhouette_score for 2 clusters = 0.18210347101148452




silhouette_score for 3 clusters = 0.17529351527676482




silhouette_score for 4 clusters = 0.13509446678288323




silhouette_score for 5 clusters = 0.12191183755646577




silhouette_score for 6 clusters = 0.12282446213776627




silhouette_score for 7 clusters = 0.10934712234546874




silhouette_score for 8 clusters = 0.10786565475338984
silhouette_score for 9 clusters = 0.09945832499601444
CPU times: total: 9.33 s
Wall time: 2.19 s


