In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [10]:
#cleaning data
df = pd.read_csv('NHL 2017-2018.csv')
df.head(5)
df = df.drop(columns=['Unnamed: 0'])

#min games played
df = df[df.GP >= 40]

df.head()

Unnamed: 0,Player,Team,Position,GP,TOI,Goals,Total Assists,First Assists,Second Assists,Total Points,...,Misconduct,Penalties Drawn,Giveaways,Takeaways,Hits,Hits Taken,Shots Blocked,Faceoffs Won,Faceoffs Lost,Faceoffs %
0,Connor McDavid,EDM,C,82,1766.8,41,67,39,28,108,...,0,36,67,111,28,118,46,376,533,41.36
1,Claude Giroux,PHI,C,82,1669.8,34,68,33,35,102,...,0,26,44,34,30,54,23,638,450,58.64
2,Nikita Kucherov,T.B,R,80,1585.716667,39,61,33,28,100,...,0,36,79,66,31,92,15,3,2,60.0
3,Evgeni Malkin,PIT,C,78,1481.216667,42,56,32,24,98,...,1,30,73,75,48,98,32,460,592,43.73
4,Nathan MacKinnon,COL,C,74,1472.983333,39,58,38,20,97,...,1,24,41,36,38,74,22,485,672,41.92


In [11]:
#new dataframe
avg_df = df[['Player','Team','Position','GP','Goals','Total Assists','Shots','Rush Attempts',
             'Rebounds Created','Penalties Drawn','Takeaways','Hits','Shots Blocked',]].copy()

#renaming
avg_df.rename(columns={'Total Assists':'Assists'}, inplace=True)
avg_df.rename(columns={'Rush Attempts':'Rush'}, inplace=True)
avg_df.rename(columns={'Rebounds Created':'Rebounds'}, inplace=True)
avg_df.rename(columns={'Penalties Drawn':'Drawn'}, inplace=True)
avg_df.rename(columns={'Shots Blocked':'Blocks'}, inplace=True)

#stats you want to cluster by
stats = ['Goals','Assists','Shots','Rush','Rebounds','Drawn','Takeaways','Hits','Blocks']

avg_df.head()

Unnamed: 0,Player,Team,Position,GP,Goals,Assists,Shots,Rush,Rebounds,Drawn,Takeaways,Hits,Blocks
0,Connor McDavid,EDM,C,82,41,67,275,18,34,36,111,28,46
1,Claude Giroux,PHI,C,82,34,68,193,8,24,26,34,30,23
2,Nikita Kucherov,T.B,R,80,39,61,279,15,15,36,66,31,15
3,Evgeni Malkin,PIT,C,78,42,56,239,5,30,30,75,48,32
4,Nathan MacKinnon,COL,C,74,39,58,284,22,35,24,36,38,22


In [12]:
#averaging stats
for stat in stats:
    avg_df[str(stat)+'/GP'] = avg_df[stat]/avg_df['GP']
    
avg_df.head()

Unnamed: 0,Player,Team,Position,GP,Goals,Assists,Shots,Rush,Rebounds,Drawn,...,Blocks,Goals/GP,Assists/GP,Shots/GP,Rush/GP,Rebounds/GP,Drawn/GP,Takeaways/GP,Hits/GP,Blocks/GP
0,Connor McDavid,EDM,C,82,41,67,275,18,34,36,...,46,0.5,0.817073,3.353659,0.219512,0.414634,0.439024,1.353659,0.341463,0.560976
1,Claude Giroux,PHI,C,82,34,68,193,8,24,26,...,23,0.414634,0.829268,2.353659,0.097561,0.292683,0.317073,0.414634,0.365854,0.280488
2,Nikita Kucherov,T.B,R,80,39,61,279,15,15,36,...,15,0.4875,0.7625,3.4875,0.1875,0.1875,0.45,0.825,0.3875,0.1875
3,Evgeni Malkin,PIT,C,78,42,56,239,5,30,30,...,32,0.538462,0.717949,3.064103,0.064103,0.384615,0.384615,0.961538,0.615385,0.410256
4,Nathan MacKinnon,COL,C,74,39,58,284,22,35,24,...,22,0.527027,0.783784,3.837838,0.297297,0.472973,0.324324,0.486486,0.513514,0.297297


In [13]:
# clustering algorithm
k_means = KMeans(init='k-means++', n_clusters=6, n_init=100)
k_means.fit(preprocessing.scale(avg_df[stats]))

  This is separate from the ipykernel package so we can avoid doing imports until


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=100, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [14]:
# add cluster to df
avg_df['cluster'] = k_means.labels_

# get cluster centroids
centers = k_means.cluster_centers_
center_df = pd.DataFrame(centers, columns=stats)

# get the mean values for each stat
means = []
for col in range(centers.shape[1]):
    means.append(centers[:,col].mean())

# find values relative to mean
rel_values = centers - np.array(means)
rel_df = pd.DataFrame(rel_values, columns=stats)

center_df.head(5)

Unnamed: 0,Goals,Assists,Shots,Rush,Rebounds,Drawn,Takeaways,Hits,Blocks
0,-0.66155,-0.647561,-0.516727,0.004744,-0.519213,0.051422,-0.403534,1.38321,0.838584
1,-0.2595,-0.316153,-0.248711,-0.093505,-0.312337,-0.44563,-0.156649,-0.532345,0.031104
2,0.79096,0.766893,0.789545,0.407177,0.608561,-0.121118,0.802984,-0.46143,0.219383
3,-0.97385,-0.866913,-1.132542,-0.973156,-0.914731,-0.694955,-1.002354,-0.082819,-0.871255
4,0.318177,0.200457,0.404152,0.17958,0.537823,1.283957,0.298463,0.651048,0.110571


In [16]:
# print results
print("\nK-Means Clustering of NHL Players:\n")
for i in range(0,len(avg_df.cluster.value_counts())):
    print('Group:', i)
    print('{stat:13}{val:7}{rank}'.format(stat='Stat/GP', val='Value', rank='Rank'))
    for j in range(len(stats)):
        print('{stat:10}: {val:6.2f}  ({rank:.0f}/{of})'.format(stat=stats[j],
                                         rank=rel_df.rank(ascending=False).iloc[i].values[j],
                                         val=rel_df.iloc[i].values[j],
                                         of=len(rel_df)))
    print('\nTypical players:')
    for j in range(5):
        print('-',avg_df[avg_df.cluster==i]['Player'].head().values[j])
    print('--------------------\n')


K-Means Clustering of NHL Players:

Group: 0
Stat/GP      Value  Rank
Goals     :  -0.82  (5/6)
Assists   :  -0.80  (5/6)
Shots     :  -0.69  (5/6)
Rush      :  -0.13  (4/6)
Rebounds  :  -0.69  (5/6)
Drawn     :  -0.16  (3/6)
Takeaways :  -0.53  (5/6)
Hits      :   1.31  (1/6)
Blocks    :   0.78  (1/6)

Typical players:
- Charlie Coyle
- Justin Abdelkader
- Milan Lucic
- Bobby Ryan
- Nick Foligno
--------------------

Group: 1
Stat/GP      Value  Rank
Goals     :  -0.42  (4/6)
Assists   :  -0.47  (4/6)
Shots     :  -0.42  (4/6)
Rush      :  -0.23  (5/6)
Rebounds  :  -0.48  (4/6)
Drawn     :  -0.65  (5/6)
Takeaways :  -0.28  (4/6)
Hits      :  -0.61  (6/6)
Blocks    :  -0.03  (5/6)

Typical players:
- Patrick Marleau
- Alex Killorn
- Danton Heinen
- David Krejci
- Bo Horvat
--------------------

Group: 2
Stat/GP      Value  Rank
Goals     :   0.63  (2/6)
Assists   :   0.61  (2/6)
Shots     :   0.62  (2/6)
Rush      :   0.27  (2/6)
Rebounds  :   0.44  (2/6)
Drawn     :  -0.33  (4/6)
Tak