In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [None]:
# Load the CSV and discard its 'Unnamed: 0' column in one step.
df = pd.read_csv('NHL 2017-2018.csv').drop(columns=['Unnamed: 0'])

# Minimum games played: keep only rows with GP of at least 40.
df = df.loc[df['GP'] >= 40]

df.head()

In [None]:
# Build an independent working frame: identifying columns, games played,
# and the counting stats, with the longer column names shortened.
avg_df = (
    df[[
        'Player', 'Team', 'Position', 'GP',
        'Goals', 'Total Assists', 'Shots', 'Rebounds Created',
        'Penalties Drawn', 'Takeaways', 'Hits', 'Shots Blocked',
    ]]
    .copy()
    .rename(columns={
        'Total Assists': 'Assists',
        'Rebounds Created': 'Rebounds',
        'Penalties Drawn': 'Drawn',
        'Shots Blocked': 'Blocks',
    })
)

# Stats you want to cluster by (post-rename column names).
stats = [
    'Goals', 'Assists', 'Shots', 'Rebounds',
    'Drawn', 'Takeaways', 'Hits', 'Blocks',
]

avg_df.head()

In [None]:
# Add one '<stat>/GP' column per entry in `stats`: the stat divided by
# games played. `assign` overwrites same-named columns, so re-running
# this cell is safe.
avg_df = avg_df.assign(
    **{str(name) + '/GP': avg_df[name] / avg_df['GP'] for name in stats}
)

avg_df.head()

In [None]:
# clustering algorithm
# Cluster on the per-game ('<stat>/GP') columns rather than the raw
# totals, so a player's games-played count does not drive the grouping.
# The columns are standardized first so every stat has equal weight in
# the Euclidean distance used by k-means.
per_game_cols = [stat + '/GP' for stat in stats]

# random_state pins the initialisation so cluster numbering (and the
# report/plot built from it) is reproducible across kernel restarts.
k_means = KMeans(init='k-means++', n_clusters=6, n_init=100, random_state=0)
k_means.fit(preprocessing.scale(avg_df[per_game_cols]))

In [None]:
# Centroids of the fitted model, one row per cluster and one column per stat.
centers = k_means.cluster_centers_
center_df = pd.DataFrame(centers, columns=stats)

# Tag every player row with the cluster it was assigned to.
avg_df['cluster'] = k_means.labels_

# Mean of each stat taken across the centroids (one value per column).
means = [centers[:, idx].mean() for idx in range(centers.shape[1])]

# Express each centroid as an offset from that across-centroid mean.
rel_values = centers - np.array(means)
rel_df = pd.DataFrame(rel_values, columns=stats)

In [None]:
# print results
# For each cluster: print every stat's centroid offset (from rel_df) with
# its rank among the clusters, followed by up to five example players.
print("\nK-Means Clustering of NHL Players:\n")

n_groups = len(rel_df)
# Rank once up front (1 = highest offset for that stat); the ranks do not
# change between groups, so there is no need to recompute them per line.
stat_ranks = rel_df.rank(ascending=False)

for i in range(n_groups):
    print('Group:', i)
    print('{stat:13}{val:7}{rank}'.format(stat='Stat/GP', val='Value', rank='Rank'))
    for j, stat_name in enumerate(stats):
        print('{stat:10}: {val:6.2f}  ({rank:.0f}/{of})'.format(stat=stat_name,
                                         rank=stat_ranks.iloc[i, j],
                                         val=rel_df.iloc[i, j],
                                         of=n_groups))
    print('\nTypical players:')
    # Iterate over whatever head(5) returns instead of indexing 0..4, so a
    # cluster with fewer than five members no longer raises IndexError.
    for player in avg_df.loc[avg_df.cluster == i, 'Player'].head(5):
        print('-', player)
    print('--------------------\n')

In [None]:
#dimensional reduction and scatterplot
# Project the players onto two principal components and colour each point
# by its k-means label.
#
# PCA is variance-driven, so the per-game columns are standardized first;
# on unscaled values the stats with the largest numeric range would
# dominate both components.
pca_input = preprocessing.scale(avg_df[[stat + '/GP' for stat in stats]])

pca = PCA(n_components=2)
cluster_data_2d = pd.DataFrame(pca.fit_transform(pca_input), columns=['PCA 1', 'PCA 2'])
explained_variance = pca.explained_variance_ratio_

# Labels are attached positionally; both objects have one row per player
# in avg_df order.
y_kmeans = pd.DataFrame(k_means.labels_)
cluster_data_2d['Cluster'] = k_means.labels_

# One scatter call per cluster label actually present, instead of six
# hard-coded copies; colours cycle if there are more clusters than entries.
cluster_colors = ['red', 'green', 'blue', 'yellow', 'purple', 'orange']
for label, members in cluster_data_2d.groupby('Cluster'):
    plt.scatter(members['PCA 1'], members['PCA 2'], s=10,
                c=cluster_colors[label % len(cluster_colors)],
                label='Cluster {}'.format(label))

plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('NHL Player Style Clusters')
plt.legend()
plt.show()