In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [66]:
#cleaning data
df = pd.read_csv('2017-18 OHL Skaters.csv')
df = df.drop(columns=['#'])
stats = ['G/GP','A1/GP','A2/GP','P/GP','Sh/GP']

#min games played
df = df[df.GP >= 30]

df.head(5)

Unnamed: 0,Name,Pos,Team,Age,GP,G,A1,A2,P1,P,...,HD Sh%,MD G,MD Sh,MD Sh%,LD G,LD Sh,LD Sh%,FOW,FOT,FOW%
0,Morgan Frost,C,SSM,18.341,67,42,45,25,87,112,...,42.31,8,27,29.63,23,147,15.65,702,1346,52.15
1,Aaron Luchuk,C,WSR/BAR,20.447,68,50,36,29,86,115,...,30.0,11,42,26.19,21,179,11.73,939,1676,56.03
2,Jordan Kyrou,RW,SAR,19.365,56,39,40,30,79,109,...,26.47,6,26,23.08,24,106,22.64,29,59,49.15
3,Nick Suzuki,C,OS,18.1,64,42,34,24,76,100,...,23.08,15,42,35.71,18,200,9.0,354,673,52.6
4,Dmitry Sokolov,RW,BAR/SBY,19.423,64,50,25,21,75,96,...,40.91,15,48,31.25,26,198,13.13,4,10,40.0


In [76]:
# clustering algorithm
k_means = KMeans(init='k-means++', n_clusters=8, n_init=100)
k_means.fit(preprocessing.scale(df[stats]))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=100, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [77]:
# add cluster to df
df['cluster'] = k_means.labels_

# get cluster centroids
centers = k_means.cluster_centers_
center_df = pd.DataFrame(centers, columns=stats)

# get the mean values for each stat
means = []
for col in range(centers.shape[1]):
    means.append(centers[:,col].mean())

# find values relative to mean
rel_values = centers - np.array(means)
rel_df = pd.DataFrame(rel_values, columns=stats)

center_df.head(5)

Unnamed: 0,G/GP,A1/GP,A2/GP,P/GP,Sh/GP
0,0.030001,-0.231522,-0.50746,-0.21263,-0.08075
1,0.431065,1.241776,2.104119,1.244902,0.616004
2,-0.216896,0.286994,0.767983,0.21745,0.147254
3,-0.820505,-0.842203,-0.877296,-0.94547,-0.953056
4,1.605414,1.331977,0.763018,1.467984,1.683585


In [78]:
# print results
print("\nK-Means Clustering of OHL Players:\n")
for i in range(0,len(df.cluster.value_counts())):
    print('Group:', i)
    print('{stat:11}{val:7}{rank}'.format(stat='Stat', val='Value', rank='Rank'))
    for j in range(len(stats)):
        print('{stat:8}: {val:6.2f}  ({rank:.0f}/{of})'.format(stat=stats[j],
                                         rank=rel_df.rank(ascending=False).iloc[i].values[j],
                                         val=rel_df.iloc[i].values[j],
                                         of=len(rel_df)))
    print('\nTypical players:')
    for j in range(5):
        print('-',df[df.cluster==i]['Name'].head().values[j])
    print('--------------------\n')


K-Means Clustering of OHL Players:

Group: 0
Stat       Value  Rank
G/GP    :  -0.42  (5/8)
A1/GP   :  -0.78  (6/8)
A2/GP   :  -1.07  (7/8)
P/GP    :  -0.79  (6/8)
Sh/GP   :  -0.56  (6/8)

Typical players:
- Brady Hinz
- Cedric Ralph
- Sean Josling
- Franco Sproviero
- Matthew Philip
--------------------

Group: 1
Stat       Value  Rank
G/GP    :  -0.02  (4/8)
A1/GP   :   0.69  (3/8)
A2/GP   :   1.54  (1/8)
P/GP    :   0.67  (3/8)
Sh/GP   :   0.14  (4/8)

Typical players:
- Lucas Chiodo
- Akil Thomas
- Ryan McLeod
- Ryan Merkley
- Will Bitten
--------------------

Group: 2
Stat       Value  Rank
G/GP    :  -0.67  (6/8)
A1/GP   :  -0.26  (5/8)
A2/GP   :   0.20  (3/8)
P/GP    :  -0.36  (5/8)
Sh/GP   :  -0.33  (5/8)

Typical players:
- Jaden Peca
- Ryan Suzuki
- Semyon Der-Arguchintsev
- Damien Giroux
- Troy Lajeunesse
--------------------

Group: 3
Stat       Value  Rank
G/GP    :  -1.27  (8/8)
A1/GP   :  -1.39  (8/8)
A2/GP   :  -1.44  (8/8)
P/GP    :  -1.52  (8/8)
Sh/GP   :  -1.43  (8/

Next Steps:
1. figure out how to sort by distance from centroid and list those players, right now its just random
2. plug in better data, right now it's just point production related
3. make two models for forwards vs defencement
4. Make model using NHL data for CUC