In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans 
# Load the skaters CSV into a dataframe; the path is relative to the
# notebook's working directory.
DATA_FILE = '2018-19_OHL_Skaters.csv'
fulldataset = pd.read_csv(DATA_FILE)



Now, we will select which columns of the data we would like to keep for the model.

We will select Sh%, A1/GP, xG/GP and HD Sh (shooting percentage, primary assists per game, expected goals per game and high-danger shots).

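But first, we will remove players who played 30 or fewer games (only players with more than 30 games played are kept).
The original indexes are kept in `df_over30gpoldindex` so that we can find the players' names using the fulldataset dataframe; the working dataframe is re-indexed from 0.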

In [None]:
# Keep only rows with more than 30 games played. This frame still carries the
# row labels of fulldataset.
played_over_30 = fulldataset['GP'] > 30
df_over30gpoldindex = fulldataset.loc[played_over_30]

# Same rows, renumbered 0..n-1 (the old labels are discarded).
df_over30gp = df_over30gpoldindex.reset_index(drop=True)

# Two identifying columns followed by the four model features.
selected_columns = ['Name', 'Team', 'Sh%', 'A1/GP', 'xG/GP', 'HD Sh']
df_withname = df_over30gp.loc[:, selected_columns]

# Feature-only view for clustering: everything after 'Name' and 'Team'.
df = df_withname.iloc[:, 2:]
df_withname

We will now use the 'Elbow Method' to find the ideal number (k) of clusters. This is done by calculating the WCSS (within-cluster
sum of squares) for k from 1 to 10. The ideal k is the first value for which there is not a significant decrease in WCSS when moving to
k+1. This is because at this k, the data within each cluster is "tight" but we are not distinguishing between clusters too much (i.e., over-splitting the data).

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum
# of squares (exposed by scikit-learn as `inertia_`) for each k.
k_values = range(1, 11)
wcss = []
for i in k_values:
    # random_state is pinned (as in the later k=3 / k=4 fits) so the curve is
    # identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(df)
    wcss.append(kmeans.inertia_)

# NOTE(review): the features are clustered unscaled; K-Means is distance-based,
# so columns with larger numeric ranges will dominate — confirm this is intended.
plt.plot(k_values, wcss)
plt.title('Elbow method: WCSS vs. number of clusters')
plt.xlabel('Number of clusters')  # was mistakenly set as the plot title
plt.ylabel('WCSS')
plt.show()


Based on the above graph, it is apparent that either 3 or 4 clusters should be created. Let's first try with three.

In [None]:
# Fit K-Means with k = 3 on the feature frame (seeded, so labels are
# reproducible) and keep the cluster label assigned to each row.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_kmeans3 = kmeans.fit_predict(df)



In [None]:
# Attach the k=3 cluster labels to the player rows.
# pd.concat(axis=1) aligns on index, so the label frame is built on df's own
# index instead of relying on both sides happening to share a default
# RangeIndex (a mismatch would silently yield NaN / misaligned rows).
y_kmeans3df = pd.DataFrame(y_kmeans3, index=df.index)
cluster3df = pd.concat(
    [df_withname.iloc[:, 0:2], df, y_kmeans3df],
    axis=1,
).rename(columns={0: 'Cluster'})  # only the unnamed label column needs a name
cluster3df

# update the indexes for the df being used, and in the fulldataset with games > 30

Now, let's try again with k = 4.

In [None]:
# Fit K-Means with k = 4 on the feature frame (seeded, so labels are
# reproducible) and keep the cluster label assigned to each row.
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans4 = kmeans.fit_predict(df)

# pd.concat(axis=1) aligns on index, so the label frame is built on df's own
# index instead of relying on both sides happening to share a default
# RangeIndex (a mismatch would silently yield NaN / misaligned rows).
y_kmeans4df = pd.DataFrame(y_kmeans4, index=df.index)
cluster4df = pd.concat(
    [df_withname.iloc[:, 0:2], df, y_kmeans4df],
    axis=1,
).rename(columns={0: 'Cluster'})  # only the unnamed label column needs a name
cluster4df




Now, let's make a dataframe with the cluster assignments (from the k = 4 model) of the Ottawa 67's players.

In [None]:
# Rows of the k=4 result whose Team code is 'OTT'.
ott67s = cluster4df.loc[cluster4df['Team'].eq('OTT')]
ott67s