In [74]:
# importing necessary libraries.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot  as plt

In [75]:
# Read the ODI batting records from disk and preview the first five rows.
batsman_odi = pd.read_csv(filepath_or_buffer='batsman data odi.csv')
batsman_odi.head(n=5)

Unnamed: 0,Player Name,Country,Time Period,Matches,Played,Not Outs,Runs,Highest Score,Batting Average,Balls Faced,Strike Rate,Centuries,Fifties,Ducks,Fours,Sixes
0,ST Jayasuriya,Sri Lanka,1990-2008,25,24,1,1220,130,53.04,1190,102.52,6,3,1,139,23
1,KC Sangakkara,Sri Lanka,2004-2014,24,23,1,1075,121,48.86,1272,84.51,4,8,2,107,7
2,SR Tendulkar,India,1990-2012,23,21,2,971,114,51.1,1136,85.47,2,7,0,108,8
3,Shoaib Malik,Pakistan,2000-2018,17,15,3,786,143,65.5,867,90.65,3,3,0,76,8
4,RG Sharma,India,2008-2018,22,21,5,745,111,46.56,877,84.94,1,6,1,60,17


In [76]:
# Columns carried forward: the player's name plus four batting statistics.
features = ['Player Name', 'Batting Average', 'Strike Rate', 'Runs', 'Not Outs']
# Keep only those columns and discard every row with a missing value in any of them.
batsman_odi = batsman_odi[features].dropna().copy()
batsman_odi

Unnamed: 0,Player Name,Batting Average,Strike Rate,Runs,Not Outs
0,ST Jayasuriya,53.04,102.52,1220,1
1,KC Sangakkara,48.86,84.51,1075,1
2,SR Tendulkar,51.1,85.47,971,2
3,Shoaib Malik,65.5,90.65,786,3
4,RG Sharma,46.56,84.94,745,5
5,A Ranatunga,57.0,82.79,741,6
6,Mushfiqur Rahim,36.78,84.52,699,2
7,DPMD Jayawardene,29.3,87.64,674,3
8,MS Dhoni,64.8,87.68,648,6
9,PA de Silva,32.25,83.44,645,2


In [77]:
# Standardise the numeric features so that no single statistic dominates the
# distance computation used by k-means.
features = ['Batting Average','Strike Rate','Runs', 'Not Outs']
scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(batsman_odi[features]),
    columns=features,
    # Reuse the source frame's row labels. fit_transform returns a bare array,
    # and a default 0..n-1 index would stop matching batsman_odi whenever
    # dropna() removed rows, breaking any later index-aligned join
    # (e.g. inserting the 'Player Name' column).
    index=batsman_odi.index,
)
scaled_data

Unnamed: 0,Batting Average,Strike Rate,Runs,Not Outs
0,0.094416,1.216001,3.428727,-0.588817
1,-0.036744,0.053707,2.750151,-0.588817
2,0.033543,0.115661,2.263448,-0.053529
3,0.485385,0.449958,1.397679,0.481759
4,-0.108913,0.081457,1.205806,1.552334
5,0.218673,-0.057295,1.187087,2.087622
6,-0.415789,0.054352,0.990534,-0.053529
7,-0.650496,0.255705,0.873538,0.481759
8,0.463421,0.258286,0.751862,2.087622
9,-0.557931,-0.015347,0.737823,-0.053529


In [78]:
def elbow_plot(min_k, max_k, k_max_iter, data=None, random_state=None):
    """Plot k-means inertia against k to help choose the number of clusters.

    Parameters
    ----------
    min_k, max_k : int
        Inclusive range of cluster counts to try.
    k_max_iter : int
        Passed to KMeans as ``max_iter`` for every fit.
    data : DataFrame or array-like, optional
        Data to cluster. When omitted, the feature columns of the notebook's
        global ``scaled_data`` frame are used.
    random_state : int, optional
        Passed to KMeans; set it to make the curve reproducible.
    """
    if data is None:
        # Select only the feature columns: scaled_data later gains 'Cluster'
        # and 'Player Name' columns, which must not be fed to the fit.
        data = scaled_data[features]

    k_range = range(min_k, max_k + 1)
    sum_squared_distances = [
        KMeans(n_clusters=k, max_iter=k_max_iter, n_init=10,
               random_state=random_state).fit(data).inertia_
        for k in k_range
    ]

    # Plot the score for each value of k
    plt.plot(k_range, sum_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

In [79]:
# Uncomment to draw the elbow curve for k = 2..12 (at most 15 iterations per fit):
# elbow_plot(2,12,15)

In [80]:
# Cluster the players with k-means, using the scaled feature columns only.
# Selecting scaled_data[features] keeps this cell re-runnable: on a second run
# the frame already holds the previous 'Cluster' labels (and, later, the string
# 'Player Name' column), neither of which may be fed back into the fit.
# random_state pins the centroid initialisation so the labels are reproducible
# across kernel restarts (label numbering may differ from earlier unseeded runs).
km = KMeans(n_clusters=3, n_init=50, random_state=42)
scaled_data['Cluster'] = km.fit_predict(scaled_data[features])  # cluster number for each row
scaled_data



Unnamed: 0,Batting Average,Strike Rate,Runs,Not Outs,Cluster
0,0.094416,1.216001,3.428727,-0.588817,0
1,-0.036744,0.053707,2.750151,-0.588817,0
2,0.033543,0.115661,2.263448,-0.053529,0
3,0.485385,0.449958,1.397679,0.481759,0
4,-0.108913,0.081457,1.205806,1.552334,0
5,0.218673,-0.057295,1.187087,2.087622,0
6,-0.415789,0.054352,0.990534,-0.053529,0
7,-0.650496,0.255705,0.873538,0.481759,0
8,0.463421,0.258286,0.751862,2.087622,0
9,-0.557931,-0.015347,0.737823,-0.053529,0


In [81]:
# Attach the player names as the first column of the clustered frame.
# The names are inserted positionally (as a plain array) rather than as a
# Series: a Series would be aligned on index labels, and scaled_data's index
# need not match batsman_odi's after dropna() has removed rows. Both frames
# hold the same rows in the same order, so position is the correct key.
scaled_data.insert(0, 'Player Name', batsman_odi['Player Name'].to_numpy())
scaled_data.head(3)

Unnamed: 0,Player Name,Batting Average,Strike Rate,Runs,Not Outs,Cluster
0,ST Jayasuriya,0.094416,1.216001,3.428727,-0.588817,0
1,KC Sangakkara,-0.036744,0.053707,2.750151,-0.588817,0
2,SR Tendulkar,0.033543,0.115661,2.263448,-0.053529,0


In [82]:
# Build one sub-frame per cluster label (0, 1 and 2) by boolean-mask filtering.
b0, b1, b2 = (
    scaled_data[scaled_data['Cluster'] == label]
    for label in (0, 1, 2)
)

In [83]:
len(b0.index)  # number of players assigned to cluster 0

27

In [84]:
len(b1.index)  # number of players assigned to cluster 1

22

In [85]:
len(b2.index)  # number of players assigned to cluster 2

1