In [18]:
# importing necessary libraries.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot  as plt

In [19]:
# Load the ODI wicketkeeper statistics from the CSV file in the working directory.
keeper_odi = pd.read_csv(filepath_or_buffer='wicketkeeper data odi.csv')
keeper_odi

Unnamed: 0,Player Name,Country,Time Period,Matches,Played,Dismissals,Catches,Stumpings,Maximum Dismissals
0,MS Dhoni,India,2008-2018,19,19,36,25,11,5
1,KC Sangakkara,Sri Lanka,2004-2014,24,24,36,27,9,4
2,Moin Khan,Pakistan,1995-2004,14,13,17,12,5,3
3,Mushfiqur Rahim,Bangladesh,2008-2018,21,17,17,14,3,4
4,DSBP Kuruppu,Sri Lanka,1984-1988,9,9,14,12,2,4
5,RS Kaluwitharana,Sri Lanka,1995-2000,8,8,11,9,2,5
6,Khaled Mashud,Bangladesh,1995-2004,14,14,11,8,3,3
7,Sarfaraz Ahmed,Pakistan,2008-2018,13,13,10,8,2,3
8,Zulqarnain,Pakistan,1986-1986,3,3,9,8,1,4
9,R Dravid,India,1997-2004,13,5,7,5,2,2


In [20]:
# Keep the player name plus the three wicketkeeping statistics, and drop any
# row that has a missing (NaN) value in one of those columns.
features = ['Player Name', 'Catches', 'Stumpings', 'Maximum Dismissals']
keeper_odi = keeper_odi.dropna(subset=features).loc[:, features].copy()
keeper_odi

Unnamed: 0,Player Name,Catches,Stumpings,Maximum Dismissals
0,MS Dhoni,25,11,5
1,KC Sangakkara,27,9,4
2,Moin Khan,12,5,3
3,Mushfiqur Rahim,14,3,4
4,DSBP Kuruppu,12,2,4
5,RS Kaluwitharana,9,2,5
6,Khaled Mashud,8,3,3
7,Sarfaraz Ahmed,8,2,3
8,Zulqarnain,8,1,4
9,R Dravid,5,2,2


In [21]:
# Standardise the numeric columns with StandardScaler before clustering.
features = ['Catches', 'Stumpings', 'Maximum Dismissals']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(keeper_odi[features])
scaled_data = pd.DataFrame(scaled_values, columns=features)
scaled_data

Unnamed: 0,Catches,Stumpings,Maximum Dismissals
0,3.382973,4.197471,2.350599
1,3.718752,3.316112,1.518528
2,1.20041,1.553395,0.686458
3,1.536189,0.672036,1.518528
4,1.20041,0.231357,1.518528
5,0.696741,0.231357,2.350599
6,0.528852,0.672036,0.686458
7,0.528852,0.231357,0.686458
8,0.528852,-0.209323,1.518528
9,0.025183,0.231357,-0.145612


In [22]:
def elbow_plot(min_k, max_k, k_max_iter, data=None, random_state=42):
    """Draw an elbow plot of K-Means inertia to help choose the number of clusters.

    Fits one KMeans model for every k in ``min_k..max_k`` (inclusive) and
    plots each model's ``inertia_`` against k.

    Parameters
    ----------
    min_k, max_k : int
        Inclusive range of cluster counts to try. ``max_k`` is capped at the
        number of rows in ``data`` because KMeans cannot fit more clusters
        than there are samples.
    k_max_iter : int
        Forwarded to ``KMeans(max_iter=...)``.
    data : DataFrame or array-like, optional
        Numeric data to cluster. When omitted, the ``features`` columns of the
        notebook-level ``scaled_data`` are used, so that the 'Cluster' and
        'Player Name' columns added by later cells are never fed to KMeans.
    random_state : int or None, default 42
        Seed for the centroid initialisation, so the curve is reproducible
        across runs. Pass ``None`` for non-deterministic initialisation.

    Raises
    ------
    ValueError
        If ``min_k`` is below 1 or the (capped) range of k values is empty.
    """
    if data is None:
        # Select only the feature columns: scaled_data is mutated by later cells.
        data = scaled_data[features]

    # Requesting more clusters than samples makes KMeans raise, so cap the range.
    max_k = min(max_k, len(data))
    if min_k < 1 or min_k > max_k:
        raise ValueError(
            f"No valid k to evaluate: min_k={min_k}, max_k={max_k} "
            f"(after capping at {len(data)} samples)"
        )

    k_range = range(min_k, max_k + 1)
    sum_squared_distances = []
    for k in k_range:
        km = KMeans(
            n_clusters=k,
            max_iter=k_max_iter,
            n_init=10,
            random_state=random_state,
        )
        km.fit(data)
        sum_squared_distances.append(km.inertia_)

    # Plot the score for each value of k
    plt.plot(k_range, sum_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

In [23]:
# Uncomment to draw the elbow plot for k = 2..12 (max_iter = 15 per fit):
# elbow_plot(2,12,15)

In [24]:
# Cluster the players with k-means.
# Fit on the feature columns only (`features` is the numeric column list from
# the scaling cell): fitting on the whole frame would feed the 'Cluster' column
# (and, later, the 'Player Name' strings) back into KMeans when this cell is re-run.
# A fixed random_state keeps the cluster numbering stable between runs, which
# the later per-cluster splits (Cluster == 0 / 1 / 2) rely on.
km = KMeans(n_clusters=3, n_init=50, random_state=42)
scaled_data['Cluster'] = km.fit_predict(scaled_data[features])  # cluster number for each row
scaled_data



Unnamed: 0,Catches,Stumpings,Maximum Dismissals,Cluster
0,3.382973,4.197471,2.350599,1
1,3.718752,3.316112,1.518528,1
2,1.20041,1.553395,0.686458,2
3,1.536189,0.672036,1.518528,2
4,1.20041,0.231357,1.518528,2
5,0.696741,0.231357,2.350599,2
6,0.528852,0.672036,0.686458,2
7,0.528852,0.231357,0.686458,2
8,0.528852,-0.209323,1.518528,2
9,0.025183,0.231357,-0.145612,0


In [25]:
# Add the player names as the first column.
# scaled_data has a fresh 0..n-1 index, whereas keeper_odi keeps its original
# row labels after dropna(); inserting the Series directly would align on index
# and misplace names (or produce NaN) whenever a row was dropped. Passing the
# raw values inserts them by position instead.
# The guard makes the cell safe to re-run (insert raises if the column exists).
if 'Player Name' not in scaled_data.columns:
    scaled_data.insert(0, 'Player Name', keeper_odi['Player Name'].to_numpy())
scaled_data

Unnamed: 0,Player Name,Catches,Stumpings,Maximum Dismissals,Cluster
0,MS Dhoni,3.382973,4.197471,2.350599,1
1,KC Sangakkara,3.718752,3.316112,1.518528,1
2,Moin Khan,1.20041,1.553395,0.686458,2
3,Mushfiqur Rahim,1.536189,0.672036,1.518528,2
4,DSBP Kuruppu,1.20041,0.231357,1.518528,2
5,RS Kaluwitharana,0.696741,0.231357,2.350599,2
6,Khaled Mashud,0.528852,0.672036,0.686458,2
7,Sarfaraz Ahmed,0.528852,0.231357,0.686458,2
8,Zulqarnain,0.528852,-0.209323,1.518528,2
9,R Dravid,0.025183,0.231357,-0.145612,0


In [26]:
# Split the dataframe into one sub-frame per cluster label (0, 1 and 2).
b0, b1, b2 = (
    scaled_data.loc[scaled_data['Cluster'] == label]
    for label in (0, 1, 2)
)

In [27]:
# Number of rows assigned to cluster 0.
b0.shape[0]

28

In [28]:
# Number of rows assigned to cluster 1.
b1.shape[0]

2

In [29]:
# Number of rows assigned to cluster 2.
b2.shape[0]

10