In [12]:
# importing necessary libraries.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot  as plt

In [13]:
# Load the T20I wicketkeeper statistics; the path is relative to the notebook's working directory.
keeper_t20 = pd.read_csv(filepath_or_buffer='wicketkeeper data t20i.csv')
# Bare last expression so the notebook renders the loaded frame.
keeper_t20

Unnamed: 0,Player Name,Country,Time Period,Matches,Played,Dismissals,Catches,Stumpings,Maximum Dismissals
0,MS Dhoni,India,2016-2016,5,5,7,6,1,3
1,SP Patil,UAE,2016-2016,7,7,7,6,1,2
2,LD Chandimal,Sri Lanka,2016-2016,4,3,4,4,0,2
3,Sultan Ahmed,Oman,2016-2016,3,3,4,2,2,3
4,KD Karthik,India,2022-2022,3,1,3,3,0,3
5,Nurul Hasan,Bangladesh,2016-2016,2,2,3,0,3,2
6,Mohammad Shahzad,Afghanistan,2016-2016,3,3,3,1,2,2
7,Rahmanullah Gurbaz,Afghanistan,2022-2022,5,5,3,3,0,2
8,BKG Mendis,Sri Lanka,2022-2022,6,6,3,3,0,2
9,Mohammad Rizwan,Pakistan,2022-2022,6,6,3,3,0,1


In [14]:
# Columns kept for the analysis: the player label plus the three statistics used later.
features = ['Player Name', 'Catches', 'Stumpings', 'Maximum Dismissals']
# Drop rows that have a missing value in any kept column, keep only those columns,
# and renumber the rows 0..n-1. Without the renumbering, dropped rows leave gaps in
# the index, so positional and label-based lookups on this frame stop agreeing.
keeper_t20 = (
    keeper_t20
    .dropna(subset=features)[features]
    .reset_index(drop=True)
)
keeper_t20

Unnamed: 0,Player Name,Catches,Stumpings,Maximum Dismissals
0,MS Dhoni,6,1,3
1,SP Patil,6,1,2
2,LD Chandimal,4,0,2
3,Sultan Ahmed,2,2,3
4,KD Karthik,3,0,3
5,Nurul Hasan,0,3,2
6,Mohammad Shahzad,1,2,2
7,Rahmanullah Gurbaz,3,0,2
8,BKG Mendis,3,0,2
9,Mohammad Rizwan,3,0,1


In [15]:
# Standardise the numeric statistics with StandardScaler before clustering.
scaler = StandardScaler()
features = ['Catches', 'Stumpings', 'Maximum Dismissals']
scaled_data = pd.DataFrame(
    scaler.fit_transform(keeper_t20[features]),
    columns=features,
    # Carry over the row labels: fit_transform returns a bare array, and a default
    # 0..n-1 index would not line up with keeper_t20 if rows were dropped earlier.
    index=keeper_t20.index,
)
scaled_data

Unnamed: 0,Catches,Stumpings,Maximum Dismissals
0,1.994913,0.40452,1.38587
1,1.994913,0.40452,0.377964
2,0.948401,-0.6742,0.377964
3,-0.09811,1.48324,1.38587
4,0.425145,-0.6742,1.38587
5,-1.144622,2.561959,0.377964
6,-0.621366,1.48324,0.377964
7,0.425145,-0.6742,0.377964
8,0.425145,-0.6742,0.377964
9,0.425145,-0.6742,-0.629941


In [16]:
def elbow_plot(min_k, max_k, k_max_iter, data=None, random_state=0):
    """Plot K-Means inertia against k to help choose the number of clusters.

    Parameters
    ----------
    min_k, max_k : int
        Inclusive range of cluster counts to try. The upper end is capped at
        the number of rows in ``data``, because KMeans cannot fit more
        clusters than there are samples.
    k_max_iter : int
        Forwarded to ``KMeans(max_iter=...)``.
    data : DataFrame or array-like, optional
        Numeric matrix to cluster. When omitted, the module-level
        ``scaled_data`` is used with its 'Cluster' and 'Player Name' columns
        removed (if present), so labels added by later cells are never fed
        back into the fit.
    random_state : int or None, optional
        Seed forwarded to KMeans so the curve is reproducible between runs.

    Raises
    ------
    ValueError
        If no k in ``[min_k, max_k]`` is usable for the given number of rows.
    """
    if data is None:
        data = scaled_data.drop(columns=['Cluster', 'Player Name'], errors='ignore')

    largest_k = min(max_k, len(data))
    if largest_k < min_k:
        raise ValueError(
            f"no valid k: min_k={min_k}, max_k={max_k}, samples={len(data)}"
        )
    k_range = range(min_k, largest_k + 1)

    sum_squared_distances = []
    for k in k_range:
        km = KMeans(n_clusters=k, max_iter=k_max_iter, n_init=10,
                    random_state=random_state)
        km.fit(data)
        sum_squared_distances.append(km.inertia_)

    # Plot the score for each value of k
    plt.plot(k_range, sum_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

In [17]:
# elbow_plot(2,12,15)

In [18]:
# Cluster the players with k-means.
# Fit only on the feature columns: dropping 'Cluster' / 'Player Name' (when present)
# keeps this cell re-runnable, otherwise a second run would feed the previous labels
# (and the name strings) into the model.
# random_state pins the initialisation so cluster assignments are reproducible.
km = KMeans(n_clusters=2, n_init=50, random_state=0)
cluster_inputs = scaled_data.drop(columns=['Cluster', 'Player Name'], errors='ignore')
scaled_data['Cluster'] = km.fit_predict(cluster_inputs)  # one cluster id per row
scaled_data



Unnamed: 0,Catches,Stumpings,Maximum Dismissals,Cluster
0,1.994913,0.40452,1.38587,1
1,1.994913,0.40452,0.377964,1
2,0.948401,-0.6742,0.377964,1
3,-0.09811,1.48324,1.38587,1
4,0.425145,-0.6742,1.38587,1
5,-1.144622,2.561959,0.377964,1
6,-0.621366,1.48324,0.377964,1
7,0.425145,-0.6742,0.377964,1
8,0.425145,-0.6742,0.377964,1
9,0.425145,-0.6742,-0.629941,0


In [19]:
# Attach the player names as the first column.
# - The guard makes the cell re-runnable: DataFrame.insert raises ValueError when the
#   column already exists.
# - to_numpy() copies the names by position; scaled_data has one row per keeper_t20 row
#   in the same order, whereas a Series would be matched by index label instead.
if 'Player Name' not in scaled_data.columns:
    scaled_data.insert(0, 'Player Name', keeper_t20['Player Name'].to_numpy())
scaled_data

Unnamed: 0,Player Name,Catches,Stumpings,Maximum Dismissals,Cluster
0,MS Dhoni,1.994913,0.40452,1.38587,1
1,SP Patil,1.994913,0.40452,0.377964,1
2,LD Chandimal,0.948401,-0.6742,0.377964,1
3,Sultan Ahmed,-0.09811,1.48324,1.38587,1
4,KD Karthik,0.425145,-0.6742,1.38587,1
5,Nurul Hasan,-1.144622,2.561959,0.377964,1
6,Mohammad Shahzad,-0.621366,1.48324,0.377964,1
7,Rahmanullah Gurbaz,0.425145,-0.6742,0.377964,1
8,BKG Mendis,0.425145,-0.6742,0.377964,1
9,Mohammad Rizwan,0.425145,-0.6742,-0.629941,0


In [20]:
# Split the frame into one sub-frame per cluster id.
b0 = scaled_data[scaled_data['Cluster'].eq(0)]
b1 = scaled_data[scaled_data['Cluster'].eq(1)]

In [21]:
b0.shape[0]  # number of players assigned to cluster 0

6

In [22]:
b1.shape[0]  # number of players assigned to cluster 1

10