In [27]:
# importing necessary libraries.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot  as plt
import seaborn as sns

In [28]:
# Read the T20I batting statistics CSV (relative path) into a DataFrame.
batsman_t20 = pd.read_csv(filepath_or_buffer='batsman data t20i.csv')
# Preview the first five rows.
batsman_t20.head(n=5)

Unnamed: 0,Player Name,Country,Time Period,Matches,Played,Not Outs,Runs,Highest Score,Batting Average,Balls Faced,Strike Rate,Centuries,Fifties,Ducks,Fours,Sixes
0,V Kohli,India,2016-2022,10,9,4,429,122,85.8,325,132.0,1,3,1,40,11
1,Mohammad Rizwan,Pakistan,2022-2022,6,6,1,281,78,56.2,239,117.57,0,3,0,21,6
2,RG Sharma,India,2016-2022,9,9,0,271,83,30.11,192,141.14,0,2,1,27,12
3,Babar Hayat,Hong Kong,2016-2022,5,5,0,235,122,47.0,160,146.87,1,1,1,22,10
4,Ibrahim Zadran,Afghanistan,2022-2022,5,5,2,196,64,65.33,188,104.25,0,1,0,14,4


In [29]:
# Columns kept for the analysis: the player label plus four batting statistics.
features = ['Player Name', 'Batting Average', 'Strike Rate', 'Runs', 'Not Outs']
# Keep only those columns and discard every row that has a missing value in any of them.
batsman_t20 = batsman_t20[features].dropna(subset=features).copy()
batsman_t20

Unnamed: 0,Player Name,Batting Average,Strike Rate,Runs,Not Outs
0,V Kohli,85.80,132.0,429,4
1,Mohammad Rizwan,56.20,117.57,281,1
2,RG Sharma,30.11,141.14,271,0
3,Babar Hayat,47.00,146.87,235,0
4,Ibrahim Zadran,65.33,104.25,196,2
5,PBB Rajapaksa,47.75,149.21,191,2
6,Sabbir Rahman,36.20,122.29,181,1
7,Muhammad Usman,29.33,118.12,176,1
8,Najibullah Zadran,35.20,157.14,176,3
9,P Nissanka,34.60,115.33,173,1


In [30]:
# Remove the record with row label 38, which the author flagged as incorrect data.
# NOTE(review): the reason this row is wrong is not recorded here — confirm and document it.
# errors='ignore' keeps the cell re-runnable: once label 38 is gone, running the cell
# again is a no-op instead of raising KeyError.
# The index is deliberately NOT reset, so a re-run can never drop a different player
# that would otherwise have moved into position 38.
batsman_t20 = batsman_t20.drop(index=38, errors='ignore')
batsman_t20

Unnamed: 0,Player Name,Batting Average,Strike Rate,Runs,Not Outs
0,V Kohli,85.8,132.0,429,4
1,Mohammad Rizwan,56.2,117.57,281,1
2,RG Sharma,30.11,141.14,271,0
3,Babar Hayat,47.0,146.87,235,0
4,Ibrahim Zadran,65.33,104.25,196,2
5,PBB Rajapaksa,47.75,149.21,191,2
6,Sabbir Rahman,36.2,122.29,181,1
7,Muhammad Usman,29.33,118.12,176,1
8,Najibullah Zadran,35.2,157.14,176,3
9,P Nissanka,34.6,115.33,173,1


In [31]:
# Standardise the numeric features so they are on a comparable scale before
# distance-based clustering.
scaler = StandardScaler()
features = ['Batting Average','Strike Rate','Runs', 'Not Outs']
scaled_data = pd.DataFrame(
    scaler.fit_transform(batsman_t20[features]),
    columns=features,
    # Preserve batsman_t20's row labels. fit_transform returns a bare array, and
    # without an explicit index the new frame would be labelled 0..n-1, which no
    # longer matches batsman_t20 once rows have been dropped from it. Any later
    # label-aligned operation between the two frames would then pair the wrong rows.
    index=batsman_t20.index,
)
scaled_data

Unnamed: 0,Batting Average,Strike Rate,Runs,Not Outs
0,3.387543,0.454926,4.352184,2.96874
1,1.554856,-0.227123,2.210757,0.099622
2,-0.060509,0.886938,2.066066,-0.856751
3,0.985237,1.157773,1.545179,-0.856751
4,2.120141,-0.856707,0.980884,1.055995
5,1.031673,1.268375,0.908539,1.055995
6,0.316554,-0.004027,0.763848,0.099622
7,-0.108803,-0.201127,0.691502,0.099622
8,0.254639,1.643195,0.691502,2.012367
9,0.21749,-0.332999,0.648095,0.099622


In [32]:
def elbow_plot(min_k, max_k, k_max_iter, data=None, random_state=None):
    """Plot K-Means inertia against k to help choose the number of clusters.

    For every k in ``min_k..max_k`` (inclusive) a KMeans model is fitted and its
    ``inertia_`` (sum of squared distances to the closest centroid) is recorded;
    the values are then drawn as a line plot (the "elbow" curve).

    Parameters
    ----------
    min_k, max_k : int
        Inclusive range of cluster counts to evaluate.
    k_max_iter : int
        ``max_iter`` passed to each KMeans fit.
    data : array-like or DataFrame, optional
        Feature matrix to cluster. When omitted, the notebook-level
        ``scaled_data`` frame is used, minus the 'Cluster' and 'Player Name'
        columns if they have already been added, so the curve is the same
        regardless of which later cells have been run.
    random_state : int, optional
        Seed forwarded to KMeans for reproducible curves. ``None`` keeps
        KMeans' default (unseeded) behaviour.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    if data is None:
        # Fall back to the notebook-level frame, excluding non-feature columns
        # that later cells attach to it.
        data = scaled_data.drop(columns=['Cluster', 'Player Name'], errors='ignore')

    k_range = range(min_k, max_k + 1)
    sum_squared_distances = []
    for k in k_range:
        km = KMeans(n_clusters=k, max_iter=k_max_iter, n_init=50,
                    random_state=random_state)
        km.fit(data)
        sum_squared_distances.append(km.inertia_)

    # Plot the score for each value of k
    fig, ax = plt.subplots()
    ax.plot(k_range, sum_squared_distances, 'bx-')
    ax.set_xlabel('k')
    ax.set_ylabel('Sum of squared distances')
    ax.set_title('Elbow Method For Optimal k')
    plt.show()

In [33]:
# Disabled call: uncomment to draw the elbow curve for k = 2..12 with max_iter=50 per fit.
# elbow_plot(2,12,50)

In [34]:
# Cluster the players into 3 groups with K-Means.
# random_state pins the centroid initialisation so the cluster labels are the same
# on every run (without it, which group is called 0/1/2 can change between runs).
km = KMeans(n_clusters=3, n_init=50, random_state=42)
# Fit on the scaled feature columns only (`features` is the numeric column list
# defined in the scaling cell). Fitting on the whole frame would, on a re-run,
# feed the previously assigned 'Cluster' labels — and later the 'Player Name'
# strings — back into the model.
scaled_data['Cluster'] = km.fit_predict(scaled_data[features])  # cluster label for each row
scaled_data



Unnamed: 0,Batting Average,Strike Rate,Runs,Not Outs,Cluster
0,3.387543,0.454926,4.352184,2.96874,0
1,1.554856,-0.227123,2.210757,0.099622,0
2,-0.060509,0.886938,2.066066,-0.856751,2
3,0.985237,1.157773,1.545179,-0.856751,2
4,2.120141,-0.856707,0.980884,1.055995,0
5,1.031673,1.268375,0.908539,1.055995,0
6,0.316554,-0.004027,0.763848,0.099622,1
7,-0.108803,-0.201127,0.691502,0.099622,1
8,0.254639,1.643195,0.691502,2.012367,0
9,0.21749,-0.332999,0.648095,0.099622,1


In [35]:
# Attach the player names to the clustered rows.
# scaled_data was built row-for-row from batsman_t20, so the names are assigned by
# position (.to_numpy()). Passing the Series itself would align on index labels, and
# the two frames' labels can differ once rows have been dropped from batsman_t20,
# which yields NaN or another player's name on the affected rows.
# The guard keeps the cell re-runnable: insert() raises if the column already exists.
if 'Player Name' not in scaled_data.columns:
    scaled_data.insert(0, 'Player Name', batsman_t20['Player Name'].to_numpy())
scaled_data

Unnamed: 0,Player Name,Batting Average,Strike Rate,Runs,Not Outs,Cluster
0,V Kohli,3.387543,0.454926,4.352184,2.96874,0
1,Mohammad Rizwan,1.554856,-0.227123,2.210757,0.099622,0
2,RG Sharma,-0.060509,0.886938,2.066066,-0.856751,2
3,Babar Hayat,0.985237,1.157773,1.545179,-0.856751,2
4,Ibrahim Zadran,2.120141,-0.856707,0.980884,1.055995,0
5,PBB Rajapaksa,1.031673,1.268375,0.908539,1.055995,0
6,Sabbir Rahman,0.316554,-0.004027,0.763848,0.099622,1
7,Muhammad Usman,-0.108803,-0.201127,0.691502,0.099622,1
8,Najibullah Zadran,0.254639,1.643195,0.691502,2.012367,0
9,P Nissanka,0.21749,-0.332999,0.648095,0.099622,1


In [36]:
# Split the clustered frame into one sub-frame per cluster label (0, 1, 2).
b0, b1, b2 = (
    scaled_data.loc[scaled_data['Cluster'] == cluster_label]
    for cluster_label in range(3)
)

In [37]:
# Number of players assigned to cluster 0.
b0.shape[0]

8

In [38]:
# Number of players assigned to cluster 1.
b1.shape[0]

29

In [39]:
# Number of players assigned to cluster 2.
b2.shape[0]

11