In [20]:
# importing necessary libraries.
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot  as plt

In [21]:
# Read the ODI bowling statistics from disk and preview the first five rows.
bowler_odi = pd.read_csv(filepath_or_buffer='bowler data odi.csv')
bowler_odi.head(n=5)

Unnamed: 0,Player Name,Country,Time Period,Matches,Played,Overs,Maiden Overs,Runs,Wickets,Best Figure,Bowling Average,Economy Rate,Strike Rate,Four Wickets,Five Wickets
0,M Muralidaran,Sri Lanka,1995-2010,24,24,230.2,13,865,30,5/31,28.83,3.75,46.0,1,1
1,SL Malinga,Sri Lanka,2004-2018,14,14,128.1,6,596,29,5/34,20.55,4.65,26.5,1,3
2,BAW Mendis,Sri Lanka,2008-2014,8,8,68.0,5,271,26,6/13,10.42,3.98,15.6,2,2
3,Saeed Ajmal,Pakistan,2008-2014,12,12,115.0,6,485,25,3/26,19.4,4.21,27.6,0,0
4,WPUJC Vaas,Sri Lanka,1995-2008,19,19,152.2,20,639,23,3/30,27.78,4.19,39.7,0,0


In [22]:
# Columns kept for the analysis: the player label plus three bowling metrics.
features = ['Player Name', 'Wickets', 'Bowling Average', 'Economy Rate']
# Discard rows missing any of those columns, then narrow the frame to them.
bowler_odi = bowler_odi.dropna(subset=features).loc[:, features].copy()
bowler_odi

Unnamed: 0,Player Name,Wickets,Bowling Average,Economy Rate
0,M Muralidaran,30,28.83,3.75
1,SL Malinga,29,20.55,4.65
2,BAW Mendis,26,10.42,3.98
3,Saeed Ajmal,25,19.4,4.21
4,WPUJC Vaas,23,27.78,4.19
5,IK Pathan,22,27.5,5.54
6,ST Jayasuriya,22,30.31,4.48
7,Abdur Razzak,22,36.18,4.67
8,RA Jadeja,19,26.57,4.34
9,Shakib Al Hasan,19,32.73,4.87


In [23]:
# Standardise the numeric features with StandardScaler.
scaler = StandardScaler()
features = ['Wickets','Bowling Average','Economy Rate']
scaled_data = pd.DataFrame(
    scaler.fit_transform(bowler_odi[features]),
    columns=features,
    # Reuse bowler_odi's row labels. fit_transform returns a bare array, so
    # without this the frame would get a fresh 0..n-1 index; whenever dropna
    # removed rows, index-aligned operations against bowler_odi (such as
    # adding the 'Player Name' column) would then pair rows incorrectly.
    index=bowler_odi.index,
)
scaled_data

Unnamed: 0,Wickets,Bowling Average,Economy Rate
0,2.710372,0.00277,-1.139769
1,2.53412,-0.734545,0.056103
2,2.005362,-1.636598,-0.834157
3,1.82911,-0.83695,-0.528546
4,1.476604,-0.09073,-0.555121
5,1.300352,-0.115663,1.238687
6,1.300352,0.134561,-0.169784
7,1.300352,0.657271,0.082678
8,0.771594,-0.198478,-0.355809
9,0.771594,0.350057,0.348427


In [24]:
def elbow_plot(min_k, max_k, k_max_iter, data=None, random_state=None):
    """Plot K-Means inertia against k to help choose the number of clusters.

    Parameters
    ----------
    min_k, max_k : int
        Inclusive range of cluster counts to evaluate.
    k_max_iter : int
        Forwarded to ``KMeans`` as ``max_iter``.
    data : DataFrame, optional
        Feature matrix to cluster. When omitted, the notebook-level
        ``scaled_data`` frame is used with the 'Cluster' and 'Player Name'
        columns removed (if present), so the plot can be produced before or
        after the clustering cells have run.
    random_state : int, optional
        Forwarded to ``KMeans``; pass a fixed value for a reproducible curve.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    if data is None:
        # Labels and names are not features: fitting on them would either
        # fail (strings) or bias the inertia (previous cluster ids).
        data = scaled_data.drop(columns=['Cluster', 'Player Name'], errors='ignore')

    k_range = range(min_k, max_k + 1)
    sum_squared_distances = [
        KMeans(
            n_clusters=k,
            max_iter=k_max_iter,
            n_init=10,
            random_state=random_state,
        ).fit(data).inertia_
        for k in k_range
    ]

    # Plot the score for each value of k
    plt.plot(k_range, sum_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

In [25]:
# Disabled by default; uncomment to draw the elbow curve for k = 2..12 (max_iter = 15 per fit).
# elbow_plot(2,12,15)

In [26]:
# Cluster the players with k-means (k = 3).
# random_state is fixed so the partition and the cluster numbering are
# reproducible across kernel restarts.
km = KMeans(n_clusters=3, n_init=50, random_state=42)
# Fit on the feature columns only. Dropping 'Cluster' / 'Player Name' (when
# present) keeps this cell re-runnable: otherwise a second run would feed the
# previous labels, or the string names, into the model.
cluster_input = scaled_data.drop(columns=['Cluster', 'Player Name'], errors='ignore')
scaled_data['Cluster'] = km.fit_predict(cluster_input)  # cluster id for each row
scaled_data



Unnamed: 0,Wickets,Bowling Average,Economy Rate,Cluster
0,2.710372,0.00277,-1.139769,2
1,2.53412,-0.734545,0.056103,2
2,2.005362,-1.636598,-0.834157,2
3,1.82911,-0.83695,-0.528546,2
4,1.476604,-0.09073,-0.555121,2
5,1.300352,-0.115663,1.238687,2
6,1.300352,0.134561,-0.169784,2
7,1.300352,0.657271,0.082678,2
8,0.771594,-0.198478,-0.355809,2
9,0.771594,0.350057,0.348427,2


In [27]:
# Add the player names as the first column of the clustered frame.
# The values are inserted positionally (to_numpy) rather than as a Series:
# a Series would be aligned on index labels, which yields NaN / mismatched
# names whenever scaled_data and bowler_odi do not share the same index.
# The guard keeps the cell re-runnable; insert() raises if the column exists.
if 'Player Name' not in scaled_data.columns:
    scaled_data.insert(0, 'Player Name', bowler_odi['Player Name'].to_numpy())
scaled_data.head()

Unnamed: 0,Player Name,Wickets,Bowling Average,Economy Rate,Cluster
0,M Muralidaran,2.710372,0.00277,-1.139769,2
1,SL Malinga,2.53412,-0.734545,0.056103,2
2,BAW Mendis,2.005362,-1.636598,-0.834157,2
3,Saeed Ajmal,1.82911,-0.83695,-0.528546,2
4,WPUJC Vaas,1.476604,-0.09073,-0.555121,2


In [28]:
# Split the labelled frame into one sub-frame per cluster id (0, 1, 2).
b0, b1, b2 = (
    scaled_data.loc[scaled_data['Cluster'].eq(cluster_id)]
    for cluster_id in range(3)
)

In [29]:
# Number of players assigned to cluster 0.
b0.shape[0]

21

In [30]:
# Number of players assigned to cluster 1.
b1.shape[0]

14

In [31]:
# Number of players assigned to cluster 2.
b2.shape[0]

10