In [0]:
#Importing all the required libraries.
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.appName('cricket').getOrCreate()	

In [0]:
#Loading the batting dataset which has all the data for all the batsmen who have played IPL from 2008 to 2021.
bat_fp = spark.read.csv('/FileStore/tables/batting.csv', header=True, inferSchema=True)
bat_cols = ['Ave','SR']
bat_fp.printSchema()

root
 |-- Player: string (nullable = true)
 |-- Span: string (nullable = true)
 |-- Mat: integer (nullable = true)
 |-- Inns: integer (nullable = true)
 |-- NO: integer (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- HS: string (nullable = true)
 |-- Ave: double (nullable = true)
 |-- BF: integer (nullable = true)
 |-- SR: double (nullable = true)
 |-- 100: integer (nullable = true)
 |-- 50: integer (nullable = true)
 |-- 0: integer (nullable = true)
 |-- 4s: integer (nullable = true)
 |-- 6s: integer (nullable = true)



In [0]:
#Loading the bowling dataset which has all the data for all the bowlers who have played IPL from 2008 to 2021.
bowl_fp = spark.read.csv('/FileStore/tables/bowling.csv', header=True, inferSchema=True)
bowl_cols = ['Econ', 'SR']
bowl_fp.printSchema()

root
 |-- Player: string (nullable = true)
 |-- Span: string (nullable = true)
 |-- Mat: integer (nullable = true)
 |-- Inns: integer (nullable = true)
 |-- Overs: double (nullable = true)
 |-- Mdns: integer (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- Wkts: integer (nullable = true)
 |-- BBI: string (nullable = true)
 |-- Ave: double (nullable = true)
 |-- Econ: double (nullable = true)
 |-- SR: double (nullable = true)
 |-- 4: integer (nullable = true)
 |-- 5: integer (nullable = true)
 |-- Ct: integer (nullable = true)
 |-- St: integer (nullable = true)



In [0]:
#Function for clustering players into one of the 10 clusters, using K-means clustering algorithm. 
def clustering(fp, cols):
	assembler = VectorAssembler(inputCols=cols, outputCol='features')
	assembled_data = assembler.transform(fp)
	scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
	scaler_model = scaler.fit(assembled_data)
	scaled_data = scaler_model.transform(assembled_data)

	k_means = KMeans(featuresCol='scaledFeatures', k=10)
	model = k_means.fit(scaled_data)
	model_data = model.transform(scaled_data)
	return model_data

In [0]:
#Clustering batsmen into one of the 10 clusters.
bat_det = clustering(bat_fp, bat_cols)
print("\n~~~~~~~~~~~~~~~~~~~ BATSMEN CLUSTER Details ~~~~~~~~~~~~~~~~~~~\n")
bat_det.show(10)


~~~~~~~~~~~~~~~~~~~ BATSMEN CLUSTER Details ~~~~~~~~~~~~~~~~~~~

+---------------+---------+---+----+---+----+---+-------+----+------------------+---+---+---+---+---+--------------------+--------------------+----------+
|         Player|     Span|Mat|Inns| NO|Runs| HS|    Ave|  BF|                SR|100| 50|  0| 4s| 6s|            features|      scaledFeatures|prediction|
+---------------+---------+---+----+---+----+---+-------+----+------------------+---+---+---+---+---+--------------------+--------------------+----------+
|     P Amarnath|200802008|  6|   0|  0|   0|  0|    0.0|   0|               0.0|  0|  0|  0|  0|  0|           (2,[],[])|           (2,[],[])|         1|
|         MM Ali|202102021| 34|  32|  3| 666| 58|  23.05| 455|            147.88|  0|  4|  3| 52| 42|      [23.05,147.88]|[2.05369077724097...|         9|
|     S Anirudha|200802013| 20|  13|  5| 136| 64|   11.0| 113|             91.57|  0|  1|  2|  9|  7|        [11.0,91.57]|[0.98006935139482...|         8|
|KB 

In [0]:
#Counting number of batsmen in each cluster.
bat_det.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   77|
|         6|   74|
|         3|   73|
|         5|   91|
|         9|   41|
|         4|   50|
|         8|   74|
|         7|   14|
|         2|   39|
|         0|   80|
+----------+-----+



In [0]:
#Clustering bowlers into one of the 10 clusters.
bowl_det = clustering(bowl_fp, bowl_cols)
print("\n~~~~~~~~~~~~~~~~~~~ BOWLER CLUSTER Details ~~~~~~~~~~~~~~~~~~~\n")
bowl_det.show(10)


~~~~~~~~~~~~~~~~~~~ BOWLER CLUSTER Details ~~~~~~~~~~~~~~~~~~~

+---------------+---------+---+----+-----------------+----+----+----+----+------------------+-------+------------------+---+---+---+---+--------------------+--------------------+----------+
|         Player|     Span|Mat|Inns|            Overs|Mdns|Runs|Wkts| BBI|               Ave|   Econ|                SR|  4|  5| Ct| St|            features|      scaledFeatures|prediction|
+---------------+---------+---+----+-----------------+----+----+----+----+------------------+-------+------------------+---+---+---+---+--------------------+--------------------+----------+
|     P Amarnath|200802008|  6|   6|             22.0|   0| 236|   7|2/29|             33.71|  10.72|              18.8|  0|  0|  2|  0|        [10.72,18.8]|[2.46217541390129...|         9|
|         MM Ali|202102021| 34|  27|             68.3|   0| 469|  16| 3/7|28.814999999999998|   6.74|              25.6|  0|  0| 11|  0|         [6.74,25.6]|[1.54804685538197.

In [0]:
#Counting number of bowlers in each cluster.
bowl_det.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  182|
|         6|   34|
|         3|   36|
|         5|   10|
|         9|   64|
|         4|   11|
|         8|   46|
|         7|   72|
|         2|   25|
|         0|  133|
+----------+-----+

