In [47]:
import os
import sys

# Point both the Spark driver and its Python workers at the interpreter
# running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)


# Then proceed to import and use Spark
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start one for this app
spark = (
    SparkSession.builder
    .appName("CricketPrediction")
    .getOrCreate()
)

# Folder holding the raw CSV inputs
directory = r'D:\github\Cricket-Prediction\data\1_rawData'

# Read the T20 batting stats: first row is the header, column types are inferred
batting_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(os.path.join(directory, 't20_batting_stats.csv'))
)
batting_data.show(5)

+--------------------+---+----+---+----+---+----+---+-----+---+---+---+---+---+-------+
|              Player|Mat|Inns| NO|Runs| HS| Ave| BF|   SR|100| 50|  0| 4s| 6s| Season|
+--------------------+---+----+---+----+---+----+---+-----+---+---+---+---+---+-------+
|Zulqarnain Haider...|  2|   2|  0|  18| 17|9.00| 21|85.71|  0|  0|  0|  1|  0|2010/11|
|Zulqarnain Haider...|  1|   1|  0|   5|  5|5.00|  9|55.55|  0|  0|  0|  0|  0|2006/07|
|Zulqarnain Haider...|  1|   -|  -|   -|  -|   -|  -|    -|  -|  -|  -|  -|  -|2019/20|
|Zulqarnain Haider...|  4|   1|  0|   0|  0|0.00|  0|    -|  0|  0|  1|  0|  0|   2019|
|Zulqarnain Haider...|  6|   3|  1|   8|  4|4.00| 12|66.66|  0|  0|  0|  0|  0|   2022|
+--------------------+---+----+---+----+---+----+---+-----+---+---+---+---+---+-------+
only showing top 5 rows



In [48]:
# Keep only the columns used downstream, ordered by player and then by season
wanted_columns = ["Player", "Season", "Mat", "Inns", "Runs", "SR", "Ave"]
batting_data = batting_data.select(*wanted_columns).orderBy("Player", "Season")
batting_data.show(5)

+----------------+-------+---+----+----+------+----+
|          Player| Season|Mat|Inns|Runs|    SR| Ave|
+----------------+-------+---+----+----+------+----+
|A Ahmadhel (BUL)|2019/20|  3|   2|  16|100.00|8.00|
|A Ahmadhel (BUL)|   2020|  1|   1|   8|100.00|   -|
|A Ahmadhel (BUL)|2020/21|  2|   1|   2| 28.57|2.00|
|A Ahmadhel (BUL)|   2021|  3|   3|   5| 38.46|1.66|
|A Ahmadhel (BUL)|   2023|  2|   -|   -|     -|   -|
+----------------+-------+---+----+----+------+----+
only showing top 5 rows



In [49]:
from pyspark.sql.functions import when,col
# The raw sheet uses "-" where a stat is missing. Map "-" to zero and convert
# each column to a numeric type.
for column_name, target_type in (("Inns", "int"), ("Runs", "int"), ("SR", "float")):
    batting_data = batting_data.withColumn(
        column_name,
        when(col(column_name) == "-", "0").otherwise(col(column_name)).cast(target_type),
    )

# A "-" average is replaced by runs per innings. Rows with zero innings (e.g. a
# season with matches but no batting) would divide by zero: that yields NULL only
# when ANSI mode is off and raises when it is on. The guard skips the division
# for those rows and leaves NULL, which na.fill(0) then turns into 0.
runs_per_innings = when(col("Inns") != 0, col("Runs") / col("Inns"))
batting_data = batting_data.withColumn(
    "Ave",
    when(col("Ave") == "-", runs_per_innings).otherwise(col("Ave")).cast("float"),
).na.fill(0)  # na.fill(0) applies to every numeric column, not only Ave
batting_data.show(5)

+----------------+-------+---+----+----+-----+----+
|          Player| Season|Mat|Inns|Runs|   SR| Ave|
+----------------+-------+---+----+----+-----+----+
|A Ahmadhel (BUL)|2019/20|  3|   2|  16|100.0| 8.0|
|A Ahmadhel (BUL)|   2020|  1|   1|   8|100.0| 8.0|
|A Ahmadhel (BUL)|2020/21|  2|   1|   2|28.57| 2.0|
|A Ahmadhel (BUL)|   2021|  3|   3|   5|38.46|1.66|
|A Ahmadhel (BUL)|   2023|  2|   0|   0|  0.0| 0.0|
+----------------+-------+---+----+----+-----+----+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import regexp_extract, regexp_replace

# Player values look like "<name> (<code>)". Both expressions read the original
# Player text; Country is added first, then Player is overwritten with the bare name.
raw_player = col("Player")
country_code = regexp_extract(raw_player, r"\((.*?)\)", 1)   # text inside the first "(...)"
player_name = regexp_extract(raw_player, r"^(.*?)\s\(", 1)   # text before the first " ("

batting_data = (
    batting_data
    .withColumn("Country", country_code)
    .withColumn("Player", player_name)
)

# Preview the split columns
batting_data.show(5)

DataFrame[Player: string, Season: string, Mat: int, Inns: int, Runs: int, SR: float, Ave: float, Country: string]

In [51]:
from pyspark.sql import Window
from pyspark.sql.functions import col, sum as spark_sum, when, row_number, round

# Rows are grouped per (Player, Country) and walked in Season order.
player_seasons = Window.partitionBy("Player", "Country").orderBy("Season")

# Frame holding every season strictly before the current row
window_spec = player_seasons.rowsBetween(Window.unboundedPreceding, -1)

# Unframed window used to number each player's seasons
row_num_window = player_seasons

# A player's first season has no earlier rows, so all of its cumulative values are forced to 0.
is_first_season = col("row_num") == 1

# Totals over the previous seasons only
prior_mat = spark_sum("Mat").over(window_spec)
prior_inns = spark_sum("Inns").over(window_spec)
prior_runs = spark_sum("Runs").over(window_spec)


def _innings_weighted(metric):
    """Innings-weighted mean of `metric` over prior seasons, rounded to 2 places (0 when no prior innings)."""
    weighted_mean = spark_sum(col("Inns") * col(metric)).over(window_spec) / prior_inns
    return round(when(prior_inns != 0, weighted_mean).otherwise(0), 2)


batting_data = (
    batting_data
    .withColumn("row_num", row_number().over(row_num_window))
    .withColumn("Cum Mat Total", when(is_first_season, 0).otherwise(prior_mat))
    .withColumn("Cum Inns Total", when(is_first_season, 0).otherwise(prior_inns))
    .withColumn("Cum Runs Total", when(is_first_season, 0).otherwise(prior_runs))
    .withColumn("Cum Batting Ave", when(is_first_season, 0).otherwise(_innings_weighted("Ave")))
    .withColumn("Cum SR", when(is_first_season, 0).otherwise(_innings_weighted("SR")))
    .drop("row_num")  # helper column only needed while building the features
)

# Preview the engineered columns
batting_data.show(5)

+----------+-------+---+----+----+-----+----+-------+-------------+--------------+--------------+---------------+------+
|    Player| Season|Mat|Inns|Runs|   SR| Ave|Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|
+----------+-------+---+----+----+-----+----+-------+-------------+--------------+--------------+---------------+------+
|A Ahmadhel|2019/20|  3|   2|  16|100.0| 8.0|    BUL|            0|             0|             0|            0.0|   0.0|
|A Ahmadhel|   2020|  1|   1|   8|100.0| 8.0|    BUL|            3|             2|            16|            8.0| 100.0|
|A Ahmadhel|2020/21|  2|   1|   2|28.57| 2.0|    BUL|            4|             3|            24|            8.0| 100.0|
|A Ahmadhel|   2021|  3|   3|   5|38.46|1.66|    BUL|            6|             4|            26|            6.5| 82.14|
|A Ahmadhel|   2023|  2|   0|   0|  0.0| 0.0|    BUL|            9|             7|            31|           4.43| 63.42|
+----------+-------+---+----+---

In [52]:
# Keep the identifying columns plus the cumulative features; the per-season stats are dropped
output_columns = [
    'Player',
    'Country',
    'Season',
    'Cum Mat Total',
    'Cum Inns Total',
    'Cum Runs Total',
    'Cum Batting Ave',
    'Cum SR',
]
batting_data = batting_data.select(*output_columns)
batting_data.show(5)

+----------+-------+-------+-------------+--------------+--------------+---------------+------+
|    Player|Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|
+----------+-------+-------+-------------+--------------+--------------+---------------+------+
|A Ahmadhel|    BUL|2019/20|            0|             0|             0|            0.0|   0.0|
|A Ahmadhel|    BUL|   2020|            3|             2|            16|            8.0| 100.0|
|A Ahmadhel|    BUL|2020/21|            4|             3|            24|            8.0| 100.0|
|A Ahmadhel|    BUL|   2021|            6|             4|            26|            6.5| 82.14|
|A Ahmadhel|    BUL|   2023|            9|             7|            31|           4.43| 63.42|
+----------+-------+-------+-------------+--------------+--------------+---------------+------+
only showing top 5 rows



In [53]:
# Spot-check the cumulative values for a single player
batting_data.where(col("Player").contains("V Kohli")).show()

+-------+-------+-------+-------------+--------------+--------------+---------------+------+
| Player|Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|
+-------+-------+-------+-------------+--------------+--------------+---------------+------+
|V Kohli|    IND|   2010|            0|             0|             0|            0.0|   0.0|
|V Kohli|    IND|2010/11|            2|             1|            26|           26.0| 123.8|
|V Kohli|    IND|   2011|            3|             2|            54|           27.0|135.58|
|V Kohli|    IND|2011/12|            5|             4|            72|           18.0|120.73|
|V Kohli|    IND|   2012|            9|             7|           140|           20.0|116.76|
|V Kohli|    IND|2012/13|           11|             9|           278|          30.89|125.27|
|V Kohli|    IND|2013/14|           20|            18|           558|          32.94|125.98|
|V Kohli|    IND|   2014|           27|            25|           906| 

In [54]:
# Team code (as it appears in parentheses after a player's name) -> country name.
# Each key appears once; codes that differ only in case (e.g. 'Bhm' / 'BHM') are
# distinct keys and are both kept.
country_codes = {
    'LES': 'Lesotho',
    'BUL': 'Bulgaria',
    'VAN': 'Vanuatu',
    'ROM': 'Romania',
    'Aut': 'Austria',
    'COK': 'Cook Islands',
    'Fran': 'France',
    'SRB': 'Serbia',
    'PAK': 'Pakistan',
    'HUN': 'Hungary',
    'CYP': 'Cyprus',
    'Fiji': 'Fiji',
    'FIN': 'Finland',
    'EST': 'Estonia',
    'CHN': 'China',
    'GRC': 'Greece',
    'CAM': 'Cambodia',
    'GUE': 'Guernsey',
    'SEY': 'Seychelles',
    'JPN': 'Japan',
    'TAN': 'Tanzania',
    'JER': 'Jersey',
    'QAT': 'Qatar',
    'ENG': 'England',
    'UGA': 'Uganda',
    'BER': 'Bermuda',
    'CZK-R': 'Czech Republic',
    'CAY': 'Cayman Islands',
    'IRE': 'Ireland',
    'Mali': 'Mali',
    'BRA': 'Brazil',
    'SUI': 'Switzerland',
    'Peru': 'Peru',
    'Mex': 'Mexico',
    'MOZ': 'Mozambique',
    'Samoa': 'Samoa',
    'HKG': 'Hong Kong',
    'BAN': 'Bangladesh',
    'SL': 'Sri Lanka',
    'PNG': 'Papua New Guinea',
    'ZIM': 'Zimbabwe',
    'GHA': 'Ghana',
    'SWZ': 'Eswatini',  # Swaziland's official name now is Eswatini
    'MYAN': 'Myanmar',
    'IND': 'India',
    'USA': 'United States of America',
    'NEP': 'Nepal',
    'AFG': 'Afghanistan',
    'PAN': 'Panama',
    'NGA': 'Nigeria',
    'SLE': 'Sierra Leone',
    'ESP': 'Spain',
    'Bhm': 'Bahamas',
    'TKY': 'Turkey',
    'MWI': 'Malawi',
    'WI': 'West Indies',
    'IOM': 'Isle of Man',
    'THA': 'Thailand',
    'SWA': 'Eswatini',  # another code for Eswatini
    'SKOR': 'South Korea',
    'GMB': 'Gambia',
    'ISR': 'Israel',
    'KUW': 'Kuwait',
    'Belg': 'Belgium',
    'GER': 'Germany',
    'ITA': 'Italy',
    'CAN': 'Canada',
    'MDV': 'Maldives',
    'Blz': 'Belize',
    'DEN': 'Denmark',
    'INA': 'Indonesia',
    'KENYA': 'Kenya',
    'LUX': 'Luxembourg',
    'STHEL': 'Saint Helena',
    'BHR': 'Bahrain',
    'KSA': 'Saudi Arabia',
    'MLT': 'Malta',
    'Arg': 'Argentina',
    'MNG': 'Mongolia',
    'AUS': 'Australia',
    'GIBR': 'Gibraltar',
    'SGP': 'Singapore',
    'Chile': 'Chile',
    'UAE': 'United Arab Emirates',
    'NZ': 'New Zealand',
    'SCOT': 'Scotland',
    'BHU': 'Bhutan',
    'MAS': 'Malaysia',
    'BOT': 'Botswana',
    'CRC': 'Costa Rica',
    'PHI': 'Philippines',
    'NAM': 'Namibia',
    'RWN': 'Rwanda',
    'OMA': 'Oman',
    'NOR': 'Norway',
    'CRT': 'Croatia',
    'SWE': 'Sweden',
    'Iran': 'Iran',
    'PORT': 'Portugal',
    'NED': 'Netherlands',
    'SA': 'South Africa',
    'SVN': 'Slovenia',
    'BHM': 'Bahamas',  # upper-case variant of 'Bhm'
}

# ICC and World teams
icc_world = {
    'ICC/PAK': 'Pakistan',
    'ICC/SL': 'Sri Lanka',
    'ICC/IND': 'India',
    'ICC/NEP': 'Nepal',
    'BAN/ICC': 'Bangladesh',
    'AFG/ICC': 'Afghanistan',
    'SL/World': 'Sri Lanka',
    'SA/World': 'South Africa',
    'AUS/World': 'Australia',
    'BAN/World': 'Bangladesh',
    'WI/World': 'West Indies',
}

# Outlier/Miscellaneous Countries
outlier_countries = {
    '1': 'Miscellaneous Country 1',
    '2': 'Miscellaneous Country 2',
    '3': 'Miscellaneous Country 3',
    'ICC': 'International Cricket Council',
    'World': 'World XI',
}

# Country codes excluding ICC, World teams, and miscellaneous entries.
# None of those codes currently appear in country_codes, so this keeps every
# entry; the check protects against one being added to country_codes later.
filtered_countries = {
    code: country
    for code, country in country_codes.items()
    if code not in icc_world and code not in outlier_countries
}
len(country_codes), len(icc_world), len(outlier_countries), len(filtered_countries)

(103, 11, 5, 103)

In [55]:
# Keep only rows whose extracted code is one of the recognised country codes
recognised_codes = list(filtered_countries)
batting_data = batting_data.filter(col('Country').isin(recognised_codes))
batting_data.show(5)

+----------+-------+-------+-------------+--------------+--------------+---------------+------+
|    Player|Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|
+----------+-------+-------+-------------+--------------+--------------+---------------+------+
|A Ahmadhel|    BUL|2019/20|            0|             0|             0|            0.0|   0.0|
|A Ahmadhel|    BUL|   2020|            3|             2|            16|            8.0| 100.0|
|A Ahmadhel|    BUL|2020/21|            4|             3|            24|            8.0| 100.0|
|A Ahmadhel|    BUL|   2021|            6|             4|            26|            6.5| 82.14|
|A Ahmadhel|    BUL|   2023|            9|             7|            31|           4.43| 63.42|
+----------+-------+-------+-------------+--------------+--------------+---------------+------+
only showing top 5 rows



In [56]:
# Swap each country code in the Country column for its country name
batting_data = batting_data.na.replace(filtered_countries, subset=['Country'])
batting_data.show(5)

+----------+--------+-------+-------------+--------------+--------------+---------------+------+
|    Player| Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|
+----------+--------+-------+-------------+--------------+--------------+---------------+------+
|A Ahmadhel|Bulgaria|2019/20|            0|             0|             0|            0.0|   0.0|
|A Ahmadhel|Bulgaria|   2020|            3|             2|            16|            8.0| 100.0|
|A Ahmadhel|Bulgaria|2020/21|            4|             3|            24|            8.0| 100.0|
|A Ahmadhel|Bulgaria|   2021|            6|             4|            26|            6.5| 82.14|
|A Ahmadhel|Bulgaria|   2023|            9|             7|            31|           4.43| 63.42|
+----------+--------+-------+-------------+--------------+--------------+---------------+------+
only showing top 5 rows



In [57]:
(
    batting_data.select('Player', 'Country').distinct().count(),  # distinct (player, country) pairs
    batting_data.select('Player').distinct().count(),             # distinct player names
    batting_data.distinct().count(),                              # distinct rows
)

(4129, 4074, 14035)

In [58]:
# Load the player lookup table and rename its key columns to match batting_data
players_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(r'D:\github\Cricket-Prediction\data\2_processedData\Players.csv')
    .withColumnRenamed("player", "Player")
    .withColumnRenamed("country", "Country")
)
players_data.show()

+---------------+--------------------+---------+
|         Player|             Country|player_id|
+---------------+--------------------+---------+
|   M Rathnayake|               Italy| a969dc4c|
|          L Vua|                Fiji| 978a63d4|
|       B Shetty|                Oman| 58c16165|
|Santiago Iturbe|           Argentina| 851b0636|
|     CT Dhururu|            Zimbabwe| 8d7fba79|
|      C Olphert|             Ireland| 7afc806f|
|     JE Titmuss|           Hong Kong| 4a6c0fbb|
|     VR Vanitha|               India| 2d963999|
|  Nestor Dhamba|                Oman| b878fc9a|
|        D Ganga|         West Indies| d5f3b5e1|
|     M Mooketsi|            Botswana| 008a024c|
|     Ali Rasool|            Bulgaria| 1a710f40|
| Ignacio Lisboa|               Chile| eceeb63a|
|    Adnan Ilyas|                Oman| d376bbaa|
|        R Kanai|               Japan| 54a3d69f|
|  Sanjida Islam|          Bangladesh| 12755a62|
|    Aaron Jones|United States of ...| 17aa4f1d|
|      S Qeshile|   

In [59]:
# Bring in the players_data columns by matching on name and country;
# an inner join drops batting rows that have no match.
batting_data = batting_data.join(players_data, on=['Player', 'Country'], how='inner')
batting_data.show(5)

+----------+--------+-------+-------------+--------------+--------------+---------------+------+---------+
|    Player| Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|player_id|
+----------+--------+-------+-------------+--------------+--------------+---------------+------+---------+
|A Ahmadhel|Bulgaria|2019/20|            0|             0|             0|            0.0|   0.0| 55a5cffb|
|A Ahmadhel|Bulgaria|   2020|            3|             2|            16|            8.0| 100.0| 55a5cffb|
|A Ahmadhel|Bulgaria|2020/21|            4|             3|            24|            8.0| 100.0| 55a5cffb|
|A Ahmadhel|Bulgaria|   2021|            6|             4|            26|            6.5| 82.14| 55a5cffb|
|A Ahmadhel|Bulgaria|   2023|            9|             7|            31|           4.43| 63.42| 55a5cffb|
+----------+--------+-------+-------------+--------------+--------------+---------------+------+---------+
only showing top 5 rows



In [60]:
# Convert the Spark result to a pandas DataFrame, then write it to CSV without the index column
batting_pdf = batting_data.toPandas()
batting_pdf.to_csv(r'D:\github\Cricket-Prediction\data\2_processedData\batting.csv', index=False)