In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


# Then proceed to import and use Spark
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("CricketPrediction").getOrCreate()


# Specify the directory where your CSV files are located
directory = r'D:\github\Cricket-Prediction\data\1_rawData'
fielding_data = spark.read.csv(os.path.join(directory, 't20_fielding_stats.csv'), header=True, inferSchema=True)
fielding_data.show(5)

+--------------------+---+----+---+---+---+-----+-----+-----------+-----+-------+
|              Player|Mat|Inns|Dis| Ct| St|Ct Wk|Ct Fi|         MD|  D/I| Season|
+--------------------+---+----+---+---+---+-----+-----+-----------+-----+-------+
|Zulqarnain Haider...|  2|   2|  1|  0|  1|    0|    0|1 (0ct 1st)|  0.5|2010/11|
|Zulqarnain Haider...|  1|   1|  0|  0|  0|    0|    0|          0|  0.0|2006/07|
|Zulqarnain Haider...|  6|   6|  0|  0|  0|    0|    0|          0|  0.0|   2022|
|Zulqarnain Haider...|  1|   1|  0|  0|  0|    0|    0|          0|0.000|2019/20|
|Zulqarnain Haider...|  4|   4|  1|  1|  0|    0|    1|1 (1ct 0st)| 0.25|   2019|
+--------------------+---+----+---+---+---+-----+-----+-----------+-----+-------+
only showing top 5 rows



In [2]:
fielding_data = fielding_data.select(['Player',"Mat","Inns","Dis","Ct","St","D/I","Season"]).sort(["Player","Season"])
fielding_data.show(5)

+----------------+---+----+---+---+---+-----+-------+
|          Player|Mat|Inns|Dis| Ct| St|  D/I| Season|
+----------------+---+----+---+---+---+-----+-------+
|A Ahmadhel (BUL)|  3|   3|  0|  0|  0|  0.0|2019/20|
|A Ahmadhel (BUL)|  1|   1|  0|  0|  0|0.000|   2020|
|A Ahmadhel (BUL)|  2|   2|  0|  0|  0|0.000|2020/21|
|A Ahmadhel (BUL)|  3|   3|  0|  0|  0|  0.0|   2021|
|A Ahmadhel (BUL)|  2|   1|  0|  0|  0|  0.0|   2023|
+----------------+---+----+---+---+---+-----+-------+
only showing top 5 rows



In [3]:
from pyspark.sql.functions import col, when
fielding_data = fielding_data.withColumn('Inns', when(col('Inns') == '-', '0').otherwise(col('Inns')).cast('float'))
fielding_data = fielding_data.withColumn('Dis', when(col('Dis') == '-', '0').otherwise(col('Dis')).cast('float'))
fielding_data = fielding_data.withColumn('Ct', when(col('Ct') == '-', '0').otherwise(col('Ct')).cast('float'))
fielding_data = fielding_data.withColumn('St', when(col('St') == '-', '0').otherwise(col('St')).cast('float'))
fielding_data = fielding_data.withColumn('D/I', when(col('D/I') == '-', col('Dis')/col('Inns')).otherwise(col('D/I')).cast('float')).fillna(0)
fielding_data.show(5)

+----------------+---+----+---+---+---+---+-------+
|          Player|Mat|Inns|Dis| Ct| St|D/I| Season|
+----------------+---+----+---+---+---+---+-------+
|A Ahmadhel (BUL)|  3| 3.0|0.0|0.0|0.0|0.0|2019/20|
|A Ahmadhel (BUL)|  1| 1.0|0.0|0.0|0.0|0.0|   2020|
|A Ahmadhel (BUL)|  2| 2.0|0.0|0.0|0.0|0.0|2020/21|
|A Ahmadhel (BUL)|  3| 3.0|0.0|0.0|0.0|0.0|   2021|
|A Ahmadhel (BUL)|  2| 1.0|0.0|0.0|0.0|0.0|   2023|
+----------------+---+----+---+---+---+---+-------+
only showing top 5 rows



In [4]:
from pyspark.sql.functions import regexp_extract, regexp_replace

# Extract the country name between '(' and ')'
fielding_data = fielding_data.withColumn("Country", regexp_extract(col("Player"), r"\((.*?)\)", 1))

# Extract the player's name before the first '('
fielding_data = fielding_data.withColumn("Player", regexp_extract(col("Player"), r"^(.*?)\s\(", 1))

# Show the result
fielding_data.show(5)

+----------+---+----+---+---+---+---+-------+-------+
|    Player|Mat|Inns|Dis| Ct| St|D/I| Season|Country|
+----------+---+----+---+---+---+---+-------+-------+
|A Ahmadhel|  3| 3.0|0.0|0.0|0.0|0.0|2019/20|    BUL|
|A Ahmadhel|  1| 1.0|0.0|0.0|0.0|0.0|   2020|    BUL|
|A Ahmadhel|  2| 2.0|0.0|0.0|0.0|0.0|2020/21|    BUL|
|A Ahmadhel|  3| 3.0|0.0|0.0|0.0|0.0|   2021|    BUL|
|A Ahmadhel|  2| 1.0|0.0|0.0|0.0|0.0|   2023|    BUL|
+----------+---+----+---+---+---+---+-------+-------+
only showing top 5 rows



In [5]:
from pyspark.sql import Window
from pyspark.sql.functions import col, sum as spark_sum, when, row_number, round

# Define the window specification for cumulative calculations
window_spec = Window.partitionBy("Player", "Country").orderBy("Season").rowsBetween(Window.unboundedPreceding, -1)

# Window for row number to identify the first row per player and country
row_num_window = Window.partitionBy("Player", "Country").orderBy("Season")

# Perform cumulative calculations with conditions
fielding_data = fielding_data.withColumn("row_num", row_number().over(row_num_window)) \
    .withColumn("Cumulative Dis", 
                when(col("row_num") == 1, 0)  # Set 0 for the first row (before any match)
                .otherwise(spark_sum("Dis").over(window_spec))) \
    .withColumn("Cumulative Ct", 
                when(col("row_num") == 1, 0)
                .otherwise(spark_sum("Ct").over(window_spec))) \
    .withColumn("Cumulative St", 
                when(col("row_num") == 1, 0)
                .otherwise(spark_sum("St").over(window_spec))) \
    .withColumn("Cumulative D/I", 
                when(col("row_num") == 1, 0)
                .otherwise(
                    round(
                        when(spark_sum("Inns").over(window_spec) != 0, 
                             spark_sum("Dis").over(window_spec) / spark_sum("Inns").over(window_spec))
                        .otherwise(0), 2)
                )
    ) \
    .drop("row_num")  # Drop the temporary row number column

# Show the resulting DataFrame
fielding_data.show(10)

+-----------+---+----+---+---+---+-----+-------+-------+--------------+-------------+-------------+--------------+
|     Player|Mat|Inns|Dis| Ct| St|  D/I| Season|Country|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+-----------+---+----+---+---+---+-----+-------+-------+--------------+-------------+-------------+--------------+
| A Ahmadhel|  3| 3.0|0.0|0.0|0.0|  0.0|2019/20|    BUL|           0.0|          0.0|          0.0|           0.0|
| A Ahmadhel|  1| 1.0|0.0|0.0|0.0|  0.0|   2020|    BUL|           0.0|          0.0|          0.0|           0.0|
| A Ahmadhel|  2| 2.0|0.0|0.0|0.0|  0.0|2020/21|    BUL|           0.0|          0.0|          0.0|           0.0|
| A Ahmadhel|  3| 3.0|0.0|0.0|0.0|  0.0|   2021|    BUL|           0.0|          0.0|          0.0|           0.0|
| A Ahmadhel|  2| 1.0|0.0|0.0|0.0|  0.0|   2023|    BUL|           0.0|          0.0|          0.0|           0.0|
| A Ahmadhel|  1| 1.0|1.0|1.0|0.0|  1.0|   2024|    BUL|           0.0|         

In [7]:
# calculate null count
from pyspark.sql.functions import isnan, when, count

fielding_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in fielding_data.columns]).show()

+------+---+----+---+---+---+---+------+-------+--------------+-------------+-------------+--------------+
|Player|Mat|Inns|Dis| Ct| St|D/I|Season|Country|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+------+---+----+---+---+---+---+------+-------+--------------+-------------+-------------+--------------+
|     0|  0|   0|  0|  0|  0|  0|     0|      0|             0|            0|            0|             0|
+------+---+----+---+---+---+---+------+-------+--------------+-------------+-------------+--------------+



In [8]:
country_codes = {
    'LES': 'Lesotho',
    'BUL': 'Bulgaria',
    'VAN': 'Vanuatu',
    'ROM': 'Romania',
    'Aut': 'Austria',
    'COK': 'Cook Islands',
    'Fran': 'France',
    'SRB': 'Serbia',
    'PAK': 'Pakistan',
    'HUN': 'Hungary',
    'CYP': 'Cyprus',
    'Fiji': 'Fiji',
    'FIN': 'Finland',
    'EST': 'Estonia',
    'CHN': 'China',
    'GRC': 'Greece',
    'CAM': 'Cambodia',
    'GUE': 'Guernsey',
    'SEY': 'Seychelles',
    'JPN': 'Japan',
    'TAN': 'Tanzania',
    'JER': 'Jersey',
    'QAT': 'Qatar',
    'ENG': 'England',
    'UGA': 'Uganda',
    'BER': 'Bermuda',
    'CZK-R': 'Czech Republic',
    'CAY': 'Cayman Islands',
    'IRE': 'Ireland',
    'Mali': 'Mali',
    'BRA': 'Brazil',
    'SUI': 'Switzerland',
    'Peru': 'Peru',
    'Mex': 'Mexico',
    'MOZ': 'Mozambique',
    'Samoa': 'Samoa',
    'HKG': 'Hong Kong',
    'BAN': 'Bangladesh',
    'SL': 'Sri Lanka',
    'PNG': 'Papua New Guinea',
    'ZIM': 'Zimbabwe',
    'GHA': 'Ghana',
    'SWZ': 'Eswatini',  # Swaziland's official name now is Eswatini
    'MYAN': 'Myanmar',
    'IND': 'India',
    'USA': 'United States of America',
    'NEP': 'Nepal',
    'AFG': 'Afghanistan',
    'PAN': 'Panama',
    'NGA': 'Nigeria',
    'SLE': 'Sierra Leone',
    'ESP': 'Spain',
    'Bhm': 'Bahamas',
    'TKY': 'Turkey',
    'MWI': 'Malawi',
    'WI': 'West Indies',
    'IOM': 'Isle of Man',
    'THA': 'Thailand',
    'SWA': 'Eswatini',  # another code for Eswatini
    'SKOR': 'South Korea',
    'GMB': 'Gambia',
    'ISR': 'Israel',
    'KUW': 'Kuwait',
    'Belg': 'Belgium',
    'GER': 'Germany',
    'ITA': 'Italy',
    'CAN': 'Canada',
    'MDV': 'Maldives',
    'Blz': 'Belize',
    'DEN': 'Denmark',
    'INA': 'Indonesia',
    'KENYA': 'Kenya',
    'LUX': 'Luxembourg',
    'STHEL': 'Saint Helena',
    'BHR': 'Bahrain',
    'KSA': 'Saudi Arabia',
    'MLT': 'Malta',
    'Arg': 'Argentina',
    'MNG': 'Mongolia',
    'AUS': 'Australia',
    'GIBR': 'Gibraltar',
    'SGP': 'Singapore',
    'Chile': 'Chile',
    'UAE': 'United Arab Emirates',
    'NZ': 'New Zealand',
    'SCOT': 'Scotland',
    'BHU': 'Bhutan',
    'MAS': 'Malaysia',
    'BOT': 'Botswana',
    'CRC': 'Costa Rica',
    'PHI': 'Philippines',
    'NAM': 'Namibia',
    'RWN': 'Rwanda',
    'OMA': 'Oman',
    'NOR': 'Norway',
    'CRT': 'Croatia',
    'SWE': 'Sweden',
    'Iran': 'Iran',
    'PORT': 'Portugal',
    'NED': 'Netherlands',
    'SA': 'South Africa',
    'SVN': 'Slovenia',
    'GUE': 'Guernsey',
    'MDV': 'Maldives',
    'BHM': 'Bahamas',
    'SWE': 'Sweden',
    'MLT': 'Malta',
    'ITA': 'Italy',
}

# ICC and World teams
icc_world = {
    'ICC/PAK': 'Pakistan',
    'ICC/SL': 'Sri Lanka',
    'ICC/IND': 'India',
    'ICC/NEP': 'Nepal',
    'BAN/ICC': 'Bangladesh',
    'AFG/ICC': 'Afghanistan',
    'SL/World': 'Sri Lanka',
    'SA/World': 'South Africa',
    'AUS/World': 'Australia',
    'BAN/World': 'Bangladesh',
    'WI/World': 'West Indies',
}

# Outlier/Miscellaneous Countries
outlier_countries = {
    '1': 'Miscellaneous Country 1',
    '2': 'Miscellaneous Country 2',
    '3': 'Miscellaneous Country 3',
    'ICC': 'International Cricket Council',
    'World': 'World XI',
}

# Filtered country codes excluding ICC, World teams, and miscellaneous
filtered_countries = {
    code: country
    for code, country in country_codes.items()
    if code not in icc_world and code not in outlier_countries
}
len(country_codes), len(icc_world), len(outlier_countries), len(filtered_countries)

(103, 11, 5, 103)

In [9]:
fielding_data = fielding_data.filter(col('Country').isin(list(filtered_countries.keys())))
fielding_data.show(5)

+----------+---+----+---+---+---+---+-------+-------+--------------+-------------+-------------+--------------+
|    Player|Mat|Inns|Dis| Ct| St|D/I| Season|Country|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+----------+---+----+---+---+---+---+-------+-------+--------------+-------------+-------------+--------------+
|A Ahmadhel|  3| 3.0|0.0|0.0|0.0|0.0|2019/20|    BUL|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  1| 1.0|0.0|0.0|0.0|0.0|   2020|    BUL|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  2| 2.0|0.0|0.0|0.0|0.0|2020/21|    BUL|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  3| 3.0|0.0|0.0|0.0|0.0|   2021|    BUL|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  2| 1.0|0.0|0.0|0.0|0.0|   2023|    BUL|           0.0|          0.0|          0.0|           0.0|
+----------+---+----+---+---+---+---+-------+-------+--------------+-------------+-------------+--------

In [10]:
fielding_data = fielding_data.replace(filtered_countries,subset=['Country'])
fielding_data.show(5)

+----------+---+----+---+---+---+---+-------+--------+--------------+-------------+-------------+--------------+
|    Player|Mat|Inns|Dis| Ct| St|D/I| Season| Country|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+----------+---+----+---+---+---+---+-------+--------+--------------+-------------+-------------+--------------+
|A Ahmadhel|  3| 3.0|0.0|0.0|0.0|0.0|2019/20|Bulgaria|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  1| 1.0|0.0|0.0|0.0|0.0|   2020|Bulgaria|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  2| 2.0|0.0|0.0|0.0|0.0|2020/21|Bulgaria|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  3| 3.0|0.0|0.0|0.0|0.0|   2021|Bulgaria|           0.0|          0.0|          0.0|           0.0|
|A Ahmadhel|  2| 1.0|0.0|0.0|0.0|0.0|   2023|Bulgaria|           0.0|          0.0|          0.0|           0.0|
+----------+---+----+---+---+---+---+-------+--------+--------------+-------------+-------------

In [11]:
fielding_data[['Player','Country']].distinct().count(), fielding_data[['Player']].distinct().count(), fielding_data.distinct().count()

(4129, 4074, 14035)