In [17]:
import os
import sys

# Point PySpark at the interpreter that is running this notebook
for _var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_var] = sys.executable


# Then proceed to import and use Spark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used throughout this notebook
spark = SparkSession.builder.appName("CricketPrediction").getOrCreate()


# Directory holding the raw CSV files. Set the CRICKET_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used.
directory = os.environ.get('CRICKET_DATA_DIR', r'D:\github\Cricket-Prediction\data\1_rawData')

# Read the T20 bowling statistics: the first row is the header and column
# types are inferred from the data.
bowling_data = spark.read.csv(
    os.path.join(directory, 't20_bowling_stats.csv'),
    header=True,
    inferSchema=True,
)
bowling_data.show(5)

+--------------------+---+----+-----+----+----+----+----+-----+----+----+---+---+-------+
|              Player|Mat|Inns|Overs|Mdns|Runs|Wkts| BBI|  Ave|Econ|  SR|  4|  5| Season|
+--------------------+---+----+-----+----+----+----+----+-----+----+----+---+---+-------+
|Zulqarnain Haider...|  2|   -|    -|   -|   -|   -|   -|    -|   -|   -|  -|  -|2010/11|
|Zulqarnain Haider...|  1|   -|    -|   -|   -|   -|   -|    -|   -|   -|  -|  -|2006/07|
|Zulqarnain Haider...|  1|   1|  3.0|   0|   7|   0|   -|    -|2.33|   -|  0|  0|2019/20|
|Zulqarnain Haider...|  6|   6| 15.0|   0|  91|   5|2/26|18.20|6.06|18.0|  0|  0|   2022|
|Zulqarnain Haider...|  4|   4| 10.0|   0|  53|   3| 1/7|17.66|5.30|20.0|  0|  0|   2019|
+--------------------+---+----+-----+----+----+----+----+-----+----+----+---+---+-------+
only showing top 5 rows



In [18]:
# Keep only the columns used below and order the rows by player, then season
bowling_data = (
    bowling_data
    .select("Player", "Season", "Mat", "Inns", "Overs", "Runs", "Wkts", "Econ")
    .orderBy("Player", "Season")
)
bowling_data.show(5)

+----------------+-------+---+----+-----+----+----+-----+
|          Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|
+----------------+-------+---+----+-----+----+----+-----+
|A Ahmadhel (BUL)|2019/20|  3|   3| 10.0|  75|   3|  7.5|
|A Ahmadhel (BUL)|   2020|  1|   1|  2.0|  22|   1|11.00|
|A Ahmadhel (BUL)|2020/21|  2|   2|  2.4|  27|   2|10.12|
|A Ahmadhel (BUL)|   2021|  3|   2|  3.0|  31|   0|10.33|
|A Ahmadhel (BUL)|   2023|  2|   -|    -|   -|   -|    -|
+----------------+-------+---+----+-----+----+----+-----+
only showing top 5 rows



In [19]:
from pyspark.sql.functions import when, col
# The source marks "no value" with "-". Replace it with "0" and cast each
# counting stat to float, one column at a time (same order as before).
for _stat in ("Overs", "Wkts", "Inns", "Runs"):
    bowling_data = bowling_data.withColumn(
        _stat,
        when(col(_stat) == "-", "0").otherwise(col(_stat)).cast("float"),
    )

# An Econ of "-" falls back to Runs / Inns. The division is guarded so that
# zero innings gives 0 directly instead of depending on divide-by-zero
# producing NULL (which raises when Spark's ANSI mode is enabled).
# NOTE(review): economy rate is normally runs per over, not per innings -
# confirm this fallback is intended.
bowling_data = bowling_data.withColumn(
    "Econ",
    when(
        col("Econ") == "-",
        when(col("Inns") != 0, col("Runs") / col("Inns")).otherwise(0),
    ).otherwise(col("Econ")).cast("float"),
).fillna(0)  # no subset given, so remaining NULLs in every numeric column become 0
bowling_data.show(5)

+----------------+-------+---+----+-----+----+----+-----+
|          Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|
+----------------+-------+---+----+-----+----+----+-----+
|A Ahmadhel (BUL)|2019/20|  3| 3.0| 10.0|75.0| 3.0|  7.5|
|A Ahmadhel (BUL)|   2020|  1| 1.0|  2.0|22.0| 1.0| 11.0|
|A Ahmadhel (BUL)|2020/21|  2| 2.0|  2.4|27.0| 2.0|10.12|
|A Ahmadhel (BUL)|   2021|  3| 2.0|  3.0|31.0| 0.0|10.33|
|A Ahmadhel (BUL)|   2023|  2| 0.0|  0.0| 0.0| 0.0|  0.0|
+----------------+-------+---+----+-----+----+----+-----+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import regexp_extract, regexp_replace

# Split the raw "Name (CODE)" value into a country code and a bare player name.
# Country must be derived first, because the second step overwrites Player.
bowling_data = (
    bowling_data
    # Text inside the first pair of parentheses
    .withColumn("Country", regexp_extract(col("Player"), r"\((.*?)\)", 1))
    # Everything before the first whitespace-plus-"(" sequence
    .withColumn("Player", regexp_extract(col("Player"), r"^(.*?)\s\(", 1))
)

# Display the transformed rows
bowling_data.show()

+--------------+-------+---+----+-----+-----+----+-----+-------+
|        Player| Season|Mat|Inns|Overs| Runs|Wkts| Econ|Country|
+--------------+-------+---+----+-----+-----+----+-----+-------+
|    A Ahmadhel|2019/20|  3| 3.0| 10.0| 75.0| 3.0|  7.5|    BUL|
|    A Ahmadhel|   2020|  1| 1.0|  2.0| 22.0| 1.0| 11.0|    BUL|
|    A Ahmadhel|2020/21|  2| 2.0|  2.4| 27.0| 2.0|10.12|    BUL|
|    A Ahmadhel|   2021|  3| 2.0|  3.0| 31.0| 0.0|10.33|    BUL|
|    A Ahmadhel|   2023|  2| 0.0|  0.0|  0.0| 0.0|  0.0|    BUL|
|    A Ahmadhel|   2024|  1| 0.0|  0.0|  0.0| 0.0|  0.0|    BUL|
|   A Alexander|2024/25|  1| 1.0|  3.3| 19.0| 2.0| 5.42|    INA|
|       A Amado|   2022|  3| 3.0|  6.0| 58.0| 3.0| 9.66|    ISR|
|     A Andrews|2021/22|  3| 3.0| 10.0| 61.0| 4.0|  6.1|    SUI|
|     A Andrews|   2022|  4| 1.0|  1.0| 21.0| 0.0| 21.0|    SUI|
|A Anemogiannis|2019/20|  1| 0.0|  0.0|  0.0| 0.0|  0.0|    GRC|
|       A Ashok|   2023|  1| 1.0|  4.0| 28.0| 1.0|  7.0|     NZ|
|     A Ashokan|2019/20| 

In [21]:
from pyspark.sql import Window
from pyspark.sql.functions import col, sum as spark_sum, when, row_number, round

# Frame holding every earlier row of the same player/country (ordered by
# Season), excluding the current row
window_spec = (
    Window.partitionBy("Player", "Country")
    .orderBy("Season")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# Same partitioning and ordering without a frame; used only to number the rows
row_num_window = Window.partitionBy("Player", "Country").orderBy("Season")


def _prior_total(expr):
    """Sum of `expr` over the rows that precede the current one in `window_spec`."""
    return spark_sum(expr).over(window_spec)


def _zero_on_first_row(value):
    """Return 0 on the first row of a partition (row_num == 1), else `value`."""
    return when(col("row_num") == 1, 0).otherwise(value)


# Temporary row number that marks the first row per player and country
bowling_data = bowling_data.withColumn("row_num", row_number().over(row_num_window))

# Running totals of the counting stats, taken over the preceding rows only
for _stat in ("Overs", "Wkts", "Runs", "Inns"):
    bowling_data = bowling_data.withColumn(
        f"Cumulative {_stat}",
        _zero_on_first_row(_prior_total(_stat)),
    )

# Innings-weighted mean of Econ over the preceding rows, rounded to 2 places;
# 0 when the preceding innings sum to zero
bowling_data = bowling_data.withColumn(
    "Cumulative Econ",
    _zero_on_first_row(
        round(
            when(
                _prior_total("Inns") != 0,
                _prior_total(col("Inns") * col("Econ")) / _prior_total("Inns"),
            ).otherwise(0),
            2,
        )
    ),
).drop("row_num")  # the helper column is no longer needed

# Preview the result
bowling_data.show(5)

+----------+-------+---+----+-----+----+----+-----+-------+------------------+---------------+---------------+---------------+---------------+
|    Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|Country|  Cumulative Overs|Cumulative Wkts|Cumulative Runs|Cumulative Inns|Cumulative Econ|
+----------+-------+---+----+-----+----+----+-----+-------+------------------+---------------+---------------+---------------+---------------+
|A Ahmadhel|2019/20|  3| 3.0| 10.0|75.0| 3.0|  7.5|    BUL|               0.0|            0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|   2020|  1| 1.0|  2.0|22.0| 1.0| 11.0|    BUL|              10.0|            3.0|           75.0|            3.0|            7.5|
|A Ahmadhel|2020/21|  2| 2.0|  2.4|27.0| 2.0|10.12|    BUL|              12.0|            4.0|           97.0|            4.0|           8.38|
|A Ahmadhel|   2021|  3| 2.0|  3.0|31.0| 0.0|10.33|    BUL|14.400000095367432|            6.0|          124.0|            6.0|           8.96|

In [22]:
# Mapping from the team code found in parentheses after a player's name to the
# full country name. Codes are kept exactly as they appear in the data
# (including mixed-case variants such as 'Bhm' and 'BHM'). Each key appears once.
country_codes = {
    'LES': 'Lesotho',
    'BUL': 'Bulgaria',
    'VAN': 'Vanuatu',
    'ROM': 'Romania',
    'Aut': 'Austria',
    'COK': 'Cook Islands',
    'Fran': 'France',
    'SRB': 'Serbia',
    'PAK': 'Pakistan',
    'HUN': 'Hungary',
    'CYP': 'Cyprus',
    'Fiji': 'Fiji',
    'FIN': 'Finland',
    'EST': 'Estonia',
    'CHN': 'China',
    'GRC': 'Greece',
    'CAM': 'Cambodia',
    'GUE': 'Guernsey',
    'SEY': 'Seychelles',
    'JPN': 'Japan',
    'TAN': 'Tanzania',
    'JER': 'Jersey',
    'QAT': 'Qatar',
    'ENG': 'England',
    'UGA': 'Uganda',
    'BER': 'Bermuda',
    'CZK-R': 'Czech Republic',
    'CAY': 'Cayman Islands',
    'IRE': 'Ireland',
    'Mali': 'Mali',
    'BRA': 'Brazil',
    'SUI': 'Switzerland',
    'Peru': 'Peru',
    'Mex': 'Mexico',
    'MOZ': 'Mozambique',
    'Samoa': 'Samoa',
    'HKG': 'Hong Kong',
    'BAN': 'Bangladesh',
    'SL': 'Sri Lanka',
    'PNG': 'Papua New Guinea',
    'ZIM': 'Zimbabwe',
    'GHA': 'Ghana',
    'SWZ': 'Eswatini',  # Swaziland's official name now is Eswatini
    'MYAN': 'Myanmar',
    'IND': 'India',
    'USA': 'United States of America',
    'NEP': 'Nepal',
    'AFG': 'Afghanistan',
    'PAN': 'Panama',
    'NGA': 'Nigeria',
    'SLE': 'Sierra Leone',
    'ESP': 'Spain',
    'Bhm': 'Bahamas',
    'TKY': 'Turkey',
    'MWI': 'Malawi',
    'WI': 'West Indies',
    'IOM': 'Isle of Man',
    'THA': 'Thailand',
    'SWA': 'Eswatini',  # another code for Eswatini
    'SKOR': 'South Korea',
    'GMB': 'Gambia',
    'ISR': 'Israel',
    'KUW': 'Kuwait',
    'Belg': 'Belgium',
    'GER': 'Germany',
    'ITA': 'Italy',
    'CAN': 'Canada',
    'MDV': 'Maldives',
    'Blz': 'Belize',
    'DEN': 'Denmark',
    'INA': 'Indonesia',
    'KENYA': 'Kenya',
    'LUX': 'Luxembourg',
    'STHEL': 'Saint Helena',
    'BHR': 'Bahrain',
    'KSA': 'Saudi Arabia',
    'MLT': 'Malta',
    'Arg': 'Argentina',
    'MNG': 'Mongolia',
    'AUS': 'Australia',
    'GIBR': 'Gibraltar',
    'SGP': 'Singapore',
    'Chile': 'Chile',
    'UAE': 'United Arab Emirates',
    'NZ': 'New Zealand',
    'SCOT': 'Scotland',
    'BHU': 'Bhutan',
    'MAS': 'Malaysia',
    'BOT': 'Botswana',
    'CRC': 'Costa Rica',
    'PHI': 'Philippines',
    'NAM': 'Namibia',
    'RWN': 'Rwanda',
    'OMA': 'Oman',
    'NOR': 'Norway',
    'CRT': 'Croatia',
    'SWE': 'Sweden',
    'Iran': 'Iran',
    'PORT': 'Portugal',
    'NED': 'Netherlands',
    'SA': 'South Africa',
    'SVN': 'Slovenia',
    'BHM': 'Bahamas',  # upper-case variant of 'Bhm'
}

# ICC and World teams
icc_world = {
    'ICC/PAK': 'Pakistan',
    'ICC/SL': 'Sri Lanka',
    'ICC/IND': 'India',
    'ICC/NEP': 'Nepal',
    'BAN/ICC': 'Bangladesh',
    'AFG/ICC': 'Afghanistan',
    'SL/World': 'Sri Lanka',
    'SA/World': 'South Africa',
    'AUS/World': 'Australia',
    'BAN/World': 'Bangladesh',
    'WI/World': 'West Indies',
}

# Outlier/Miscellaneous Countries
outlier_countries = {
    '1': 'Miscellaneous Country 1',
    '2': 'Miscellaneous Country 2',
    '3': 'Miscellaneous Country 3',
    'ICC': 'International Cricket Council',
    'World': 'World XI',
}

# Country codes excluding ICC, World teams, and miscellaneous entries.
# None of those keys currently occur in `country_codes`, so this yields the
# same mapping; the check stays in place in case such codes are added later.
filtered_countries = {
    code: country
    for code, country in country_codes.items()
    if code not in icc_world and code not in outlier_countries
}
len(country_codes), len(icc_world), len(outlier_countries), len(filtered_countries)

(103, 11, 5, 103)

In [23]:
# Keep only the rows whose country code is a key of `filtered_countries`
bowling_data = bowling_data.where(
    col('Country').isin(list(filtered_countries))
)
bowling_data.show(5)

+----------+-------+---+----+-----+----+----+-----+-------+------------------+---------------+---------------+---------------+---------------+
|    Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|Country|  Cumulative Overs|Cumulative Wkts|Cumulative Runs|Cumulative Inns|Cumulative Econ|
+----------+-------+---+----+-----+----+----+-----+-------+------------------+---------------+---------------+---------------+---------------+
|A Ahmadhel|2019/20|  3| 3.0| 10.0|75.0| 3.0|  7.5|    BUL|               0.0|            0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|   2020|  1| 1.0|  2.0|22.0| 1.0| 11.0|    BUL|              10.0|            3.0|           75.0|            3.0|            7.5|
|A Ahmadhel|2020/21|  2| 2.0|  2.4|27.0| 2.0|10.12|    BUL|              12.0|            4.0|           97.0|            4.0|           8.38|
|A Ahmadhel|   2021|  3| 2.0|  3.0|31.0| 0.0|10.33|    BUL|14.400000095367432|            6.0|          124.0|            6.0|           8.96|

In [24]:
# Swap each code in the Country column for the full name it maps to
bowling_data = bowling_data.replace(
    to_replace=filtered_countries,
    subset=['Country'],
)
bowling_data.show(5)

+----------+-------+---+----+-----+----+----+-----+--------+------------------+---------------+---------------+---------------+---------------+
|    Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ| Country|  Cumulative Overs|Cumulative Wkts|Cumulative Runs|Cumulative Inns|Cumulative Econ|
+----------+-------+---+----+-----+----+----+-----+--------+------------------+---------------+---------------+---------------+---------------+
|A Ahmadhel|2019/20|  3| 3.0| 10.0|75.0| 3.0|  7.5|Bulgaria|               0.0|            0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|   2020|  1| 1.0|  2.0|22.0| 1.0| 11.0|Bulgaria|              10.0|            3.0|           75.0|            3.0|            7.5|
|A Ahmadhel|2020/21|  2| 2.0|  2.4|27.0| 2.0|10.12|Bulgaria|              12.0|            4.0|           97.0|            4.0|           8.38|
|A Ahmadhel|   2021|  3| 2.0|  3.0|31.0| 0.0|10.33|Bulgaria|14.400000095367432|            6.0|          124.0|            6.0|         