In [None]:
import os
import sys

# Point both the Spark driver and the workers at the interpreter running this
# kernel so they do not pick up a different Python installation.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# PySpark is imported only after the interpreter variables above are set.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("CricketPrediction").getOrCreate()


# Directory holding the raw CSV files. The original local path stays the
# default; set the CRICKET_RAW_DATA_DIR environment variable to run the
# notebook against a checkout that lives somewhere else.
directory = os.environ.get('CRICKET_RAW_DATA_DIR', r'D:\github\Cricket-Prediction\data\1_rawData')

# Read the batting statistics with a header row and inferred column types.
batting_data = spark.read.csv(os.path.join(directory, 't20_batting_stats.csv'), header=True, inferSchema=True)
batting_data.show(5)

+--------------------+---+----+---+----+---+----+---+-----+---+---+---+---+---+-------+
|              Player|Mat|Inns| NO|Runs| HS| Ave| BF|   SR|100| 50|  0| 4s| 6s| Season|
+--------------------+---+----+---+----+---+----+---+-----+---+---+---+---+---+-------+
|Zulqarnain Haider...|  2|   2|  0|  18| 17|9.00| 21|85.71|  0|  0|  0|  1|  0|2010/11|
|Zulqarnain Haider...|  1|   1|  0|   5|  5|5.00|  9|55.55|  0|  0|  0|  0|  0|2006/07|
|Zulqarnain Haider...|  1|   -|  -|   -|  -|   -|  -|    -|  -|  -|  -|  -|  -|2019/20|
|Zulqarnain Haider...|  4|   1|  0|   0|  0|0.00|  0|    -|  0|  0|  1|  0|  0|   2019|
|Zulqarnain Haider...|  6|   3|  1|   8|  4|4.00| 12|66.66|  0|  0|  0|  0|  0|   2022|
+--------------------+---+----+---+----+---+----+---+-----+---+---+---+---+---+-------+
only showing top 5 rows



In [2]:
# Keep only the columns used by the later cells and order the rows by
# player, then by season.
batting_data = (
    batting_data
    .select(["Player", "Season", "Mat", "Inns", "Runs", "SR", "Ave"])
    .orderBy(["Player", "Season"])
)
batting_data.show(5)

+----------------+-------+---+----+----+------+----+
|          Player| Season|Mat|Inns|Runs|    SR| Ave|
+----------------+-------+---+----+----+------+----+
|A Ahmadhel (BUL)|2019/20|  3|   2|  16|100.00|8.00|
|A Ahmadhel (BUL)|   2020|  1|   1|   8|100.00|   -|
|A Ahmadhel (BUL)|2020/21|  2|   1|   2| 28.57|2.00|
|A Ahmadhel (BUL)|   2021|  3|   3|   5| 38.46|1.66|
|A Ahmadhel (BUL)|   2023|  2|   -|   -|     -|   -|
+----------------+-------+---+----+----+------+----+
only showing top 5 rows



In [3]:
from pyspark.sql.functions import when,col


def _dash_as_zero(column_name, target_type):
    """Return the named column with the "-" placeholder replaced by "0" and cast to `target_type`."""
    source = col(column_name)
    return when(source == "-", "0").otherwise(source).cast(target_type)


# "-" marks a missing value in the raw data; treat it as zero for the counters
# and for the strike rate.
batting_data = batting_data.withColumn("Inns", _dash_as_zero("Inns", "int"))
batting_data = batting_data.withColumn("Runs", _dash_as_zero("Runs", "int"))
batting_data = batting_data.withColumn("SR", _dash_as_zero("SR", "float"))

# A missing average is rebuilt as Runs / Inns. The division is only evaluated
# when Inns is non-zero, so it cannot raise a divide-by-zero error when ANSI
# mode is enabled; for Inns == 0 the inner `when` yields null, which the
# trailing na.fill(0) turns into 0 (together with any other remaining nulls
# in numeric columns).
batting_data = batting_data.withColumn(
    "Ave",
    when(col("Ave") == "-", when(col("Inns") != 0, col("Runs") / col("Inns")))
    .otherwise(col("Ave"))
    .cast("float"),
).na.fill(0)
batting_data.show(5)

+----------------+-------+---+----+----+-----+----+
|          Player| Season|Mat|Inns|Runs|   SR| Ave|
+----------------+-------+---+----+----+-----+----+
|A Ahmadhel (BUL)|2019/20|  3|   2|  16|100.0| 8.0|
|A Ahmadhel (BUL)|   2020|  1|   1|   8|100.0| 8.0|
|A Ahmadhel (BUL)|2020/21|  2|   1|   2|28.57| 2.0|
|A Ahmadhel (BUL)|   2021|  3|   3|   5|38.46|1.66|
|A Ahmadhel (BUL)|   2023|  2|   0|   0|  0.0| 0.0|
+----------------+-------+---+----+----+-----+----+
only showing top 5 rows



In [4]:
from pyspark.sql.functions import regexp_extract, regexp_replace

# Player values look like "A Ahmadhel (BUL)". The country code (text inside
# the first pair of parentheses) is pulled out first, because the second step
# overwrites Player with only the part that precedes " (".
# NOTE: re-running this cell on its own output blanks Country, since Player no
# longer contains the parenthesised code.
batting_data = (
    batting_data
    .withColumn("Country", regexp_extract(col("Player"), r"\((.*?)\)", 1))
    .withColumn("Player", regexp_extract(col("Player"), r"^(.*?)\s\(", 1))
)

# Display the resulting DataFrame
display(batting_data)

DataFrame[Player: string, Season: string, Mat: int, Inns: int, Runs: int, SR: float, Ave: float, Country: string]

In [5]:
from pyspark.sql import Window
from pyspark.sql.functions import col,sum as spark_sum,avg, round as spark_round, when, count

# Running window per (Player, Country): every row from the start of the
# partition up to and including the current row. Rows are ordered by Season so
# the cumulative values are deterministic; ordering by the partition keys alone
# makes every row a tie and leaves the accumulation order up to Spark.
# Season is a string such as "2019/20" or "2020"; plain string ordering gives
# "2019/20" < "2020" < "2020/21", which follows the calendar.
player_country_window = (
    Window.partitionBy('Player', 'Country')
    .orderBy('Season')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Window aggregates shared by the columns below.
cum_innings = spark_sum("Inns").over(player_country_window)
cum_runs = spark_sum("Runs").over(player_country_window)
cum_weighted_ave = spark_sum(col("Inns") * col("Ave")).over(player_country_window)
cum_sr_sum = spark_sum("SR").over(player_country_window)
cum_row_count = count("Inns").over(player_country_window)

# Cumulative batting average: innings-weighted mean of the per-season averages,
# 0 while the player has no innings yet.
batting_data = batting_data.withColumn(
    "Cum batting Ave",
    when(cum_innings != 0, spark_round(cum_weighted_ave / cum_innings, 2)).otherwise(0)
)

# Cumulative runs total
batting_data = batting_data.withColumn("Cum battings Runs Total", cum_runs)

# Cumulative innings total
batting_data = batting_data.withColumn("Cum Inns Total", cum_innings)

# Cumulative strike rate: unweighted mean of the per-season strike rates.
# NOTE(review): seasons whose SR was filled with 0 (no innings batted) still
# count in the denominator and pull this value down - confirm that is intended.
batting_data = batting_data.withColumn(
    "Cum SR",
    when(cum_row_count != 0, spark_round(cum_sr_sum / cum_row_count, 2)).otherwise(0)
)

# Show the resulting DataFrame
batting_data.show(5)

+----------+-------+---+----+----+-----+----+-------+---------------+-----------------------+--------------+------+
|    Player| Season|Mat|Inns|Runs|   SR| Ave|Country|Cum batting Ave|Cum battings Runs Total|Cum Inns Total|Cum SR|
+----------+-------+---+----+----+-----+----+-------+---------------+-----------------------+--------------+------+
|A Ahmadhel|2019/20|  3|   2|  16|100.0| 8.0|    BUL|            8.0|                     16|             2| 100.0|
|A Ahmadhel|   2020|  1|   1|   8|100.0| 8.0|    BUL|            8.0|                     24|             3| 100.0|
|A Ahmadhel|2020/21|  2|   1|   2|28.57| 2.0|    BUL|            6.5|                     26|             4| 76.19|
|A Ahmadhel|   2021|  3|   3|   5|38.46|1.66|    BUL|           4.43|                     31|             7| 66.76|
|A Ahmadhel|   2023|  2|   0|   0|  0.0| 0.0|    BUL|           4.43|                     31|             7| 53.41|
+----------+-------+---+----+----+-----+----+-------+---------------+---

In [6]:
# Spot check: show every row whose player name contains "V Kohli".
batting_data.where(col("Player").contains("V Kohli")).show()

+-------+-------+---+----+----+------+-----+-------+---------------+-----------------------+--------------+------+
| Player| Season|Mat|Inns|Runs|    SR|  Ave|Country|Cum batting Ave|Cum battings Runs Total|Cum Inns Total|Cum SR|
+-------+-------+---+----+----+------+-----+-------+---------------+-----------------------+--------------+------+
|V Kohli|   2010|  2|   1|  26| 123.8| 26.0|    IND|           26.0|                     26|             1| 123.8|
|V Kohli|2010/11|  1|   1|  28|147.36| 28.0|    IND|           27.0|                     54|             2|135.58|
|V Kohli|   2011|  2|   2|  18|105.88|  9.0|    IND|           18.0|                     72|             4|125.68|
|V Kohli|2011/12|  4|   3|  68|111.47|22.66|    IND|           20.0|                    140|             7|122.13|
|V Kohli|   2012|  2|   2| 138|155.05| 69.0|    IND|          30.89|                    278|             9|128.71|
|V Kohli|2012/13|  9|   9| 280|126.69| 35.0|    IND|          32.94|            

In [7]:
# Team code (as it appears in parentheses after the player name) -> country name.
# Every key is listed exactly once; some countries appear under more than one
# code (e.g. 'Bhm'/'BHM', 'SWZ'/'SWA').
country_codes = {
    'LES': 'Lesotho',
    'BUL': 'Bulgaria',
    'VAN': 'Vanuatu',
    'ROM': 'Romania',
    'Aut': 'Austria',
    'COK': 'Cook Islands',
    'Fran': 'France',
    'SRB': 'Serbia',
    'PAK': 'Pakistan',
    'HUN': 'Hungary',
    'CYP': 'Cyprus',
    'Fiji': 'Fiji',
    'FIN': 'Finland',
    'EST': 'Estonia',
    'CHN': 'China',
    'GRC': 'Greece',
    'CAM': 'Cambodia',
    'GUE': 'Guernsey',
    'SEY': 'Seychelles',
    'JPN': 'Japan',
    'TAN': 'Tanzania',
    'JER': 'Jersey',
    'QAT': 'Qatar',
    'ENG': 'England',
    'UGA': 'Uganda',
    'BER': 'Bermuda',
    'CZK-R': 'Czech Republic',
    'CAY': 'Cayman Islands',
    'IRE': 'Ireland',
    'Mali': 'Mali',
    'BRA': 'Brazil',
    'SUI': 'Switzerland',
    'Peru': 'Peru',
    'Mex': 'Mexico',
    'MOZ': 'Mozambique',
    'Samoa': 'Samoa',
    'HKG': 'Hong Kong',
    'BAN': 'Bangladesh',
    'SL': 'Sri Lanka',
    'PNG': 'Papua New Guinea',
    'ZIM': 'Zimbabwe',
    'GHA': 'Ghana',
    'SWZ': 'Eswatini',  # Swaziland's official name now is Eswatini
    'MYAN': 'Myanmar',
    'IND': 'India',
    'USA': 'United States of America',
    'NEP': 'Nepal',
    'AFG': 'Afghanistan',
    'PAN': 'Panama',
    'NGA': 'Nigeria',
    'SLE': 'Sierra Leone',
    'ESP': 'Spain',
    'Bhm': 'Bahamas',
    'TKY': 'Turkey',
    'MWI': 'Malawi',
    'WI': 'West Indies',
    'IOM': 'Isle of Man',
    'THA': 'Thailand',
    'SWA': 'Eswatini',  # another code for Eswatini
    'SKOR': 'South Korea',
    'GMB': 'Gambia',
    'ISR': 'Israel',
    'KUW': 'Kuwait',
    'Belg': 'Belgium',
    'GER': 'Germany',
    'ITA': 'Italy',
    'CAN': 'Canada',
    'MDV': 'Maldives',
    'Blz': 'Belize',
    'DEN': 'Denmark',
    'INA': 'Indonesia',
    'KENYA': 'Kenya',
    'LUX': 'Luxembourg',
    'STHEL': 'Saint Helena',
    'BHR': 'Bahrain',
    'KSA': 'Saudi Arabia',
    'MLT': 'Malta',
    'Arg': 'Argentina',
    'MNG': 'Mongolia',
    'AUS': 'Australia',
    'GIBR': 'Gibraltar',
    'SGP': 'Singapore',
    'Chile': 'Chile',
    'UAE': 'United Arab Emirates',
    'NZ': 'New Zealand',
    'SCOT': 'Scotland',
    'BHU': 'Bhutan',
    'MAS': 'Malaysia',
    'BOT': 'Botswana',
    'CRC': 'Costa Rica',
    'PHI': 'Philippines',
    'NAM': 'Namibia',
    'RWN': 'Rwanda',
    'OMA': 'Oman',
    'NOR': 'Norway',
    'CRT': 'Croatia',
    'SWE': 'Sweden',
    'Iran': 'Iran',
    'PORT': 'Portugal',
    'NED': 'Netherlands',
    'SA': 'South Africa',
    'SVN': 'Slovenia',
    'BHM': 'Bahamas',  # upper-case variant of 'Bhm'
}

# ICC and World teams
icc_world = {
    'ICC/PAK': 'Pakistan',
    'ICC/SL': 'Sri Lanka',
    'ICC/IND': 'India',
    'ICC/NEP': 'Nepal',
    'BAN/ICC': 'Bangladesh',
    'AFG/ICC': 'Afghanistan',
    'SL/World': 'Sri Lanka',
    'SA/World': 'South Africa',
    'AUS/World': 'Australia',
    'BAN/World': 'Bangladesh',
    'WI/World': 'West Indies',
}

# Outlier/Miscellaneous Countries
outlier_countries = {
    '1': 'Miscellaneous Country 1',
    '2': 'Miscellaneous Country 2',
    '3': 'Miscellaneous Country 3',
    'ICC': 'International Cricket Council',
    'World': 'World XI',
}

# Country codes excluding ICC, World teams, and miscellaneous entries.
# None of the excluded codes is a key of country_codes, so this currently
# keeps every entry; the filter guards against such codes being added later.
filtered_countries = {
    code: country
    for code, country in country_codes.items()
    if code not in icc_world and code not in outlier_countries
}

# Sizes of the four lookup tables, shown as the cell output
len(country_codes), len(icc_world), len(outlier_countries), len(filtered_countries)

(103, 11, 5, 103)

In [8]:
# Drop every row whose Country code is not a key of filtered_countries.
batting_data = batting_data.where(
    col('Country').isin(list(filtered_countries))
)
batting_data.show(5)

+----------+-------+---+----+----+-----+----+-------+---------------+-----------------------+--------------+------+
|    Player| Season|Mat|Inns|Runs|   SR| Ave|Country|Cum batting Ave|Cum battings Runs Total|Cum Inns Total|Cum SR|
+----------+-------+---+----+----+-----+----+-------+---------------+-----------------------+--------------+------+
|A Ahmadhel|2019/20|  3|   2|  16|100.0| 8.0|    BUL|            8.0|                     16|             2| 100.0|
|A Ahmadhel|   2020|  1|   1|   8|100.0| 8.0|    BUL|            8.0|                     24|             3| 100.0|
|A Ahmadhel|2020/21|  2|   1|   2|28.57| 2.0|    BUL|            6.5|                     26|             4| 76.19|
|A Ahmadhel|   2021|  3|   3|   5|38.46|1.66|    BUL|           4.43|                     31|             7| 66.76|
|A Ahmadhel|   2023|  2|   0|   0|  0.0| 0.0|    BUL|           4.43|                     31|             7| 53.41|
+----------+-------+---+----+----+-----+----+-------+---------------+---

In [9]:
# Translate country codes to full names with a Python UDF backed by a
# broadcast copy of the lookup table.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Broadcast variable wrapping the country_codes dictionary
country_codes_broadcast = spark.sparkContext.broadcast(country_codes)


def get_country_name(code):
    """Return the country name mapped to `code`, or "Unknown" when the code is not in the table."""
    lookup = country_codes_broadcast.value
    return lookup.get(code, "Unknown")


country_udf = udf(get_country_name, returnType=StringType())

# Overwrite the Country code column with the full country name
batting_data = batting_data.withColumn("Country", country_udf(col("Country")))
batting_data.show(5)

+----------+-------+---+----+----+-----+----+--------+---------------+-----------------------+--------------+------+
|    Player| Season|Mat|Inns|Runs|   SR| Ave| Country|Cum batting Ave|Cum battings Runs Total|Cum Inns Total|Cum SR|
+----------+-------+---+----+----+-----+----+--------+---------------+-----------------------+--------------+------+
|A Ahmadhel|2019/20|  3|   2|  16|100.0| 8.0|Bulgaria|            8.0|                     16|             2| 100.0|
|A Ahmadhel|   2020|  1|   1|   8|100.0| 8.0|Bulgaria|            8.0|                     24|             3| 100.0|
|A Ahmadhel|2020/21|  2|   1|   2|28.57| 2.0|Bulgaria|            6.5|                     26|             4| 76.19|
|A Ahmadhel|   2021|  3|   3|   5|38.46|1.66|Bulgaria|           4.43|                     31|             7| 66.76|
|A Ahmadhel|   2023|  2|   0|   0|  0.0| 0.0|Bulgaria|           4.43|                     31|             7| 53.41|
+----------+-------+---+----+----+-----+----+--------+----------

In [10]:
# Map any Country value that is still a code in filtered_countries to its full
# name; values that are not keys of the mapping are left as they are.
batting_data = batting_data.replace(
    to_replace=filtered_countries,
    subset=['Country'],
)
batting_data.show(5)

+----------+-------+---+----+----+-----+----+--------+---------------+-----------------------+--------------+------+
|    Player| Season|Mat|Inns|Runs|   SR| Ave| Country|Cum batting Ave|Cum battings Runs Total|Cum Inns Total|Cum SR|
+----------+-------+---+----+----+-----+----+--------+---------------+-----------------------+--------------+------+
|A Ahmadhel|2019/20|  3|   2|  16|100.0| 8.0|Bulgaria|            8.0|                     16|             2| 100.0|
|A Ahmadhel|   2020|  1|   1|   8|100.0| 8.0|Bulgaria|            8.0|                     24|             3| 100.0|
|A Ahmadhel|2020/21|  2|   1|   2|28.57| 2.0|Bulgaria|            6.5|                     26|             4| 76.19|
|A Ahmadhel|   2021|  3|   3|   5|38.46|1.66|Bulgaria|           4.43|                     31|             7| 66.76|
|A Ahmadhel|   2023|  2|   0|   0|  0.0| 0.0|Bulgaria|           4.43|                     31|             7| 53.41|
+----------+-------+---+----+----+-----+----+--------+----------

In [11]:
# Sanity check: distinct (Player, Country) pairs, distinct player names, and
# distinct full rows.
(
    batting_data.select('Player', 'Country').distinct().count(),
    batting_data.select('Player').distinct().count(),
    batting_data.distinct().count(),
)

(4129, 4074, 14035)

In [13]:
# Preview the first five rows of the prepared batting data.
batting_data.show(n=5)

+----------+-------+---+----+----+-----+----+--------+---------------+-----------------------+--------------+------+
|    Player| Season|Mat|Inns|Runs|   SR| Ave| Country|Cum batting Ave|Cum battings Runs Total|Cum Inns Total|Cum SR|
+----------+-------+---+----+----+-----+----+--------+---------------+-----------------------+--------------+------+
|A Ahmadhel|2019/20|  3|   2|  16|100.0| 8.0|Bulgaria|            8.0|                     16|             2| 100.0|
|A Ahmadhel|   2020|  1|   1|   8|100.0| 8.0|Bulgaria|            8.0|                     24|             3| 100.0|
|A Ahmadhel|2020/21|  2|   1|   2|28.57| 2.0|Bulgaria|            6.5|                     26|             4| 76.19|
|A Ahmadhel|   2021|  3|   3|   5|38.46|1.66|Bulgaria|           4.43|                     31|             7| 66.76|
|A Ahmadhel|   2023|  2|   0|   0|  0.0| 0.0|Bulgaria|           4.43|                     31|             7| 53.41|
+----------+-------+---+----+----+-----+----+--------+----------