In [16]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..','..','..'))
import config, utils

# Create a Spark session
spark = utils.create_spark_session("bowling", {
    'spark.executor.memory': '4g',
    'spark.executor.cores': '4',
})

bowling_data = utils.load_data(spark,config.RAW_DATA_DIR, 't20_bowling_stats.csv')

bowling_data.show(5)

[[34m2024-11-15T01:29:15.805+0530[0m] {[34mutils.py:[0m12} INFO[0m - Creating Spark session.[0m


24/11/15 01:30:20 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.


[[34m2024-11-15T01:30:20.525+0530[0m] {[34mutils.py:[0m30} INFO[0m - Spark session created successfully.[0m
[[34m2024-11-15T01:30:20.527+0530[0m] {[34mutils.py:[0m39} INFO[0m - Loading data from t20_bowling_stats.csv.[0m


24/11/15 01:30:37 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[Stage 2:>                                                          (0 + 1) / 1]

+-------------------+---+----+-----+----+----+----+----+-----+-----+----+---+---+-------+
|             Player|Mat|Inns|Overs|Mdns|Runs|Wkts| BBI|  Ave| Econ|  SR|  4|  5| Season|
+-------------------+---+----+-----+----+----+----+----+-----+-----+----+---+---+-------+
|      AR Adams (NZ)|  1|   1|  4.0|   0|  40|   0|   -|    -|10.00|   -|  0|  0|2004/05|
|     CL Cairns (NZ)|  1|   1|  4.0|   0|  28|   1|1/28|28.00| 7.00|24.0|  0|  0|2004/05|
|    MJ Clarke (AUS)|  1|   1|  2.0|   0|  11|   0|   -|    -| 5.50|   -|  0|  0|2004/05|
|     JR Hopes (AUS)|  1|   1|  3.0|   0|  23|   1|1/23|23.00| 7.66|18.0|  0|  0|2004/05|
|MS Kasprowicz (AUS)|  1|   1|  4.0|   0|  29|   4|4/29| 7.25| 7.25| 6.0|  1|  0|2004/05|
+-------------------+---+----+-----+----+----+----+----+-----+-----+----+---+---+-------+
only showing top 5 rows



                                                                                

In [17]:
bowling_data = bowling_data.select(["Player","Season","Mat","Inns",'Overs',"Runs","Wkts","Econ"]).sort(["Player","Season"])
bowling_data.show(5)

[Stage 3:>                                                          (0 + 1) / 1]

+----------------+-------+---+----+-----+----+----+-----+
|          Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|
+----------------+-------+---+----+-----+----+----+-----+
|A Ahmadhel (BUL)|2019/20|  3|   3| 10.0|  75|   3|  7.5|
|A Ahmadhel (BUL)|   2020|  1|   1|  2.0|  22|   1|11.00|
|A Ahmadhel (BUL)|2020/21|  2|   2|  2.4|  27|   2|10.12|
|A Ahmadhel (BUL)|   2021|  3|   2|  3.0|  31|   0|10.33|
|A Ahmadhel (BUL)|   2023|  2|   -|    -|   -|   -|    -|
+----------------+-------+---+----+-----+----+----+-----+
only showing top 5 rows



                                                                                

In [18]:
from pyspark.sql.functions import when, col
bowling_data = bowling_data.withColumn("Overs", when(col("Overs") == "-", "0").otherwise(col("Overs")).cast("float"))
bowling_data = bowling_data.withColumn("Wkts", when(col("Wkts") == "-", "0").otherwise(col("Wkts")).cast("float"))
bowling_data = bowling_data.withColumn("Inns", when(col("Inns") == "-", "0").otherwise(col("Inns")).cast("float"))
bowling_data = bowling_data.withColumn("Runs", when(col("Runs") == "-", "0").otherwise(col("Runs")).cast("float"))
bowling_data = bowling_data.withColumn("Econ", when(col("Econ") == "-", col("Runs")/col("Inns")).otherwise(col("Econ")).cast("float")).fillna(0)
bowling_data.show(5)

[Stage 7:>                                                          (0 + 1) / 1]

+----------------+-------+---+----+-----+----+----+-----+
|          Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|
+----------------+-------+---+----+-----+----+----+-----+
|A Ahmadhel (BUL)|2019/20|  3| 3.0| 10.0|75.0| 3.0|  7.5|
|A Ahmadhel (BUL)|   2020|  1| 1.0|  2.0|22.0| 1.0| 11.0|
|A Ahmadhel (BUL)|2020/21|  2| 2.0|  2.4|27.0| 2.0|10.12|
|A Ahmadhel (BUL)|   2021|  3| 2.0|  3.0|31.0| 0.0|10.33|
|A Ahmadhel (BUL)|   2023|  2| 0.0|  0.0| 0.0| 0.0|  0.0|
+----------------+-------+---+----+-----+----+----+-----+
only showing top 5 rows



                                                                                

In [19]:
from pyspark.sql.functions import regexp_extract, regexp_replace

# Extract the country name between '(' and ')'
bowling_data = bowling_data.withColumn("Country", regexp_extract(col("Player"), r"\((.*?)\)", 1))

# Extract the player's name before the first '('
bowling_data = bowling_data.withColumn("Player", regexp_extract(col("Player"), r"^(.*?)\s\(", 1))

# Show the result
bowling_data.show()

                                                                                

+--------------+-------+---+----+-----+-----+----+-----+-------+
|        Player| Season|Mat|Inns|Overs| Runs|Wkts| Econ|Country|
+--------------+-------+---+----+-----+-----+----+-----+-------+
|    A Ahmadhel|2019/20|  3| 3.0| 10.0| 75.0| 3.0|  7.5|    BUL|
|    A Ahmadhel|   2020|  1| 1.0|  2.0| 22.0| 1.0| 11.0|    BUL|
|    A Ahmadhel|2020/21|  2| 2.0|  2.4| 27.0| 2.0|10.12|    BUL|
|    A Ahmadhel|   2021|  3| 2.0|  3.0| 31.0| 0.0|10.33|    BUL|
|    A Ahmadhel|   2023|  2| 0.0|  0.0|  0.0| 0.0|  0.0|    BUL|
|    A Ahmadhel|   2024|  1| 0.0|  0.0|  0.0| 0.0|  0.0|    BUL|
|   A Alexander|2024/25|  1| 1.0|  3.3| 19.0| 2.0| 5.42|    INA|
|       A Amado|   2022|  3| 3.0|  6.0| 58.0| 3.0| 9.66|    ISR|
|     A Andrews|2021/22|  3| 3.0| 10.0| 61.0| 4.0|  6.1|    SUI|
|     A Andrews|   2022|  4| 1.0|  1.0| 21.0| 0.0| 21.0|    SUI|
|A Anemogiannis|2019/20|  1| 0.0|  0.0|  0.0| 0.0|  0.0|    GRC|
|       A Ashok|   2023|  1| 1.0|  4.0| 28.0| 1.0|  7.0|     NZ|
|     A Ashokan|2019/20| 

In [20]:
from pyspark.sql import Window
from pyspark.sql.functions import col, sum as spark_sum, when, row_number, round

# Define the window specification for cumulative calculations excluding the current row
window_spec = Window.partitionBy("Player", "Country").orderBy("Season").rowsBetween(Window.unboundedPreceding, -1)

# Window for row number to identify the first row per player and country
row_num_window = Window.partitionBy("Player", "Country").orderBy("Season")

# Perform cumulative calculations with conditions
bowling_data = bowling_data.withColumn("row_num", row_number().over(row_num_window)) \
    .withColumn("Cumulative Mat",
                when(col("row_num") == 1, 0)  # Set 0 for the first row (before any match)
                .otherwise(spark_sum("Mat").over(window_spec))) \
    .withColumn("Cumulative Inns",
                when(col("row_num") == 1, 0)  # Set 0 for the first row (before any match)
                .otherwise(spark_sum("Inns").over(window_spec))) \
    .withColumn("Cumulative Overs", 
                when(col("row_num") == 1, 0)  # Set 0 for the first row (before any match)
                .otherwise(spark_sum("Overs").over(window_spec))) \
    .withColumn("Cumulative Runs", 
                when(col("row_num") == 1, 0)
                .otherwise(spark_sum("Runs").over(window_spec))) \
    .withColumn("Cumulative Wkts", 
                when(col("row_num") == 1, 0)
                .otherwise(spark_sum("Wkts").over(window_spec))) \
    .withColumn(
        "Cumulative Econ", 
        when(col("row_num") == 1, 0)
        .otherwise(
            round(
                when(spark_sum("Inns").over(window_spec) != 0, 
                     spark_sum(col("Inns")*col("Econ")).over(window_spec) / spark_sum("Inns").over(window_spec))
                .otherwise(0), 2)
        )
    ) \
    .drop("row_num")  # Drop the temporary row number column

# Show the resulting DataFrame
bowling_data.show(5)

[Stage 15:>                                                         (0 + 1) / 1]

+----------+-------+---+----+-----+----+----+-----+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|    Player| Season|Mat|Inns|Overs|Runs|Wkts| Econ|Country|Cumulative Mat|Cumulative Inns|  Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|
+----------+-------+---+----+-----+----+----+-----+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|A Ahmadhel|2019/20|  3| 3.0| 10.0|75.0| 3.0|  7.5|    BUL|             0|            0.0|               0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|   2020|  1| 1.0|  2.0|22.0| 1.0| 11.0|    BUL|             3|            3.0|              10.0|           75.0|            3.0|            7.5|
|A Ahmadhel|2020/21|  2| 2.0|  2.4|27.0| 2.0|10.12|    BUL|             4|            4.0|              12.0|           97.0|            4.0|           8.38|
|A Ahmadhel|   2021|  3| 2.0|  3.0|31.0| 0.0|10.33| 

                                                                                

In [21]:
bowling_data = bowling_data.select(["Player","Country","Season","Cumulative Mat","Cumulative Inns","Cumulative Overs","Cumulative Runs","Cumulative Wkts","Cumulative Econ"])
bowling_data.show(5)

+----------+-------+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|    Player|Country| Season|Cumulative Mat|Cumulative Inns|  Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|
+----------+-------+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|A Ahmadhel|    BUL|2019/20|             0|            0.0|               0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|    BUL|   2020|             3|            3.0|              10.0|           75.0|            3.0|            7.5|
|A Ahmadhel|    BUL|2020/21|             4|            4.0|              12.0|           97.0|            4.0|           8.38|
|A Ahmadhel|    BUL|   2021|             6|            6.0|14.400000095367432|          124.0|            6.0|           8.96|
|A Ahmadhel|    BUL|   2023|             9|            8.0| 17.40000009536743|          155.0|            6.0| 

                                                                                

In [22]:
country_codes = {
    'LES': 'Lesotho',
    'BUL': 'Bulgaria',
    'VAN': 'Vanuatu',
    'ROM': 'Romania',
    'Aut': 'Austria',
    'COK': 'Cook Islands',
    'Fran': 'France',
    'SRB': 'Serbia',
    'PAK': 'Pakistan',
    'HUN': 'Hungary',
    'CYP': 'Cyprus',
    'Fiji': 'Fiji',
    'FIN': 'Finland',
    'EST': 'Estonia',
    'CHN': 'China',
    'GRC': 'Greece',
    'CAM': 'Cambodia',
    'GUE': 'Guernsey',
    'SEY': 'Seychelles',
    'JPN': 'Japan',
    'TAN': 'Tanzania',
    'JER': 'Jersey',
    'QAT': 'Qatar',
    'ENG': 'England',
    'UGA': 'Uganda',
    'BER': 'Bermuda',
    'CZK-R': 'Czech Republic',
    'CAY': 'Cayman Islands',
    'IRE': 'Ireland',
    'Mali': 'Mali',
    'BRA': 'Brazil',
    'SUI': 'Switzerland',
    'Peru': 'Peru',
    'Mex': 'Mexico',
    'MOZ': 'Mozambique',
    'Samoa': 'Samoa',
    'HKG': 'Hong Kong',
    'BAN': 'Bangladesh',
    'SL': 'Sri Lanka',
    'PNG': 'Papua New Guinea',
    'ZIM': 'Zimbabwe',
    'GHA': 'Ghana',
    'SWZ': 'Eswatini',  # Swaziland's official name now is Eswatini
    'MYAN': 'Myanmar',
    'IND': 'India',
    'USA': 'United States of America',
    'NEP': 'Nepal',
    'AFG': 'Afghanistan',
    'PAN': 'Panama',
    'NGA': 'Nigeria',
    'SLE': 'Sierra Leone',
    'ESP': 'Spain',
    'Bhm': 'Bahamas',
    'TKY': 'Turkey',
    'MWI': 'Malawi',
    'WI': 'West Indies',
    'IOM': 'Isle of Man',
    'THA': 'Thailand',
    'SWA': 'Eswatini',  # another code for Eswatini
    'SKOR': 'South Korea',
    'GMB': 'Gambia',
    'ISR': 'Israel',
    'KUW': 'Kuwait',
    'Belg': 'Belgium',
    'GER': 'Germany',
    'ITA': 'Italy',
    'CAN': 'Canada',
    'MDV': 'Maldives',
    'Blz': 'Belize',
    'DEN': 'Denmark',
    'INA': 'Indonesia',
    'KENYA': 'Kenya',
    'LUX': 'Luxembourg',
    'STHEL': 'Saint Helena',
    'BHR': 'Bahrain',
    'KSA': 'Saudi Arabia',
    'MLT': 'Malta',
    'Arg': 'Argentina',
    'MNG': 'Mongolia',
    'AUS': 'Australia',
    'GIBR': 'Gibraltar',
    'SGP': 'Singapore',
    'Chile': 'Chile',
    'UAE': 'United Arab Emirates',
    'NZ': 'New Zealand',
    'SCOT': 'Scotland',
    'BHU': 'Bhutan',
    'MAS': 'Malaysia',
    'BOT': 'Botswana',
    'CRC': 'Costa Rica',
    'PHI': 'Philippines',
    'NAM': 'Namibia',
    'RWN': 'Rwanda',
    'OMA': 'Oman',
    'NOR': 'Norway',
    'CRT': 'Croatia',
    'SWE': 'Sweden',
    'Iran': 'Iran',
    'PORT': 'Portugal',
    'NED': 'Netherlands',
    'SA': 'South Africa',
    'SVN': 'Slovenia',
    'GUE': 'Guernsey',
    'MDV': 'Maldives',
    'BHM': 'Bahamas',
    'SWE': 'Sweden',
    'MLT': 'Malta',
    'ITA': 'Italy',
}

# ICC and World teams
icc_world = {
    'ICC/PAK': 'Pakistan',
    'ICC/SL': 'Sri Lanka',
    'ICC/IND': 'India',
    'ICC/NEP': 'Nepal',
    'BAN/ICC': 'Bangladesh',
    'AFG/ICC': 'Afghanistan',
    'SL/World': 'Sri Lanka',
    'SA/World': 'South Africa',
    'AUS/World': 'Australia',
    'BAN/World': 'Bangladesh',
    'WI/World': 'West Indies',
}

# Outlier/Miscellaneous Countries
outlier_countries = {
    '1': 'Miscellaneous Country 1',
    '2': 'Miscellaneous Country 2',
    '3': 'Miscellaneous Country 3',
    'ICC': 'International Cricket Council',
    'World': 'World XI',
}

# Filtered country codes excluding ICC, World teams, and miscellaneous
filtered_countries = {
    code: country
    for code, country in country_codes.items()
    if code not in icc_world and code not in outlier_countries
}
len(country_codes), len(icc_world), len(outlier_countries), len(filtered_countries)

(103, 11, 5, 103)

In [23]:
bowling_data = bowling_data.filter(col('Country').isin(list(filtered_countries.keys())))
bowling_data.show(5)

+----------+-------+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|    Player|Country| Season|Cumulative Mat|Cumulative Inns|  Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|
+----------+-------+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|A Ahmadhel|    BUL|2019/20|             0|            0.0|               0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|    BUL|   2020|             3|            3.0|              10.0|           75.0|            3.0|            7.5|
|A Ahmadhel|    BUL|2020/21|             4|            4.0|              12.0|           97.0|            4.0|           8.38|
|A Ahmadhel|    BUL|   2021|             6|            6.0|14.400000095367432|          124.0|            6.0|           8.96|
|A Ahmadhel|    BUL|   2023|             9|            8.0| 17.40000009536743|          155.0|            6.0| 

In [24]:
bowling_data = bowling_data.replace(filtered_countries,subset=['Country'])
bowling_data.show(5)

+----------+--------+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|    Player| Country| Season|Cumulative Mat|Cumulative Inns|  Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|
+----------+--------+-------+--------------+---------------+------------------+---------------+---------------+---------------+
|A Ahmadhel|Bulgaria|2019/20|             0|            0.0|               0.0|            0.0|            0.0|            0.0|
|A Ahmadhel|Bulgaria|   2020|             3|            3.0|              10.0|           75.0|            3.0|            7.5|
|A Ahmadhel|Bulgaria|2020/21|             4|            4.0|              12.0|           97.0|            4.0|           8.38|
|A Ahmadhel|Bulgaria|   2021|             6|            6.0|14.400000095367432|          124.0|            6.0|           8.96|
|A Ahmadhel|Bulgaria|   2023|             9|            8.0| 17.40000009536743|          155.0|         

In [25]:
players_data = utils.load_data(spark,config.PROCESSED_DATA_DIR,'match_players.csv')
players_data = players_data.withColumnRenamed("player", "Player").withColumnRenamed("country", "Country").withColumnRenamed("season", "Season")
print(players_data[['Player',"Country","Season"]].distinct().count())
players_data.show()

[[34m2024-11-15T01:35:30.908+0530[0m] {[34mutils.py:[0m39} INFO[0m - Loading data from match_players.csv.[0m


                                                                                

19755
+---------+-------------+---------+------+--------+
|  Country|       Player|player_id|Season|match_id|
+---------+-------------+---------+------+--------+
|  England|       JJ Roy| d1c36f5c|  2018| 1119542|
|  England|   JC Buttler| 99b75528|  2018| 1119542|
|  England|   EJG Morgan| d2a6c0e6|  2018| 1119542|
|  England|     AD Hales| 92aeac25|  2018| 1119542|
|  England|      JE Root| a343262c|  2018| 1119542|
|  England|  JM Bairstow| abb83e27|  2018| 1119542|
|  England|       MM Ali| bb351c23|  2018| 1119542|
|  England|    DJ Willey| 7f048519|  2018| 1119542|
|  England|  LE Plunkett| 9a963804|  2018| 1119542|
|  England|    CJ Jordan| ffe699c0|  2018| 1119542|
|  England|    AU Rashid| 249d60c9|  2018| 1119542|
|Australia|    DJM Short| 1a156c88|  2018| 1119542|
|Australia|     AJ Finch| b8d490fd|  2018| 1119542|
|Australia|   GJ Maxwell| b681e71e|  2018| 1119542|
|Australia|      TM Head| 12b610c2|  2018| 1119542|
|Australia|     AT Carey| 69d03465|  2018| 1119542|
|Austr

In [26]:
bowling_data = bowling_data.join(players_data, ['Player', 'Country', "Season"], 'inner')
bowling_data = bowling_data.select(['player_id', 'Player', 'Country', "Season","Cumulative Mat", "Cumulative Inns", 'Cumulative Overs','Cumulative Runs','Cumulative Wkts','Cumulative Econ']).withColumnRenamed("Cumulative Runs","Cumulative Bowling Runs")
print(bowling_data.count())
bowling_data.show(5)

                                                                                

53406


[Stage 60:>                                                         (0 + 1) / 1]

+---------+----------+-------+------+--------------+---------------+----------------+-----------------------+---------------+---------------+
|player_id|    Player|Country|Season|Cumulative Mat|Cumulative Inns|Cumulative Overs|Cumulative Bowling Runs|Cumulative Wkts|Cumulative Econ|
+---------+----------+-------+------+--------------+---------------+----------------+-----------------------+---------------+---------------+
| d1c36f5c|    JJ Roy|England|  2018|            27|            0.0|             0.0|                    0.0|            0.0|            0.0|
| 99b75528|JC Buttler|England|  2018|            61|            0.0|             0.0|                    0.0|            0.0|            0.0|
| d2a6c0e6|EJG Morgan|England|  2018|            72|            0.0|             0.0|                    0.0|            0.0|            0.0|
| 92aeac25|  AD Hales|England|  2018|            52|            0.0|             0.0|                    0.0|            0.0|            0.0|
| a343

                                                                                

In [27]:
bowling_data[['Player','Country']].distinct().count(), bowling_data[['Player']].distinct().count(), bowling_data[['Player','Country',"Season"]].distinct().count()

                                                                                

(3640, 3594, 12383)

In [28]:
utils.spark_save_data(bowling_data, config.PROCESSED_DATA_DIR, 'bowling_data.csv')
spark.stop()

[Stage 89:>                                                         (0 + 1) / 1]

[[34m2024-11-15T01:37:42.120+0530[0m] {[34mutils.py:[0m57} INFO[0m - Successfully wrote data to /usr/ravi/t20/data/2_processedData/bowling_data.csv[0m


                                                                                