In [7]:
import os

# Folder that holds the raw CSV exports
directory = r'D:\github\Cricket-Prediction\data\1_rawData'

# Start (or reuse) the Spark session used throughout this notebook
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("CricketPrediction")
    .getOrCreate()
)

# Read the per-season T20 team statistics; the first row is the header and
# column types are inferred from the data.
team_stats_csv = os.path.join(directory, 't20_team_stats.csv')
team_data = spark.read.csv(team_stats_csv, header=True, inferSchema=True)
team_data.show()

+--------+---+---+----+----+---+-----+-----+-----+----+---+---+-------+
|    Team|Mat|Won|Lost|Tied| NR|  W/L|  Ave|  RPO|Inns| HS| LS| Season|
+--------+---+---+----+----+---+-----+-----+-----+----+---+---+-------+
|Zimbabwe|  5|  5|   0|   0|  0|    -| 43.6|12.82|   5|344|  -|2024/25|
|Zimbabwe| 10|  2|   8|   0|  0|0.250|17.48| 7.16|  10|159|124|   2024|
|Zimbabwe| 17|  8|   9|   0|  0|0.888|24.03| 7.81|  17|217| 82|2023/24|
|Zimbabwe| 11|  5|   5|   0|  1|1.000|16.94| 7.13|  11|174|115|2022/23|
|Zimbabwe| 16|  9|   7|   0|  0|1.285|22.45| 7.75|  16|236| 95|   2022|
|Zimbabwe| 14|  6|   8|   0|  0|0.750| 20.6| 7.22|  14|193|138|   2021|
|Zimbabwe|  6|  0|   6|   0|  0|0.000|19.38| 7.28|   6|156|148|2020/21|
|Zimbabwe|  6|  3|   3|   0|  0|1.000|24.56| 7.98|   6|177|152|2019/20|
|Zimbabwe|  8|  2|   5|   1|  0|0.400|22.01| 8.23|   8|172|136|   2019|
|Zimbabwe|  2|  0|   2|   0|  0|0.000|15.17| 6.91|   2|132|126|2018/19|
|Zimbabwe|  4|  0|   4|   0|  0|0.000|17.74| 7.06|   4|162|108| 

In [8]:
from pyspark.sql.functions import col,when,round
def _dash_to_float(column_name):
    """Return `column_name` as a float column, reading the '-' placeholder as 0.

    The column is cast to string before it is compared with '-' and both
    branches of the `when` are strings, so the expression does not depend on
    Spark's implicit string/number coercion (whose rules differ between ANSI
    and non-ANSI mode) and works whether the CSV reader inferred the column as
    text or as a number.
    """
    as_text = col(column_name).cast("string")
    return when(as_text == "-", "0").otherwise(as_text).cast("float")


team_data = (
    team_data
    # Recompute the win/loss ratio numerically; with no losses the ratio is
    # undefined, so the win count is used instead.
    .withColumn(
        "W/L",
        round(
            when(col("Lost") == 0, col("Won")).otherwise(col("Won") / col("Lost")),
            2,
        ),
    )
    # Numeric copies of the average columns under new names.
    .withColumn("AveRPW", _dash_to_float("Ave"))
    .withColumn("AveRPO", _dash_to_float("RPO"))
    # The raw source columns (and the lowest-score column) are dropped.
    .drop("Ave", "RPO", "LS")
)
team_data.show()

+--------+---+---+----+----+---+----+----+---+-------+------+------+
|    Team|Mat|Won|Lost|Tied| NR| W/L|Inns| HS| Season|AveRPW|AveRPO|
+--------+---+---+----+----+---+----+----+---+-------+------+------+
|Zimbabwe|  5|  5|   0|   0|  0| 5.0|   5|344|2024/25|  43.6| 12.82|
|Zimbabwe| 10|  2|   8|   0|  0|0.25|  10|159|   2024| 17.48|  7.16|
|Zimbabwe| 17|  8|   9|   0|  0|0.89|  17|217|2023/24| 24.03|  7.81|
|Zimbabwe| 11|  5|   5|   0|  1| 1.0|  11|174|2022/23| 16.94|  7.13|
|Zimbabwe| 16|  9|   7|   0|  0|1.29|  16|236|   2022| 22.45|  7.75|
|Zimbabwe| 14|  6|   8|   0|  0|0.75|  14|193|   2021|  20.6|  7.22|
|Zimbabwe|  6|  0|   6|   0|  0| 0.0|   6|156|2020/21| 19.38|  7.28|
|Zimbabwe|  6|  3|   3|   0|  0| 1.0|   6|177|2019/20| 24.56|  7.98|
|Zimbabwe|  8|  2|   5|   1|  0| 0.4|   8|172|   2019| 22.01|  8.23|
|Zimbabwe|  2|  0|   2|   0|  0| 0.0|   2|132|2018/19| 15.17|  6.91|
|Zimbabwe|  4|  0|   4|   0|  0| 0.0|   4|162|   2018| 17.74|  7.06|
|Zimbabwe|  2|  0|   2|   0|  0| 0

In [9]:
# Cumulative calculations
from pyspark.sql import Window
from pyspark.sql.functions import col, sum as spark_sum, when, row_number, round

# Frame covering every earlier season of the same team; the current season is
# excluded (upper bound -1) so each row only sees results from before it.
# NOTE(review): "Season" is ordered as text ("2009/10" < "2010" < "2010/11");
# confirm every season label in the data sorts chronologically this way.
window_spec = Window.partitionBy("Team").orderBy("Season").rowsBetween(Window.unboundedPreceding, -1)

# Un-framed window used to number each team's seasons and spot the first one
row_num_window = Window.partitionBy("Team").orderBy("Season")

# Totals over the prior seasons. On a team's first season the frame is empty,
# so these sums are null; that row is set to 0 explicitly below.
prior_won = spark_sum("Won").over(window_spec)
prior_lost = spark_sum("Lost").over(window_spec)
prior_tied = spark_sum("Tied").over(window_spec)
prior_nr = spark_sum("NR").over(window_spec)
prior_mat = spark_sum("Mat").over(window_spec)
# Season averages weighted by the number of matches played in that season
prior_rpw_weighted = spark_sum(col("AveRPW") * col("Mat")).over(window_spec)
prior_rpo_weighted = spark_sum(col("AveRPO") * col("Mat")).over(window_spec)

# True on the first season of each team (no history yet)
is_first_season = col("row_num") == 1

# Perform the cumulative calculations
team_data = (
    team_data
    .withColumn("row_num", row_number().over(row_num_window))
    .withColumn("Cumulative Won", when(is_first_season, 0).otherwise(prior_won))
    .withColumn("Cumulative Lost", when(is_first_season, 0).otherwise(prior_lost))
    .withColumn("Cumulative Tied", when(is_first_season, 0).otherwise(prior_tied))
    .withColumn("Cumulative NR", when(is_first_season, 0).otherwise(prior_nr))
    .withColumn(
        "Cumulative W/L",
        when(is_first_season, 0).otherwise(
            round(
                # With no prior losses the ratio is undefined; fall back to the
                # win count, matching how the per-season "W/L" column handles
                # Lost == 0, instead of reporting 0 for an undefeated team.
                when(prior_lost != 0, prior_won / prior_lost).otherwise(prior_won),
                2,
            )
        ),
    )
    .withColumn(
        "Cumulative AveRPW",
        when(is_first_season, 0).otherwise(
            round(
                # Guard on the actual divisor (matches played), not on wins:
                # a team without a prior win still has a valid average.
                when(prior_mat != 0, prior_rpw_weighted / prior_mat).otherwise(0),
                2,
            )
        ),
    )
    .withColumn(
        "Cumulative AveRPO",
        when(is_first_season, 0).otherwise(
            round(
                # Guard on the actual divisor (matches played), not on losses:
                # a team without a prior loss still has a valid average.
                when(prior_mat != 0, prior_rpo_weighted / prior_mat).otherwise(0),
                2,
            )
        ),
    )
    .drop("row_num")  # Drop the temporary row number column
)

# Show the resulting DataFrame
team_data.show(10)

+-----------+---+---+----+----+---+----+----+---+-------+------+------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|       Team|Mat|Won|Lost|Tied| NR| W/L|Inns| HS| Season|AveRPW|AveRPO|Cumulative Won|Cumulative Lost|Cumulative Tied|Cumulative NR|Cumulative W/L|Cumulative AveRPW|Cumulative AveRPO|
+-----------+---+---+----+----+---+----+----+---+-------+------+------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|Afghanistan|  6|  4|   2|   0|  0| 2.0|   6|147|2009/10| 20.22|  6.89|             0|              0|              0|            0|           0.0|              0.0|              0.0|
|Afghanistan|  2|  0|   2|   0|  0| 0.0|   2|115|   2010| 10.83|  5.41|             4|              2|              0|            0|           2.0|            20.22|             6.89|
|Afghanistan|  3|  2|   1|   0|  0| 2.0|   3|174|2011/12| 22.66|  7.97|         

In [10]:
# Keep only the identifying columns and the cumulative (pre-season) features
cumulative_feature_columns = [
    "Team",
    "Season",
    "Cumulative Won",
    "Cumulative Lost",
    "Cumulative Tied",
    "Cumulative NR",
    "Cumulative W/L",
    "Cumulative AveRPW",
    "Cumulative AveRPO",
]
team_data = team_data.select(*cumulative_feature_columns)
team_data.show(5)

+-----------+-------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|       Team| Season|Cumulative Won|Cumulative Lost|Cumulative Tied|Cumulative NR|Cumulative W/L|Cumulative AveRPW|Cumulative AveRPO|
+-----------+-------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|Afghanistan|2009/10|             0|              0|              0|            0|           0.0|              0.0|              0.0|
|Afghanistan|   2010|             4|              2|              0|            0|           2.0|            20.22|             6.89|
|Afghanistan|2011/12|             4|              4|              0|            0|           1.0|            17.87|             6.52|
|Afghanistan|2012/13|             6|              5|              0|            0|           1.2|            19.18|             6.92|
|Afghanistan|2013/14|             8|              7|          

In [11]:
# Sort explicitly before collecting to the driver: Spark gives no row-order
# guarantee without an orderBy, and a fixed order keeps the exported CSV
# reproducible between runs.
team_data.orderBy("Team", "Season").toPandas().to_csv(r'D:\github\Cricket-Prediction\data\2_processedData\teamStats.csv', index=False)