In [51]:
import os
directory = r'D:\github\Cricket-Prediction\data\2_processedData'

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CricketPrediction").getOrCreate()

matches = spark.read.csv(os.path.join(directory, 'matches.csv'), inferSchema=True, header=True)
deliveries = spark.read.parquet(os.path.join(directory, 'deliveries.parquet'), inferSchema=True, header=True)

In [52]:
matches = matches.drop('date','city','toss_winner','toss_decision')
matches.show(5)

+---------+---------+------+-------+--------------------+---------+--------+
|    team1|    team2|gender| season|               venue|   winner|match_id|
+---------+---------+------+-------+--------------------+---------+--------+
|Australia|Sri Lanka|  male|2016/17|Melbourne Cricket...|Sri Lanka| 1001349|
|Australia|Sri Lanka|  male|2016/17|Simonds Stadium, ...|Sri Lanka| 1001351|
|Australia|Sri Lanka|  male|2016/17|       Adelaide Oval|Australia| 1001353|
|  Ireland|Hong Kong|  male|   2016|Bready Cricket Cl...|Hong Kong| 1004729|
| Zimbabwe|    India|  male|   2016|  Harare Sports Club| Zimbabwe| 1007655|
+---------+---------+------+-------+--------------------+---------+--------+
only showing top 5 rows



In [53]:
deliveries = deliveries.drop('season','start_date','venue','striker','non_striker','bowler')
deliveries.show(5)

+--------+-------+----+------------+------------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id|innings|ball|batting_team|bowling_team|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+-------+----+------------+------------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
| 1306389|      1| 0.1|     Bahrain|Saudi Arabia|           0|     0|    0|      0|   0|      0|      0|          0|               0|                0|                     0|
| 1306389|      1| 0.2|     Bahrain|Saudi Arabia|           0|     1|    1|      0|   0|      0|      0|          0|               0|                0|                     0|
| 1306389|      1| 0.3|     Bahrain|Saudi Arabia|           0|     1|    1|      0|   0|      0|      0|          0|         

In [54]:
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, col, lit, sum as F_sum

# Calculate "runs" as the row-wise sum of specified columns
deliveries = deliveries.withColumn(
    "runs",
    coalesce(col("runs_off_bat"), lit(0)) +
    coalesce(col("extras"), lit(0)) +
    coalesce(col("wides"), lit(0)) +
    coalesce(col("noballs"), lit(0)) +
    coalesce(col("byes"), lit(0)) +
    coalesce(col("legbyes"), lit(0)) +
    coalesce(col("penalty"), lit(0))
)

# Drop the original columns that were summed
deliveries = deliveries.drop("runs_off_bat", "extras", "wides", "noballs", "byes", "legbyes", "penalty")

# Calculate "wickets" as the row-wise sum of dismissals, handling null values
deliveries = deliveries.withColumn(
    "wickets",
    (coalesce(col("player_dismissed").cast("int"), lit(0)) +
     coalesce(col("other_player_dismissed").cast("int"), lit(0)))
)

# Drop columns related to wicket types and dismissed players that are no longer needed
deliveries = deliveries.drop("wicket_type", "player_dismissed", "other_wicket_type", "other_player_dismissed")
deliveries.show(5)

+--------+-------+----+------------+------------+----+-------+
|match_id|innings|ball|batting_team|bowling_team|runs|wickets|
+--------+-------+----+------------+------------+----+-------+
| 1306389|      1| 0.1|     Bahrain|Saudi Arabia|   0|      0|
| 1306389|      1| 0.2|     Bahrain|Saudi Arabia|   2|      0|
| 1306389|      1| 0.3|     Bahrain|Saudi Arabia|   2|      0|
| 1306389|      1| 0.4|     Bahrain|Saudi Arabia|   4|      0|
| 1306389|      1| 0.5|     Bahrain|Saudi Arabia|   0|      0|
+--------+-------+----+------------+------------+----+-------+
only showing top 5 rows



In [55]:
#Define the window specifications for cumulative sums partitioned by "match_id" and "innings"
window_spec = Window.partitionBy("match_id", "innings").orderBy("ball")

# Calculate cumulative sum for "runs" as "curr_score"
deliveries = deliveries.withColumn(
    "curr_score",
    F_sum("runs").over(window_spec)
)

# Calculate cumulative sum for "wickets" as "curr_wickets"
deliveries = deliveries.withColumn(
    "curr_wickets",
    F_sum("wickets").over(window_spec)
)

# Drop intermediate columns if they are no longer needed
deliveries = deliveries.drop("runs", "wickets")

# Display the resulting DataFrame
deliveries.show(250)

+--------+-------+----+------------+------------+----------+------------+
|match_id|innings|ball|batting_team|bowling_team|curr_score|curr_wickets|
+--------+-------+----+------------+------------+----------+------------+
|  211048|      2| 0.1| New Zealand|   Australia|         0|           0|
|  211048|      2| 0.2| New Zealand|   Australia|         0|           0|
|  211048|      2| 0.3| New Zealand|   Australia|         1|           0|
|  211048|      2| 0.4| New Zealand|   Australia|         1|           0|
|  211048|      2| 0.5| New Zealand|   Australia|         2|           0|
|  211048|      2| 0.6| New Zealand|   Australia|         2|           0|
|  211048|      2| 1.1| New Zealand|   Australia|         6|           0|
|  211048|      2| 1.2| New Zealand|   Australia|         7|           0|
|  211048|      2| 1.3| New Zealand|   Australia|         8|           0|
|  211048|      2| 1.4| New Zealand|   Australia|        10|           0|
|  211048|      2| 1.5| New Zealand|  

In [56]:
data=deliveries.join(matches,on='match_id').drop('season','venue','gender')
data.sort('match_id').show(10)

+--------+-------+----+------------+------------+----------+------------+-------+---------+-------+
|match_id|innings|ball|batting_team|bowling_team|curr_score|curr_wickets|  team1|    team2| winner|
+--------+-------+----+------------+------------+----------+------------+-------+---------+-------+
|  211028|      2| 0.1|   Australia|     England|         0|           0|England|Australia|England|
|  211028|      1| 0.1|     England|   Australia|         0|           0|England|Australia|England|
|  211028|      2| 0.2|   Australia|     England|         4|           0|England|Australia|England|
|  211028|      1| 0.2|     England|   Australia|         1|           0|England|Australia|England|
|  211028|      2| 0.3|   Australia|     England|         4|           0|England|Australia|England|
|  211028|      1| 0.3|     England|   Australia|         1|           0|England|Australia|England|
|  211028|      2| 0.4|   Australia|     England|         4|           0|England|Australia|England|


In [57]:
from pyspark.sql import functions as F

# Create data1 by adding the 'flip' column as a literal 0
data1 = data.withColumn("flip", F.lit(0))

# Swap the columns team1 and team2 for data2
data2 = data.withColumnRenamed("team1", "team_temp") \
            .withColumnRenamed("team2", "team1") \
            .withColumnRenamed("team_temp", "team2")

# Select necessary columns for data2
data2 = data2.select('match_id', 'innings', 'ball', 'batting_team', 'bowling_team', 
                     'curr_score', 'curr_wickets', 'team1', 'team2', 'winner')

# Add the 'flip' column as a literal 1 to data2
data2 = data2.withColumn("flip", F.lit(1))


# Concatenate data1 and data2, sort by match_id, and add 'won' column based on team1 winning
data_combined = data1.unionByName(data2).sort('match_id') \
    .withColumn("won", F.when(F.col('winner') == F.col('team1'), 1).otherwise(0))

# Select the final columns
data = data_combined.select('match_id', 'flip', 'innings', 'ball', 'curr_score', 'curr_wickets', 'won')

data = data.sort('match_id','flip', 'innings', 'ball')

# Show the resulting DataFrame
data.show(250)

+--------+----+-------+----+----------+------------+---+
|match_id|flip|innings|ball|curr_score|curr_wickets|won|
+--------+----+-------+----+----------+------------+---+
|  211028|   0|      1| 0.1|         0|           0|  1|
|  211028|   0|      1| 0.2|         1|           0|  1|
|  211028|   0|      1| 0.3|         1|           0|  1|
|  211028|   0|      1| 0.4|         1|           0|  1|
|  211028|   0|      1| 0.5|         1|           0|  1|
|  211028|   0|      1| 0.6|         3|           0|  1|
|  211028|   0|      1| 0.7|         5|           0|  1|
|  211028|   0|      1| 1.1|         5|           0|  1|
|  211028|   0|      1| 1.2|         5|           0|  1|
|  211028|   0|      1| 1.3|         7|           0|  1|
|  211028|   0|      1| 1.4|         7|           0|  1|
|  211028|   0|      1| 1.5|         7|           0|  1|
|  211028|   0|      1| 1.6|         7|           0|  1|
|  211028|   0|      1| 1.7|         8|           0|  1|
|  211028|   0|      1| 2.1|   

In [58]:
window_spec = Window.partitionBy("match_id","flip").orderBy("flip", "innings", "ball")
window_spec_ffill = Window.partitionBy("match_id").orderBy("flip", "innings", "ball").rowsBetween(Window.unboundedPreceding, 0)

# Calculate the max of "curr_score" in 1st innings as "target" otherwise forward fill
data = data.withColumn(
    "target",
    F.when(
        (F.col("innings") == 1) & (F.col("curr_score") == F.max("curr_score").over(window_spec)),
        F.col("curr_score")
    ).otherwise(F.lit(None))
)

# Forward fill the "target" column
data = data.withColumn("target", F.last("target", ignorenulls=True).over(window_spec_ffill))
data = data.withColumn("target", F.when(col("innings") == 1, 0).otherwise(col("target")))

data.show(450)

+--------+----+-------+----+----------+------------+---+------+
|match_id|flip|innings|ball|curr_score|curr_wickets|won|target|
+--------+----+-------+----+----------+------------+---+------+
|  211048|   0|      1| 0.1|         2|           0|  0|     0|
|  211048|   0|      1| 0.2|         4|           0|  0|     0|
|  211048|   0|      1| 0.3|         4|           0|  0|     0|
|  211048|   0|      1| 0.4|         5|           0|  0|     0|
|  211048|   0|      1| 0.5|         6|           0|  0|     0|
|  211048|   0|      1| 0.6|        12|           0|  0|     0|
|  211048|   0|      1| 0.7|        12|           1|  0|     0|
|  211048|   0|      1| 1.1|        12|           1|  0|     0|
|  211048|   0|      1| 1.2|        16|           1|  0|     0|
|  211048|   0|      1| 1.3|        22|           1|  0|     0|
|  211048|   0|      1| 1.4|        23|           1|  0|     0|
|  211048|   0|      1| 1.5|        23|           2|  0|     0|
|  211048|   0|      1| 1.6|        24| 

In [50]:
directory = r'D:\github\Cricket-Prediction\data\3_aftermerging'
data.toPandas().to_csv(os.path.join(directory, 'balltoballflip.csv'))