In [51]:
import os
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("matchPlayersStats").getOrCreate()

directory = r'D:\github\Cricket-Prediction\data\2_processedData'  # for local
# directory = '/app/dataInHandNow/afterpreprocessed'  # for docker

matches = spark.read.csv(os.path.join(directory, 'matches.csv'), header=True, inferSchema=True)
matchPlayers = spark.read.csv(os.path.join(directory, 'Matchplayers.csv'), header=True, inferSchema=True).sort('match_id')
playerStats = spark.read.csv(os.path.join(directory, 'playerStats.csv'), header=True, inferSchema=True)
matchPlayers.show(5)

+---------+------------+---------+------+--------+
|  country|      player|player_id|season|match_id|
+---------+------------+---------+------+--------+
|Australia|AC Gilchrist| 2b6e6dec|  2005|  211028|
|Australia|  RT Ponting| 7d415ea5|  2005|  211028|
|Australia|   ML Hayden| d8699ab7|  2005|  211028|
|Australia|   A Symonds| bd77eb62|  2005|  211028|
|Australia|   MJ Clarke| f842c2cf|  2005|  211028|
+---------+------------+---------+------+--------+
only showing top 5 rows



In [52]:
from pyspark.sql import functions as F

matchPlayers = matchPlayers.withColumn("flip", F.lit(0))
matchPlayers.show(5)

+---------+------------+---------+------+--------+----+
|  country|      player|player_id|season|match_id|flip|
+---------+------------+---------+------+--------+----+
|Australia|AC Gilchrist| 2b6e6dec|  2005|  211028|   0|
|Australia|  RT Ponting| 7d415ea5|  2005|  211028|   0|
|Australia|   ML Hayden| d8699ab7|  2005|  211028|   0|
|Australia|   A Symonds| bd77eb62|  2005|  211028|   0|
|Australia|   MJ Clarke| f842c2cf|  2005|  211028|   0|
+---------+------------+---------+------+--------+----+
only showing top 5 rows



In [53]:
from pyspark.sql import Window
from pyspark.sql.functions import col, lit, row_number

# Step 1: Create a window to assign row numbers within each match_id
window_spec = Window.partitionBy("match_id").orderBy("flip")

# Step 2: Assign row numbers to divide into two teams within each match_id
matchPlayers = matchPlayers.withColumn("row_num", row_number().over(window_spec))

# Step 3: Split data into Team A and Team B based on row number
team_a = matchPlayers.filter(col("row_num") <= 11).withColumn("flip", lit(0))  # Original Team A
team_b = matchPlayers.filter(col("row_num") > 11).withColumn("flip", lit(0))  # Original Team B

# Step 4: Create swapped teams with opposite order
team_b_swapped = team_a.withColumn("flip", lit(1))  # Team B followed by Team A (swapped)
team_a_swapped = team_b.withColumn("flip", lit(1))

# Step 5: Concatenate the original and swapped dataframes
original_teams = team_a.unionByName(team_b).orderBy("country", "player_id")  # Order by country and player_id in the original order
swapped_teams = team_b_swapped.unionByName(team_a_swapped).orderBy("country")  # Order by country and player_id in the swapped order

# Step 6: Combine original and swapped teams, ordering by match_id, flip, and player_id
matchPlayers = original_teams.unionByName(swapped_teams).orderBy(["match_id", "flip", "country"])

# Select the desired columns and display the result
matchPlayers = matchPlayers.select(["match_id", "flip", "player_id", "country", "player", "season"])
matchPlayers.show(44)

+--------+----+---------+---------+--------------+------+
|match_id|flip|player_id|  country|        player|season|
+--------+----+---------+---------+--------------+------+
|  211028|   0| bd77eb62|Australia|     A Symonds|  2005|
|  211028|   0| 2b6e6dec|Australia|  AC Gilchrist|  2005|
|  211028|   0| 69762509|Australia|     DR Martyn|  2005|
|  211028|   0| 74234d66|Australia|  JN Gillespie|  2005|
|  211028|   0| 7d415ea5|Australia|    RT Ponting|  2005|
|  211028|   0| 8d0ea930|Australia| MS Kasprowicz|  2005|
|  211028|   0| 48fd7349|Australia|    MEK Hussey|  2005|
|  211028|   0| d8699ab7|Australia|     ML Hayden|  2005|
|  211028|   0| dd09ff8e|Australia|         B Lee|  2005|
|  211028|   0| ee7d0c82|Australia|    GD McGrath|  2005|
|  211028|   0| f842c2cf|Australia|     MJ Clarke|  2005|
|  211028|   0| 39f01cdb|  England|  KP Pietersen|  2005|
|  211028|   0| a386e91b|  England|PD Collingwood|  2005|
|  211028|   0| 2e929b99|  England|      GO Jones|  2005|
|  211028|   0

In [68]:
# Include row_num in the join
matchPlayersStats = matchPlayers.join(playerStats, on=['player_id','season'], how='inner')
matchPlayersStats = matchPlayersStats.sort("match_id", "flip")

# Display the result starting from the 45th row
matchPlayersStats.offset(44).show(44)

+---------+-------+--------+----+-----------+-------------+-------------+-----------+-------------+--------------+--------------+---------------+------+---------------+-----------------+----------------+---------------+---------------+---------------+----------------+-----------------+--------------+-------------+-------------+--------------+
|player_id| season|match_id|flip|    country|       player|       Player|    Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|Cumulative Mat9|Cumulative Inns10|Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Mat15|Cumulative Inns16|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+-------+--------+----+-----------+-------------+-------------+-----------+-------------+--------------+--------------+---------------+------+---------------+-----------------+----------------+---------------+---------------+---------------+----------------+-----------------+--------------+---------

In [None]:
match_id = matchPlayersStats.groupBy('match_id').count().filter(col('count') == 44).select('match_id')
match_id_list = match_id.collect()
len(match_id_list)

1200

In [80]:
# Extract match_id values from the collected rows
match_id_values = [row.match_id for row in match_id_list]

# Filter matchPlayersStats using the extracted match_id values
matchPlayersStats = matchPlayersStats.filter(col('match_id').isin(match_id_values))
matchPlayersStats.show(5)

+---------+------+--------+----+---------+-------------+-------------+---------+-------------+--------------+--------------+---------------+------+---------------+-----------------+----------------+---------------+---------------+---------------+----------------+-----------------+--------------+-------------+-------------+--------------+
|player_id|season|match_id|flip|  country|       player|       Player|  Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|Cumulative Mat9|Cumulative Inns10|Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Mat15|Cumulative Inns16|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+------+--------+----+---------+-------------+-------------+---------+-------------+--------------+--------------+---------------+------+---------------+-----------------+----------------+---------------+---------------+---------------+----------------+-----------------+--------------+-------------+----------

In [84]:
matchPlayersStats = matchPlayersStats.drop('country','player','player_id','season','Player','Country')
matchPlayersStats.show()

+--------+----+-------------+--------------+--------------+---------------+------+---------------+-----------------+----------------+---------------+---------------+---------------+----------------+-----------------+--------------+-------------+-------------+--------------+
|match_id|flip|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|Cumulative Mat9|Cumulative Inns10|Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Mat15|Cumulative Inns16|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+--------+----+-------------+--------------+--------------+---------------+------+---------------+-----------------+----------------+---------------+---------------+---------------+----------------+-----------------+--------------+-------------+-------------+--------------+
|  211028|   0|            1|             1|             1|            1.0| 33.33|              1|              0.0|             0.0|            0.0|            0.0|          

In [86]:
num_rows = matchPlayersStats.count()
num_cols = len(matchPlayersStats.columns)
(num_rows, num_cols)

(52800, 19)

In [87]:
directory = r'D:\github\Cricket-Prediction\data\3_aftermerging'  # for local
matchPlayersStats.toPandas().to_csv(os.path.join(directory, 'playersStatsflip.csv'))