In [28]:
import os
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('t20').config('spark.executer.memory', '4g').getOrCreate()

directory = '/usr/ravi/t20/data/2_processedData'  # for local
# directory = '/app/dataInHandNow/afterpreprocessed'  # for docker

matches = spark.read.csv(os.path.join(directory, 'matches.csv'), header=True, inferSchema=True)
matchPlayers = spark.read.csv(os.path.join(directory, 'match_players.csv'), header=True, inferSchema=True).sort('match_id')
playerStats = spark.read.csv(os.path.join(directory, 'player_stats.csv'), header=True, inferSchema=True)
playerStats.show(5)

+---------+-------+------------+--------------------+-------------+--------------+--------------+---------------+------+-----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
|player_id| Season|      Player|             Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR| Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+-------+------------+--------------------+-------------+--------------+--------------+---------------+------+-----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
| c594137f|   2024|    A Shukla|             Croatia|            0|             0|             0|            0.0|   0.0|              0.0|            0.0|            0.0|            0.0|           0.0|          0.0|          0.0|           0.0|
| 074acfb4|2023/24| 

In [29]:
from pyspark.sql import functions as F

matchPlayers = matchPlayers.withColumn("flip", F.lit(0))
matchPlayers.show(5)

+---------+------------+---------+------+--------+----+
|  country|      player|player_id|season|match_id|flip|
+---------+------------+---------+------+--------+----+
|Australia|AC Gilchrist| 2b6e6dec|  2005|  211028|   0|
|Australia|  RT Ponting| 7d415ea5|  2005|  211028|   0|
|Australia|   ML Hayden| d8699ab7|  2005|  211028|   0|
|Australia|   A Symonds| bd77eb62|  2005|  211028|   0|
|Australia|   MJ Clarke| f842c2cf|  2005|  211028|   0|
+---------+------------+---------+------+--------+----+
only showing top 5 rows



In [30]:
from pyspark.sql import Window
from pyspark.sql.functions import col, lit, row_number

# Step 1: Create a window to assign row numbers within each match_id
window_spec = Window.partitionBy("match_id").orderBy("flip")

# Step 2: Assign row numbers to divide into two teams within each match_id
matchPlayers = matchPlayers.withColumn("row_num", row_number().over(window_spec))

# Step 3: Split data into Team A and Team B based on row number
team_a = matchPlayers.filter(col("row_num") <= 11).withColumn("flip", lit(0))  # Original Team A
team_b = matchPlayers.filter(col("row_num") > 11).withColumn("flip", lit(0))  # Original Team B

# Step 4: Create swapped teams with opposite order
team_b_swapped = team_a.withColumn("flip", lit(1))  # Team B followed by Team A (swapped)
team_a_swapped = team_b.withColumn("flip", lit(1))

# Step 5: Concatenate the original and swapped dataframes
original_teams = team_a.unionByName(team_b).orderBy("country", "player_id")  # Order by country and player_id in the original order
swapped_teams = team_b_swapped.unionByName(team_a_swapped).orderBy("country")  # Order by country and player_id in the swapped order

# Step 6: Combine original and swapped teams, ordering by match_id, flip, and player_id
matchPlayers = original_teams.unionByName(swapped_teams).orderBy(["match_id", "flip", "country"])

# Select the desired columns and display the result
matchPlayers = matchPlayers.select(["match_id", "flip", "player_id", "country", "player", "season"])
matchPlayers.show(44)

                                                                                

+--------+----+---------+---------+--------------+------+
|match_id|flip|player_id|  country|        player|season|
+--------+----+---------+---------+--------------+------+
|  211028|   0| 69762509|Australia|     DR Martyn|  2005|
|  211028|   0| f842c2cf|Australia|     MJ Clarke|  2005|
|  211028|   0| 7d415ea5|Australia|    RT Ponting|  2005|
|  211028|   0| 48fd7349|Australia|    MEK Hussey|  2005|
|  211028|   0| ee7d0c82|Australia|    GD McGrath|  2005|
|  211028|   0| 2b6e6dec|Australia|  AC Gilchrist|  2005|
|  211028|   0| 8d0ea930|Australia| MS Kasprowicz|  2005|
|  211028|   0| d8699ab7|Australia|     ML Hayden|  2005|
|  211028|   0| dd09ff8e|Australia|         B Lee|  2005|
|  211028|   0| 74234d66|Australia|  JN Gillespie|  2005|
|  211028|   0| bd77eb62|Australia|     A Symonds|  2005|
|  211028|   0| b68d14a9|  England|    AJ Strauss|  2005|
|  211028|   0| c16d2e28|  England|   SJ Harmison|  2005|
|  211028|   0| ea42ddb9|  England|ME Trescothick|  2005|
|  211028|   0

In [31]:
playerStats.show(5)

+---------+-------+------------+--------------------+-------------+--------------+--------------+---------------+------+-----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
|player_id| Season|      Player|             Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR| Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+-------+------------+--------------------+-------------+--------------+--------------+---------------+------+-----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
| c594137f|   2024|    A Shukla|             Croatia|            0|             0|             0|            0.0|   0.0|              0.0|            0.0|            0.0|            0.0|           0.0|          0.0|          0.0|           0.0|
| 074acfb4|2023/24| 

In [32]:
# Include row_num in the join
matchPlayersStats = matchPlayers.join(playerStats, on=['player_id','season'], how='inner')
matchPlayersStats = matchPlayersStats.sort("match_id", "flip")

# Display the result starting from the 45th row
matchPlayersStats.show(44)



+---------+-------+--------+----+-----------+--------------+--------------+-----------+-------------+--------------+--------------+---------------+------+----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
|player_id| season|match_id|flip|    country|        player|        Player|    Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+-------+--------+----+-----------+--------------+--------------+-----------+-------------+--------------+--------------+---------------+------+----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
| fcbf5a30|   2005|  211028|   0|    England|       D Gough|       D Gough|    England|            0|             0|             0|            0.0|   0.0|            

                                                                                

In [33]:
match_id = matchPlayersStats.groupBy('match_id').count().filter(col('count') == 44).select('match_id')
match_id_list = match_id.collect()
len(match_id_list)

                                                                                

18

In [34]:
# Extract match_id values from the collected rows
match_id_values = [row.match_id for row in match_id_list]

# Filter matchPlayersStats using the extracted match_id values
matchPlayersStats = matchPlayersStats.filter(col('match_id').isin(match_id_values))
matchPlayersStats.show(5)

24/11/14 17:04:47 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /192.168.245.142:33504 is closed
24/11/14 17:04:47 WARN BlockManagerMasterEndpoint: Error trying to remove broadcast 194 from block manager BlockManagerId(4, 192.168.245.142, 38147, None)
java.io.IOException: Connection from /192.168.245.142:33504 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:147)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241)
	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundH

+---------+-------+--------+----+-----------+----------------+----------------+-----------+-------------+--------------+--------------+---------------+------+----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
|player_id| season|match_id|flip|    country|          player|          Player|    Country|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+-------+--------+----+-----------+----------------+----------------+-----------+-------------+--------------+--------------+---------------+------+----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
| 634a7b21|2016/17| 1074959|   0|Netherlands|       BN Cooper|       BN Cooper|Netherlands|           23|            20|           365|          22.84|111

                                                                                

In [35]:
matchPlayersStats = matchPlayersStats.drop('country','player','player_id','season','Player','Country')
matchPlayersStats.show()

24/11/14 17:05:43 WARN TransportChannelHandler: Exception in connection from /192.168.245.142:54574
java.io.IOException: Connection reset by peer
	at java.base/sun.nio.ch.FileDispatcherImpl.read0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
	at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:276)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:233)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:223)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:356)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:258)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.buffer.WrappedByteBuf.writeBytes(WrappedByteBuf.java:821)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEv

+--------+----+-------------+--------------+--------------+---------------+------+-----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
|match_id|flip|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR| Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+--------+----+-------------+--------------+--------------+---------------+------+-----------------+---------------+---------------+---------------+--------------+-------------+-------------+--------------+
| 1074959|   0|            2|             2|            13|            6.5|138.88|              7.0|           53.0|            1.0|            8.0|           1.0|          1.0|          0.0|           0.5|
| 1074959|   0|            0|             0|             0|            0.0|   0.0|              0.0|            0.0|            0.0|            0.0|           0.0|         

In [36]:
num_rows = matchPlayersStats.count()
num_cols = len(matchPlayersStats.columns)
(num_rows, num_cols)

                                                                                

(792, 15)

In [37]:
directory = r'D:\github\Cricket-Prediction\data\3_aftermerging'  # for local
matchPlayersStats.toPandas().to_csv(os.path.join(directory, 'playersStatsflip.csv'))

                                                                                

Py4JError: An error occurred while calling o588.pandasStructHandlingMode. Trace:
py4j.Py4JException: Method pandasStructHandlingMode([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)

