In [20]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..','..','..','..'))
from configs import spark_config as config
from utils import spark_utils as utils

# Start the Spark session used by every cell below, with explicit executor sizing.
spark = utils.create_spark_session(
    "playerStats",
    {
        'spark.executor.memory': '4g',
        'spark.executor.cores': '6',
    },
)

# Read the three processed CSV tables; the files are loaded in the order
# fielding -> bowling -> batting.
fielding_data, bowling_data, batting_data = (
    utils.load_data(spark, config.PROCESSED_DATA_DIR, csv_name)
    for csv_name in ('fielding_data.csv', 'bowling_data.csv', 'batting_data.csv')
)

# Preview the first five fielding rows as a quick sanity check.
fielding_data.show(5)

[[34m2024-11-24T14:27:32.333+0530[0m] {[34mspark_utils.py:[0m17} INFO[0m - Creating Spark session.[0m
[[34m2024-11-24T14:27:32.470+0530[0m] {[34mspark_utils.py:[0m37} INFO[0m - Spark session created successfully.[0m
[[34m2024-11-24T14:27:32.472+0530[0m] {[34mspark_utils.py:[0m46} INFO[0m - Loading data from fielding_data.csv.[0m


24/11/24 14:27:32 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
[Stage 1:>                                                          (0 + 1) / 1]

[[34m2024-11-24T14:27:38.093+0530[0m] {[34mspark_utils.py:[0m46} INFO[0m - Loading data from bowling_data.csv.[0m


                                                                                

[[34m2024-11-24T14:27:38.568+0530[0m] {[34mspark_utils.py:[0m46} INFO[0m - Loading data from batting_data.csv.[0m
+---------+------------+---------+-------+--------------+---------------+--------------+-------------+-------------+--------------+
|player_id|      Player|  Country| Season|Cumulative Mat|Cumulative Inns|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+------------+---------+-------+--------------+---------------+--------------+-------------+-------------+--------------+
| b8d490fd|    AJ Finch|Australia|2016/17|            28|           28.0|           7.0|          7.0|          0.0|          0.25|
| b970a03f|   M Klinger|Australia|2016/17|             0|            0.0|           0.0|          0.0|          0.0|           0.0|
| 12b610c2|     TM Head|Australia|2016/17|             4|            4.0|           2.0|          2.0|          0.0|           0.5|
| 32198ae0|MC Henriques|Australia|2016/17|             6|            6.0|           2.0|

In [21]:
# Null check helper for the loaded tables
def check_nulls(df):
    """Print the total number of null entries found across all columns of ``df``.

    For every column, the rows where that column is null are counted; the
    per-column counts are added together and the sum is printed.

    Note: each column triggers its own ``filter(...).count()`` on ``df``.

    Args:
        df: DataFrame-like object exposing ``columns``, ``df[name].isNull()``
            and ``filter(...).count()``.

    Returns:
        None. The result is only printed.
    """
    null_total = sum(
        df.filter(df[column_name].isNull()).count()
        for column_name in df.columns
    )
    print("Total nulls in the dataframe: ", null_total)

# Run the null check on each loaded table: batting, then bowling, then fielding.
for stats_frame in (batting_data, bowling_data, fielding_data):
    check_nulls(stats_frame)

Total nulls in the dataframe:  0
Total nulls in the dataframe:  0
Total nulls in the dataframe:  0


In [22]:
# Show each table's column names next to its row count
# (batting, then bowling, then fielding).
for stats_frame in (batting_data, bowling_data, fielding_data):
    print(stats_frame.columns, stats_frame.count())

['player_id', 'Player', 'Country', 'Season', 'Cum Mat Total', 'Cum Runs Total', 'Cum SR'] 51324
['player_id', 'Player', 'Country', 'Season', 'Cumulative Mat', 'Cumulative Inns', 'Cumulative Overs', 'Cumulative Mdns', 'Cumulative Bowling Runs', 'Cumulative Wkts', 'Cumulative Econ'] 51324
['player_id', 'Player', 'Country', 'Season', 'Cumulative Mat', 'Cumulative Inns', 'Cumulative Dis', 'Cumulative Ct', 'Cumulative St', 'Cumulative D/I'] 51324


In [23]:
# Combine batting, bowling and fielding stats into one row per player/country/season.
#
# `player_id` is part of the join key: joining on the display name alone would
# pair the batting rows of one player with the bowling/fielding rows of any
# other player who shares the same Player/Country/Season, and would leave three
# ambiguous `player_id` columns in the result.
# NOTE(review): assumes `player_id` identifies the same player in all three
# input files - confirm against the step that produced them.
join_keys = ['player_id', 'Player', 'Country', 'Season']

# De-duplicate each input on the key BEFORE joining so repeated key rows are not
# multiplied across the two joins and then collapsed again afterwards.
# As with any dropDuplicates on a column subset, which of several duplicate
# rows survives is not specified.
batting_unique = batting_data.dropDuplicates(join_keys)
# The match/innings counters of the bowling and fielding tables are not kept in
# the combined output, so they are removed before the join.
bowling_unique = bowling_data.dropDuplicates(join_keys).drop('Cumulative Mat', 'Cumulative Inns')
fielding_unique = fielding_data.dropDuplicates(join_keys).drop('Cumulative Mat', 'Cumulative Inns')

playerdata = (
    batting_unique
    .join(bowling_unique, on=join_keys, how='inner')
    .join(fielding_unique, on=join_keys, how='inner')
    # The identifier is only needed for matching; the output keeps the same
    # columns as before (Player, Country, Season, then the stat columns).
    .drop('player_id')
)
print(playerdata.count())
playerdata.show(5)

                                                                                

12103


[Stage 110:>                                                        (0 + 1) / 1]

+---------------+----------------+-------+-------------+--------------+------+----------------+---------------+-----------------------+---------------+---------------+--------------+-------------+-------------+--------------+
|         Player|         Country| Season|Cum Mat Total|Cum Runs Total|Cum SR|Cumulative Overs|Cumulative Mdns|Cumulative Bowling Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------------+----------------+-------+-------------+--------------+------+----------------+---------------+-----------------------+---------------+---------------+--------------+-------------+-------------+--------------+
| AB de Villiers|    South Africa|   2017|           73|          1457|128.93|             0.0|            0.0|                    0.0|            0.0|            0.0|          68.0|         61.0|          7.0|          0.93|
|    TA Blundell|     New Zealand|2017/18|            1|             0|   0.0|             0.0| 

                                                                                

In [24]:
# Preview the combined table ordered by player, then country, then season.
# Season values such as '2020/21' and '2021' are compared as strings here.
playerdata.orderBy("Player", "Country", "Season").show()

[Stage 115:>                                                        (0 + 1) / 1]

+-----------+--------------+-------+-------------+--------------+------+------------------+---------------+-----------------------+---------------+---------------+--------------+-------------+-------------+--------------+
|     Player|       Country| Season|Cum Mat Total|Cum Runs Total|Cum SR|  Cumulative Overs|Cumulative Mdns|Cumulative Bowling Runs|Cumulative Wkts|Cumulative Econ|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+-----------+--------------+-------+-------------+--------------+------+------------------+---------------+-----------------------+---------------+---------------+--------------+-------------+-------------+--------------+
| A Ahmadhel|      Bulgaria|2020/21|            4|            24| 100.0|              12.0|            0.0|                   97.0|            4.0|           8.38|           0.0|          0.0|          0.0|           0.0|
| A Ahmadhel|      Bulgaria|   2021|            6|            26| 82.14|14.400000095367432|            0.0|     

                                                                                

In [29]:
# Write the combined player table as 'player_stats.csv' through the project
# helper, into the same processed-data directory the inputs were loaded from.
utils.spark_save_data(playerdata,config.PROCESSED_DATA_DIR, 'player_stats.csv')
# Stop the Spark session once the save call has returned; any later cell that
# needs `spark` must create a new session first.
spark.stop()

[Stage 140:>                                                        (0 + 1) / 1]

[[34m2024-11-24T14:30:47.761+0530[0m] {[34mspark_utils.py:[0m64} INFO[0m - Successfully wrote data to /usr/ravi/t20/data/2_processedData/player_stats.csv[0m


                                                                                