In [2]:
import sys
import os

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


# Then proceed to import and use Spark
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("CricketPrediction").getOrCreate()


# Specify the directory where your CSV files are located
directory = r'D:\github\Cricket-Prediction\data\2_processeddata'

# Read batting,bowling,fielding CSV files from the specified directory
batting = spark.read.csv(directory + r'\batting.csv', header=True, inferSchema=True)
bowling = spark.read.csv(directory + r'\bowling.csv', header=True, inferSchema=True).drop('Mat','Inns')
fielding = spark.read.csv(directory + r'\fielding.csv', header=True, inferSchema=True).drop('Mat','Inns')
batting.show(5)

+----------+--------+-------+-------------+--------------+--------------+---------------+------+---------+
|    Player| Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|player_id|
+----------+--------+-------+-------------+--------------+--------------+---------------+------+---------+
|A Ahmadhel|Bulgaria|2019/20|            0|             0|             0|            0.0|   0.0| 55a5cffb|
|A Ahmadhel|Bulgaria|   2020|            3|             2|            16|            8.0| 100.0| 55a5cffb|
|A Ahmadhel|Bulgaria|2020/21|            4|             3|            24|            8.0| 100.0| 55a5cffb|
|A Ahmadhel|Bulgaria|   2021|            6|             4|            26|            6.5| 82.14| 55a5cffb|
|A Ahmadhel|Bulgaria|   2023|            9|             7|            31|           4.43| 63.42| 55a5cffb|
+----------+--------+-------+-------------+--------------+--------------+---------------+------+---------+
only showing top 5 rows



In [3]:
# Check for nulls in the data
def check_nulls(df):
    nulls = 0
    for col in df.columns:
        nulls+= df.filter(df[col].isNull()).count()
    print("Total nulls in the dataframe: ", nulls)

check_nulls(batting)
check_nulls(bowling)
check_nulls(fielding)

Total nulls in the dataframe:  0
Total nulls in the dataframe:  0
Total nulls in the dataframe:  0


In [4]:
playerdata = batting.join(bowling, on=['player_id','Player',"Country","Season"], how='inner').join(fielding, on=['player_id','Player',"Country","Season"], how='inner')
playerdata.show(5)

+---------+----------+--------+-------+-------------+--------------+--------------+---------------+------+--------------+---------------+------------------+---------------+---------------+---------------+--------------+---------------+--------------+-------------+-------------+--------------+
|player_id|    Player| Country| Season|Cum Mat Total|Cum Inns Total|Cum Runs Total|Cum Batting Ave|Cum SR|Cumulative Mat|Cumulative Inns|  Cumulative Overs|Cumulative Runs|Cumulative Wkts|Cumulative Econ|Cumulative Mat|Cumulative Inns|Cumulative Dis|Cumulative Ct|Cumulative St|Cumulative D/I|
+---------+----------+--------+-------+-------------+--------------+--------------+---------------+------+--------------+---------------+------------------+---------------+---------------+---------------+--------------+---------------+--------------+-------------+-------------+--------------+
| 55a5cffb|A Ahmadhel|Bulgaria|2019/20|            0|             0|             0|            0.0|   0.0|            

In [5]:
playerdata.toPandas().to_csv(directory + r'\playerStats.csv', index=False)