In [None]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), "..", "..", ".."))
from configs import spark_config as config
from utils import spark_utils as utils

# HDFS directory that holds the raw per-match CSV exports
directory = os.path.join(config.RAW_DATA_DIR, "t20s_csv2")
print(directory)
client = utils.get_hdfs_client()

# The listing is treated as bare entry names: names containing "info" are
# collected as match-info files, every other entry as a delivery file.
all_files = client.list(directory)
info_files = [os.path.join(directory, file) for file in all_files if "info" in file]
delivery_files = [
    os.path.join(directory, file) for file in all_files if "info" not in file
]

# Bare file names (no directory part) for both groups.
# os.path.basename replaces the earlier split("\\")[-1]: the paths above are
# built with os.path.join, so on a POSIX host they contain no backslash and the
# split handed back the full path instead of the file name.
# delivery_files is already filtered on "info", so no second check is needed.
matches = [os.path.basename(info_file) for info_file in info_files]
deliveries = [os.path.basename(delivery) for delivery in delivery_files]

/usr/ravi/t20/data/1_rawData/t20s_csv2
[[34m2024-11-24T13:51:33.734+0530[0m] {[34mbase.py:[0m84} INFO[0m - Retrieving connection 'webhdfs_default'[0m
[[34m2024-11-24T13:51:33.737+0530[0m] {[34mwebhdfs.py:[0m82} INFO[0m - Trying to connect to 192.168.245.142:9870[0m
[[34m2024-11-24T13:51:33.739+0530[0m] {[34mwebhdfs.py:[0m86} INFO[0m - Trying namenode 192.168.245.142[0m
[[34m2024-11-24T13:51:33.741+0530[0m] {[34mclient.py:[0m192} INFO[0m - Instantiated <InsecureClient(url='http://192.168.245.142:9870/')>.[0m
[[34m2024-11-24T13:51:33.744+0530[0m] {[34mclient.py:[0m320} INFO[0m - Fetching status for '/'.[0m
[[34m2024-11-24T13:51:33.763+0530[0m] {[34mwebhdfs.py:[0m96} INFO[0m - Using namenode 192.168.245.142 for hook[0m
[[34m2024-11-24T13:51:33.765+0530[0m] {[34mclient.py:[0m1116} INFO[0m - Listing '/usr/ravi/t20/data/1_rawData/t20s_csv2'.[0m


In [2]:
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    FloatType,
)

# Executor sizing handed to the project's session helper.
# The first argument is presumably the Spark application name -- confirm in spark_utils.
executor_settings = {
    "spark.executor.memory": "5g",
    "spark.executor.cores": "6",
}
spark = utils.create_spark_session("deliveries", executor_settings)


# Explicit schema for the deliveries data, declared as (column name, Spark type)
# pairs. Every field is nullable. noballs and penalty are read as strings here
# and converted to integers in a later cell.
delivery_field_types = [
    ("match_id", IntegerType()),
    ("season", StringType()),
    ("start_date", StringType()),
    ("venue", StringType()),
    ("innings", IntegerType()),
    ("ball", FloatType()),
    ("batting_team", StringType()),
    ("bowling_team", StringType()),
    ("striker", StringType()),
    ("non_striker", StringType()),
    ("bowler", StringType()),
    ("runs_off_bat", IntegerType()),
    ("extras", IntegerType()),
    ("wides", IntegerType()),
    ("noballs", StringType()),
    ("byes", IntegerType()),
    ("legbyes", IntegerType()),
    ("penalty", StringType()),
    ("wicket_type", StringType()),
    ("player_dismissed", StringType()),
    ("other_wicket_type", StringType()),
    ("other_player_dismissed", StringType()),
]
delivery_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in delivery_field_types
    ]
)

# Read all delivery CSV files into a single DataFrame, treating the first line
# of each file as a header and applying the explicit schema instead of inferring one.
deliveries_data = (
    spark.read
    .schema(delivery_schema)
    .option("header", True)
    .csv(delivery_files)
)
deliveries_data.show(5)

[[34m2024-11-24T13:51:33.970+0530[0m] {[34mspark_utils.py:[0m17} INFO[0m - Creating Spark session.[0m


your 131072x1 screen size is bogus. expect trouble
24/11/24 13:51:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/24 13:51:41 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.


[[34m2024-11-24T13:51:42.352+0530[0m] {[34mspark_utils.py:[0m37} INFO[0m - Spark session created successfully.[0m


[Stage 1:>                                                          (0 + 1) / 1]

+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id| season|start_date|               venue|innings|ball|batting_team|bowling_team|      striker|  non_striker|     bowler|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
| 1306389|2021/22|2022-03-22|Al Amerat Cricket...|      1| 0.1|     Bahrain|Saudi Arabia|TI Gajanayake|    R Rodrigo|C Sewsunker|           0|     0| null|   null|null|   null|   null|       null|            null|             null|    

                                                                                

In [3]:
from pyspark.sql.functions import col, sum

# Per-column null tally: each column is turned into a 0/1 "is null" flag and the
# flags are summed. `sum` here is pyspark.sql.functions.sum (imported above),
# which shadows the Python builtin for the rest of the notebook.
null_flag_totals = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in deliveries_data.columns
]
null_counts = deliveries_data.select(null_flag_totals)
null_counts.show()



+--------+------+----------+-----+-------+----+------------+------------+-------+-----------+------+------------+------+------+-------+------+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id|season|start_date|venue|innings|ball|batting_team|bowling_team|striker|non_striker|bowler|runs_off_bat|extras| wides|noballs|  byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+------+----------+-----+-------+----+------------+------------+-------+-----------+------+------------+------+------+-------+------+-------+-------+-----------+----------------+-----------------+----------------------+
|       0|     0|         0|    0|      0|   0|           0|           0|      0|          0|     0|           0|     0|829104| 860895|861878| 854592| 865627|     817721|          817721|           865640|                865640|
+--------+------+----------+-----+-------+----+------------+------------+-------+---

                                                                                

In [4]:
# Replace nulls with 0 through the DataFrame's `na` accessor. Per the recorded
# output, the string-typed columns (noballs, penalty and the wicket columns)
# are still null afterwards and are handled in later cells.
deliveries_data = deliveries_data.na.fill(0)
deliveries_data.show(5)

+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id| season|start_date|               venue|innings|ball|batting_team|bowling_team|      striker|  non_striker|     bowler|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
| 1306389|2021/22|2022-03-22|Al Amerat Cricket...|      1| 0.1|     Bahrain|Saudi Arabia|TI Gajanayake|    R Rodrigo|C Sewsunker|           0|     0|    0|   null|   0|      0|   null|       null|            null|             null|    

In [5]:
# Recount nulls per column after the preceding fill step.
# `sum` is the pyspark.sql.functions aggregate, not the Python builtin.
remaining_null_totals = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in deliveries_data.columns
]
null_counts = deliveries_data.select(remaining_null_totals)
null_counts.show()



+--------+------+----------+-----+-------+----+------------+------------+-------+-----------+------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id|season|start_date|venue|innings|ball|batting_team|bowling_team|striker|non_striker|bowler|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+------+----------+-----+-------+----+------------+------------+-------+-----------+------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|       0|     0|         0|    0|      0|   0|           0|           0|      0|          0|     0|           0|     0|    0| 860895|   0|      0| 865627|     817721|          817721|           865640|                865640|
+--------+------+----------+-----+-------+----+------------+------------+-------+-----------+---

                                                                                

In [6]:
# Show column types at this point: noballs and penalty are still strings here,
# which is why the next cell converts them to integers explicitly.
deliveries_data.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- innings: integer (nullable = true)
 |-- ball: float (nullable = false)
 |-- batting_team: string (nullable = true)
 |-- bowling_team: string (nullable = true)
 |-- striker: string (nullable = true)
 |-- non_striker: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- runs_off_bat: integer (nullable = true)
 |-- extras: integer (nullable = true)
 |-- wides: integer (nullable = true)
 |-- noballs: string (nullable = true)
 |-- byes: integer (nullable = true)
 |-- legbyes: integer (nullable = true)
 |-- penalty: string (nullable = true)
 |-- wicket_type: string (nullable = true)
 |-- player_dismissed: string (nullable = true)
 |-- other_wicket_type: string (nullable = true)
 |-- other_player_dismissed: string (nullable = true)



In [7]:
from pyspark.sql.functions import when

# noballs and penalty were read as strings: substitute '0' for nulls, then cast
# the result to integer so they match the other run-count columns.
for string_count_column in ('noballs', 'penalty'):
    raw_value = col(string_count_column)
    zero_filled = when(raw_value.isNull(), '0').otherwise(raw_value)
    deliveries_data = deliveries_data.withColumn(
        string_count_column, zero_filled.cast(IntegerType())
    )
deliveries_data.show(5)

+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id| season|start_date|               venue|innings|ball|batting_team|bowling_team|      striker|  non_striker|     bowler|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
| 1306389|2021/22|2022-03-22|Al Amerat Cricket...|      1| 0.1|     Bahrain|Saudi Arabia|TI Gajanayake|    R Rodrigo|C Sewsunker|           0|     0|    0|      0|   0|      0|      0|       null|            null|             null|    

In [8]:
from pyspark.sql.functions import when
# Collapse the dismissal columns into integer presence flags:
# 0 when the original value is null, 1 otherwise. The original text is discarded.
columns = [
    'wicket_type',
    'player_dismissed',
    'other_wicket_type',
    'other_player_dismissed',
]
for column in columns:
    is_missing = col(column).isNull()
    presence_flag = when(is_missing, '0').otherwise('1').cast(IntegerType())
    deliveries_data = deliveries_data.withColumn(column, presence_flag)

deliveries_data.show()

+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id| season|start_date|               venue|innings|ball|batting_team|bowling_team|      striker|  non_striker|     bowler|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+-------+----------+--------------------+-------+----+------------+------------+-------------+-------------+-----------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
| 1306389|2021/22|2022-03-22|Al Amerat Cricket...|      1| 0.1|     Bahrain|Saudi Arabia|TI Gajanayake|    R Rodrigo|C Sewsunker|           0|     0|    0|      0|   0|      0|      0|          0|               0|                0|    

In [9]:
# Keep only the columns carried into the processed output; start_date, venue,
# striker, non_striker and bowler are dropped here.
kept_columns = [
    'match_id',
    'season',
    'innings',
    'ball',
    'batting_team',
    'bowling_team',
    'runs_off_bat',
    'extras',
    'wides',
    'noballs',
    'byes',
    'legbyes',
    'penalty',
    'wicket_type',
    'player_dismissed',
    'other_wicket_type',
    'other_player_dismissed',
]
deliveries_data = deliveries_data.select(*kept_columns)
deliveries_data.show(5)

+--------+-------+-------+----+------------+------------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
|match_id| season|innings|ball|batting_team|bowling_team|runs_off_bat|extras|wides|noballs|byes|legbyes|penalty|wicket_type|player_dismissed|other_wicket_type|other_player_dismissed|
+--------+-------+-------+----+------------+------------+------------+------+-----+-------+----+-------+-------+-----------+----------------+-----------------+----------------------+
| 1306389|2021/22|      1| 0.1|     Bahrain|Saudi Arabia|           0|     0|    0|      0|   0|      0|      0|          0|               0|                0|                     0|
| 1306389|2021/22|      1| 0.2|     Bahrain|Saudi Arabia|           0|     1|    1|      0|   0|      0|      0|          0|               0|                0|                     0|
| 1306389|2021/22|      1| 0.3|     Bahrain|Saudi Arabia|           0|     1|    1|  

In [10]:
# Write the processed deliveries as CSV with a header row, replacing any
# previous output at the same location.
# The target is built with os.path.join, the same way the raw-data path is
# built in the first cell, so it is correct whether or not
# config.PROCESSED_DATA_DIR ends with a separator. Plain "+" concatenation
# fused the directory name and 'deliveries.csv' when it does not.
# NOTE(review): if other notebooks read this output via the concatenated path,
# update them to use the same join.
output_path = os.path.join(config.PROCESSED_DATA_DIR, 'deliveries.csv')
deliveries_data.write.csv(output_path, header=True, mode='overwrite')
spark.stop()

                                                                                