In [79]:
# IPL Data Analysis
"""
1. findspark: Utility to locate and initialize Spark installation
2. SparkSession: Entry point for DataFrame and SQL functionality
3. Window: For window aggregation operations
4. sql.functions (F): Collection of built-in functions
- SparkSession initialization with local mode
- All cores utilization with local[*]
- Application name setting for monitoring
"""

import findspark
findspark.init()  # This automatically finds Spark installation(local spark)

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window
import pyspark.sql.functions as F

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("IPL Data Analysis") \
    .master("local[*]") \
    .getOrCreate()

print("Spark Session Created Successfully!")

Spark Session Created Successfully!


In [80]:
# Load the CSV file
"""
1. spark.read: DataFrame reader interface
2. option(): Configures CSV reading options
3. show(): Display DataFrame content

- Loads deliveries.csv with header recognition
- Initial preview of first 5 rows to verify data loading
- No schema enforcement at this stage (auto-detection)

- Headers are read from the first row
- All columns are initially inferred as strings
- Data preview helps in understanding the structure
"""

deliveries_df = spark.read.option('header', 'true').csv('data/deliveries.csv')
print("\n--- Initial Data Preview ---")
deliveries_df.show(5)


--- Initial Data Preview ---
+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|        batting_team|        bowling_team|over|ball|     batter| bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   1| SC Ganguly|P Kumar|BB McCullum|           0|         1|         1|    legbyes|        0|              NA|            NA|     NA|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   2|BB McCullum|P Kumar| SC Ganguly|           0|         0|         0|       NULL|        0|              NA

In [81]:
# Check distinct match IDs
"""
1. select(): Column selection
2. distinct(): Remove duplicates
3. sort(): Order results
4. F.col(): Column reference for operations

- Identifies all unique matches in the dataset
- Sorts match IDs in descending order
- Helps understand the tournament coverage

- Shows total number of matches
- Helps identify specific matches for analysis
- Useful for data validation and scope definition
"""

print("\n--- Distinct Match IDs (Sorted Descending) ---")
deliveries_df.select('match_id').distinct().sort(F.col("match_id").desc()).show()


--- Distinct Match IDs (Sorted Descending) ---
+--------+
|match_id|
+--------+
|  981019|
|  981017|
|  981015|
|  981013|
|  981011|
|  981009|
|  981007|
|  981005|
|  981003|
|  981001|
|  980999|
|  980997|
|  980995|
|  980993|
|  980991|
|  980989|
|  980987|
|  980985|
|  980983|
|  980981|
+--------+
only showing top 20 rows

+--------+
|match_id|
+--------+
|  981019|
|  981017|
|  981015|
|  981013|
|  981011|
|  981009|
|  981007|
|  981005|
|  981003|
|  981001|
|  980999|
|  980997|
|  980995|
|  980993|
|  980991|
|  980989|
|  980987|
|  980985|
|  980983|
|  980981|
+--------+
only showing top 20 rows



In [82]:
# Check initial schema
"""
1. printSchema(): Displays DataFrame structure
2. Data type inspection
3. Column structure analysis

- Shows the automatically inferred schema
- Helps identify data type issues
- Guides schema optimization needs
- All columns initially loaded as strings
- Need for proper data type conversion identified
- Foundation for schema optimization
"""

print("\n--- Initial Schema ---")
deliveries_df.printSchema()


--- Initial Schema ---
root
 |-- match_id: string (nullable = true)
 |-- inning: string (nullable = true)
 |-- batting_team: string (nullable = true)
 |-- bowling_team: string (nullable = true)
 |-- over: string (nullable = true)
 |-- ball: string (nullable = true)
 |-- batter: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- non_striker: string (nullable = true)
 |-- batsman_runs: string (nullable = true)
 |-- extra_runs: string (nullable = true)
 |-- total_runs: string (nullable = true)
 |-- extras_type: string (nullable = true)
 |-- is_wicket: string (nullable = true)
 |-- player_dismissed: string (nullable = true)
 |-- dismissal_kind: string (nullable = true)
 |-- fielder: string (nullable = true)



In [83]:
# Define proper schema with correct data types
"""
1. StructField: Defines column properties
2. IntegerType/StringType: Data type definitions
3. List comprehension for schema creation
4. StructType: Schema container
- Identifies numeric columns
- Creates optimized schema structure
- Reloads data with proper types
- Numeric columns converted to IntegerType
- Other columns remain as StringType
- Improves query performance and memory usage
- Enables proper numeric operations
"""

int_col = ['match_id', 'inning', 'over', 'ball', 'batsman_runs',
           'extra_runs', 'total_runs', 'is_wicket']

fields = [StructField(col, IntegerType(), nullable=True) if col in int_col  
          else StructField(col, StringType(), nullable=True) for col in deliveries_df.columns]

# Reload with proper schema
deliveries_df = spark.read.option('header', 'true').schema(StructType(fields)).csv('data/deliveries.csv')
print("\n--- Updated Schema ---")
deliveries_df.printSchema()


--- Updated Schema ---
root
 |-- match_id: integer (nullable = true)
 |-- inning: integer (nullable = true)
 |-- batting_team: string (nullable = true)
 |-- bowling_team: string (nullable = true)
 |-- over: integer (nullable = true)
 |-- ball: integer (nullable = true)
 |-- batter: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- non_striker: string (nullable = true)
 |-- batsman_runs: integer (nullable = true)
 |-- extra_runs: integer (nullable = true)
 |-- total_runs: integer (nullable = true)
 |-- extras_type: string (nullable = true)
 |-- is_wicket: integer (nullable = true)
 |-- player_dismissed: string (nullable = true)
 |-- dismissal_kind: string (nullable = true)
 |-- fielder: string (nullable = true)



In [84]:
# Filter for IPL Final 2024 (match_id = 1426312)
"""
1. filter(): SQL-like data filtering
2. show(): Display results
3. Boolean condition evaluation

- Isolates specific match data (Final match)
- Creates focused dataset for detailed analysis
- First step in match-specific analysis
- Uses match_id for precise filtering
- Reduces data volume for analysis
- Prepares for innings-level analysis
"""

ipl_final_df = deliveries_df.filter('match_id == 1426312')
print("\n--- IPL Final Match Data ---")
ipl_final_df.show(10)


--- IPL Final Match Data ---
+--------+------+-------------------+--------------------+----+----+---------------+--------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|       batting_team|        bowling_team|over|ball|         batter|  bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+-------------------+--------------------+----+----+---------------+--------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
| 1426312|     1|Sunrisers Hyderabad|Kolkata Knight Ri...|   0|   1|Abhishek Sharma|MA Starc|    TM Head|           0|         0|         0|       NULL|        0|              NA|            NA|     NA|
| 1426312|     1|Sunrisers Hyderabad|Kolkata Knight Ri...|   0|   2|Abhishek Sharma|MA Starc|    TM Head|           0|         0|         0|       NULL|      

In [85]:
# First Innings Analysis
"""
1. filter(): Innings-level filtering
2. show(): Data preview
3. Boolean condition for innings selection
- Isolates first innings data
- Creates foundation for batting/bowling analysis
- Prepares for detailed scorecard creation
- Focuses on first innings (inning == 1)
- Sets up for detailed statistical analysis
- Important for match progression analysis
"""

first_innings_batting = ipl_final_df.filter('inning == 1')
print("\n--- First Innings Data ---")
first_innings_batting.show(10)


--- First Innings Data ---
+--------+------+-------------------+--------------------+----+----+---------------+--------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|       batting_team|        bowling_team|over|ball|         batter|  bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+-------------------+--------------------+----+----+---------------+--------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
| 1426312|     1|Sunrisers Hyderabad|Kolkata Knight Ri...|   0|   1|Abhishek Sharma|MA Starc|    TM Head|           0|         0|         0|       NULL|        0|              NA|            NA|     NA|
| 1426312|     1|Sunrisers Hyderabad|Kolkata Knight Ri...|   0|   2|Abhishek Sharma|MA Starc|    TM Head|           0|         0|         0|       NULL|        

In [86]:
# BATTING SCORECARD
"""
1. groupBy(): Aggregation by batsman
2. agg(): Multiple aggregation functions
3. when(): Conditional counting
4. withColumn(): Column creation
5. Window functions: Determining batting order
6. join(): Combining statistics with order
- Calculates comprehensive batting statistics:
  * Total runs scored
  * Balls faced
  * Boundaries (4s and 6s)
  * Strike rate
- Determines correct batting order
- Creates complete batting scorecard

1. Basic Statistics:
   - Runs, balls, boundaries
   - Strike rate calculation
2. Batting Order:
   - Uses over.ball for chronological order
   - Window function for sequential numbering
3. Final Scorecard:
   - Joins stats with batting order
   - Orders by batting position
"""

scorecard_df = first_innings_batting.filter("extras_type is NULL").groupBy('batter').agg(
    F.sum('batsman_runs').alias('runs'),
    F.count('ball').alias('balls'),
    F.count(F.when(first_innings_batting.batsman_runs == 4, 1)).alias('4s'),
    F.count(F.when(first_innings_batting.batsman_runs == 6, 1)).alias('6s'),
    F.round(F.sum('batsman_runs') * 100 / F.count('ball'), 2).alias('S/R')
)

# Get batting order
batsman_order = first_innings_batting.withColumn(
    'over-ball', 
    (F.concat(F.col("over"), F.lit("."), F.col("ball"))).cast(FloatType())
).groupBy("batter").agg(
    F.min("over-ball").alias("order")
).orderBy("order")

batting_order_df = batsman_order.withColumn(
    "batting_order", 
    F.row_number().over(Window.orderBy("order"))
)

# Join batting stats with batting order
batting_scorecard_final = scorecard_df.join(
    batting_order_df, 
    on=['batter'], 
    how='inner'
).select('batting_order', 'batter', 'runs', 'balls', '4s', '6s', 'S/R').orderBy('batting_order')

print("\n--- BATTING SCORECARD (First Innings) ---")
batting_scorecard_final.show()


--- BATTING SCORECARD (First Innings) ---
+-------------+-------------------+----+-----+---+---+------+
|batting_order|             batter|runs|balls| 4s| 6s|   S/R|
+-------------+-------------------+----+-----+---+---+------+
|            1|    Abhishek Sharma|   2|    5|  0|  0|  40.0|
|            2|        RA Tripathi|   9|   12|  1|  0|  75.0|
|            3|            TM Head|   0|    1|  0|  0|   0.0|
|            4|         AK Markram|  20|   23|  3|  0| 86.96|
|            5|Nithish Kumar Reddy|  13|   10|  1|  1| 130.0|
|            6|          H Klaasen|  16|   17|  1|  0| 94.12|
|            7|      Shahbaz Ahmed|   8|    7|  0|  1|114.29|
|            8|        Abdul Samad|   4|    4|  0|  0| 100.0|
|            9|         PJ Cummins|  24|   17|  2|  1|141.18|
|           10|         JD Unadkat|   4|   11|  0|  0| 36.36|
|           11|            B Kumar|   0|    1|  0|  0|   0.0|
+-------------+-------------------+----+-----+---+---+------+

+-------------+-----------

In [87]:
# BOWLING SCORECARD
"""
1. groupBy(): Aggregation by bowler
2. Complex when conditions: Handling extras
3. Mathematical operations: Overs calculation
4. Column manipulation and renaming

- Calculates detailed bowling statistics:
  * Runs conceded (excluding extras)
  * Overs bowled
  * Wickets taken
  * Economy rate
- Handles special cases (extras)
1. Runs Analysis:
   - Total runs conceded
   - Adjustment for extras
2. Over Calculation:
   - Converts balls to overs
   - Proper format (overs.balls)
3. Economy Calculation:
   - Runs per over
   - Rounded to 2 decimals
"""

scorecard_bowler_df = first_innings_batting.groupBy('bowler').agg(
    F.sum('total_runs').alias('runs_conceded'),
    F.sum((F.when((F.col("extras_type") == "legbyes") | (F.col("extras_type") == "byes"), 
                  F.col("extra_runs")))).alias('not_by_bowler'),
    F.count(F.when((F.col("extras_type").isNull()) | (F.col("extras_type") == "legbyes") | 
                   (F.col("extras_type") == "byes"), 1)).alias('balls'), 
    F.count(F.when(F.col("is_wicket") == 1, 1)).alias('W')
)

# Format bowling figures
scorecard_bowler_df = scorecard_bowler_df.select(
    F.col('bowler'),
    F.concat(F.floor(F.col('balls') / 6), F.lit("."), (F.col('balls') % 6)).alias('O'), 
    (F.col('runs_conceded') - F.coalesce(F.col('not_by_bowler'), F.lit(0))).alias('R'),
    F.col('W'),
    F.round((F.col('runs_conceded') / (F.col('balls') / 6)), 2).alias('Econ')
)

In [88]:
# MAIDEN OVERS CALCULATION
"""
1. Multiple groupBy operations
2. Complex filtering conditions
3. Chained DataFrame operations
4. Left join for complete stats

- Identifies maiden overs (0 runs in 6 balls)
- Accounts for extras correctly
- Integrates with bowling figures

1. Over-level Analysis:
   - Groups by bowler and over
   - Counts balls and runs
2. Maiden Over Criteria:
   - 6 balls bowled
   - No runs from bat
   - Handles extras properly
3. Final Integration:
   - Joins with main bowling stats
   - Completes bowling scorecard
"""

maiden_bowler_df = first_innings_batting.groupBy('bowler', 'over').agg(
    F.sum('total_runs').alias('runs_conceded'),
    F.count(F.col('over')).alias('balls'),
    F.sum((F.when((F.col("extras_type") == "legbyes") | (F.col("extras_type") == "byes"), 
                  F.col("extra_runs")))).alias('not_by_bowler')
)

maiden_bowler_df = maiden_bowler_df.withColumn(
    'runs_by_bowler', 
    F.col('runs_conceded') - F.coalesce(F.col('not_by_bowler'), F.lit(0))
)

# Filter maiden overs (0 runs in 6 balls)
maiden_bowler_df = maiden_bowler_df.filter(
    (F.col('runs_by_bowler') == 0) & (F.col('balls') == 6)
).groupBy('bowler').agg(
    F.count('bowler').alias('M')
)

# Join bowling stats with maiden overs
bowling_scorecard_final = scorecard_bowler_df.join(
    maiden_bowler_df, 
    on=['bowler'], 
    how='left'
).fillna(value=0).select('bowler', 'O', 'M', 'R', 'W', 'Econ')

print("\n--- BOWLING SCORECARD (First Innings) ---")
bowling_scorecard_final.show()


--- BOWLING SCORECARD (First Innings) ---
+------------+---+---+---+---+----+
|      bowler|  O|  M|  R|  W|Econ|
+------------+---+---+---+---+----+
|Harshit Rana|4.0|  1| 24|  2| 6.0|
|   SP Narine|4.0|  0| 16|  1| 5.0|
|    MA Starc|3.0|  0| 14|  2|4.67|
|  AD Russell|2.3|  0| 19|  3| 7.6|
|    VG Arora|3.0|  0| 24|  1|8.67|
|    CV Varun|2.0|  0|  9|  1| 5.0|
+------------+---+---+---+---+----+

+------------+---+---+---+---+----+
|      bowler|  O|  M|  R|  W|Econ|
+------------+---+---+---+---+----+
|Harshit Rana|4.0|  1| 24|  2| 6.0|
|   SP Narine|4.0|  0| 16|  1| 5.0|
|    MA Starc|3.0|  0| 14|  2|4.67|
|  AD Russell|2.3|  0| 19|  3| 7.6|
|    VG Arora|3.0|  0| 24|  1|8.67|
|    CV Varun|2.0|  0|  9|  1| 5.0|
+------------+---+---+---+---+----+



In [89]:
# Save results to CSV
"""
1. show(): Display final results
2. Multiple DataFrame displays
3. Final scorecards presentation

- Displays complete batting scorecard
- Shows final bowling figures
- Provides match summary
- Complete first innings analysis
- Both batting and bowling perspectives
- Ready for further analysis or export
"""

print("\n--- BATTING SCORECARD (First Innings) ---")
batting_scorecard_final.show()

print("\n--- BOWLING SCORECARD (First Innings) ---")
bowling_scorecard_final.show()


--- BATTING SCORECARD (First Innings) ---
+-------------+-------------------+----+-----+---+---+------+
|batting_order|             batter|runs|balls| 4s| 6s|   S/R|
+-------------+-------------------+----+-----+---+---+------+
|            1|    Abhishek Sharma|   2|    5|  0|  0|  40.0|
|            2|        RA Tripathi|   9|   12|  1|  0|  75.0|
|            3|            TM Head|   0|    1|  0|  0|   0.0|
|            4|         AK Markram|  20|   23|  3|  0| 86.96|
|            5|Nithish Kumar Reddy|  13|   10|  1|  1| 130.0|
|            6|          H Klaasen|  16|   17|  1|  0| 94.12|
|            7|      Shahbaz Ahmed|   8|    7|  0|  1|114.29|
|            8|        Abdul Samad|   4|    4|  0|  0| 100.0|
|            9|         PJ Cummins|  24|   17|  2|  1|141.18|
|           10|         JD Unadkat|   4|   11|  0|  0| 36.36|
|           11|            B Kumar|   0|    1|  0|  0|   0.0|
+-------------+-------------------+----+-----+---+---+------+


--- BOWLING SCORECARD (Fi

In [90]:
"""
1. spark.stop(): Properly close Spark session
2. Resource cleanup
3. Memory management
- Ensures proper cleanup of resources
- Prevents memory leaks
"""

# Stop Spark Session
spark.stop()