# IPL Business Intelligence Analysis

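This notebook uses PySpark to build business-intelligence (BI) summaries from IPL delivery and match data: per-match metrics, per-player batting and bowling statistics, batter-vs-bowler head-to-head records, and running match-state features.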

In [None]:
# Import required libraries
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window
import pyspark.sql.functions as F

# Build (or reuse) a Spark session running against the `local[*]` master
session_builder = SparkSession.builder.appName("IPL BI Analysis").master("local[*]")
spark = session_builder.getOrCreate()

# Read both CSV inputs: first row is the header, column types are inferred
csv_options = dict(header=True, inferSchema=True)
deliveries_df = spark.read.csv('data/deliveries.csv', **csv_options)
matches_df = spark.read.csv('data/matches.csv', **csv_options)

## 1. Per-Match Analysis

In [None]:
# Per-innings totals: runs, wickets, and the number of deliveries whose
# extras_type is non-null.
# NOTE(review): if the CSV encodes "no extra" as a literal string (e.g. "NA")
# rather than an empty field, isNotNull() is true for every row and
# extras_count equals the delivery count — confirm against the data.
innings_keys = ['match_id', 'inning']
has_extras_type = F.col('extras_type').isNotNull()
match_metrics = (
    deliveries_df
    .groupBy(*innings_keys)
    .agg(
        F.sum('total_runs').alias('innings_runs'),
        F.sum('is_wicket').alias('innings_wickets'),
        F.count(F.when(has_extras_type, 1)).alias('extras_count'),
    )
)

# Runs scored in each over of each innings
runs_per_over = (
    deliveries_df
    .groupBy(*innings_keys, 'over')
    .agg(F.sum('total_runs').alias('over_runs'))
)

# Calculate highest partnerships
# A partnership is shared by the two batters at the crease no matter which of
# them is on strike. Grouping by the ordered pair (batter, non_striker) would
# split one partnership into two groups (A facing with B, and B facing with A)
# and understate the maximum, so the pair is first normalised into an
# unordered (partner_a, partner_b) key using least/greatest.
# NOTE(review): this sums all deliveries a pair shared in an innings; if the
# same pair can bat together in two separate spells, those are merged — confirm
# that is acceptable.
partnerships = (
    deliveries_df
    .withColumn('partner_a', F.least('batter', 'non_striker'))
    .withColumn('partner_b', F.greatest('batter', 'non_striker'))
    .groupBy('match_id', 'inning', 'partner_a', 'partner_b')
    .agg(F.sum('total_runs').alias('partnership_runs'))
    .groupBy('match_id', 'inning')
    .agg(F.max('partnership_runs').alias('highest_partnership'))
)

# Preview five rows of the per-innings metrics
print("\n--- Match Metrics Sample ---")
match_metrics.show(5)

## 2. Per-Player Analysis

In [None]:
# Join with matches for season-wise analysis
# Every delivery row is paired with its match row (match_id == id) so the
# match-level columns, including the `season` used below, can be grouped on.
player_stats = deliveries_df.join(matches_df, deliveries_df.match_id == matches_df.id)

# Batting statistics, one row per (season, batter)
#   total_runs  - sum of batsman_runs
#   balls_faced - number of delivery rows with this batter on strike
#   fours/sixes - deliveries with batsman_runs equal to 4 / 6
#   dismissals  - sum of is_wicket on those rows
#   average     - total_runs / dismissals, NULL when dismissals is 0
#   strike_rate - total_runs per 100 balls_faced
# NOTE(review): balls_faced counts every delivery row, so any wides in the data
# are included; dismissals counts wickets on the striker's rows, so a wicket
# that fell to the non-striker would be attributed to the striker — confirm
# against the dataset's extras_type values and dismissal columns.
batting_stats = (
    player_stats
    .groupBy('season', 'batter')
    .agg(
        F.sum('batsman_runs').alias('total_runs'),
        F.count('ball').alias('balls_faced'),
        F.count(F.when(F.col('batsman_runs') == 4, 1)).alias('fours'),
        F.count(F.when(F.col('batsman_runs') == 6, 1)).alias('sixes'),
        F.sum('is_wicket').alias('dismissals'),
    )
    # A batter who is never dismissed in a season has dismissals == 0. Guard the
    # division explicitly so the result is NULL both in legacy mode (where x/0
    # is already NULL) and in ANSI mode (where x/0 raises an error).
    .withColumn(
        'average',
        F.when(
            F.col('dismissals') > 0,
            F.round(F.col('total_runs') / F.col('dismissals'), 2),
        ),
    )
    .withColumn('strike_rate', F.round(F.col('total_runs') * 100 / F.col('balls_faced'), 2))
)

print("\n--- Batting Statistics Sample ---")
batting_stats.orderBy(F.desc('total_runs')).show(5)

In [None]:
# Bowling statistics, one row per (season, bowler)
#   wickets             - sum of is_wicket on the bowler's deliveries
#   balls_bowled        - number of delivery rows bowled
#   runs_conceded       - sum of total_runs on those deliveries
#   dot_balls           - deliveries on which no run at all was scored
#   economy             - runs_conceded per six balls_bowled
#   dot_ball_percentage - dot_balls as a percentage of balls_bowled
# NOTE(review): wickets sums every is_wicket flag, and balls_bowled /
# runs_conceded include every delivery row and every kind of run. If the data
# contains run-outs, wides/no-balls or byes/leg-byes, cricket conventions treat
# those differently — confirm against the dataset's dismissal and extras columns.
bowling_stats = (
    player_stats
    .groupBy('season', 'bowler')
    .agg(
        F.sum('is_wicket').alias('wickets'),
        F.count('ball').alias('balls_bowled'),
        F.sum('total_runs').alias('runs_conceded'),
        # A dot ball is a delivery with no runs scored off it. Testing
        # batsman_runs == 0 would also count deliveries that cost runs through
        # extras, so the test is on total_runs instead.
        F.count(F.when(F.col('total_runs') == 0, 1)).alias('dot_balls'),
    )
    .withColumn('economy', F.round(F.col('runs_conceded') * 6 / F.col('balls_bowled'), 2))
    .withColumn('dot_ball_percentage', F.round(F.col('dot_balls') * 100 / F.col('balls_bowled'), 2))
)

print("\n--- Bowling Statistics Sample ---")
bowling_stats.orderBy(F.desc('wickets')).show(5)

## 3. Head-to-Head Analysis

In [None]:
# Batter-vs-bowler matchup table: runs off the bat, deliveries, and wickets for
# each (batter, bowler) pair, plus runs per 100 deliveries as strike_rate.
# NOTE(review): dismissals sums is_wicket on the pair's deliveries; whether
# that includes wickets not credited to the bowler depends on the data — confirm.
matchup_totals = deliveries_df.groupBy('batter', 'bowler').agg(
    F.sum('batsman_runs').alias('runs'),
    F.count('ball').alias('balls'),
    F.sum('is_wicket').alias('dismissals'),
)
runs_per_100_balls = F.col('runs') * 100 / F.col('balls')
head_to_head = matchup_totals.withColumn('strike_rate', F.round(runs_per_100_balls, 2))

# Show the five matchups with the most deliveries
print("\n--- Head to Head Analysis Sample ---")
head_to_head.orderBy(F.col('balls').desc()).show(5)

## 4. Match State Analysis

In [None]:
# Create window for running calculations
# One window per innings, ordered by delivery. The frame is set explicitly to
# ROWS from the start of the partition to the current row: with only an
# orderBy, Spark defaults to a RANGE frame, under which rows that tie on
# (over, ball) all receive the same cumulative values and balls_completed
# would jump by the size of the tie instead of by one.
match_window = (
    Window.partitionBy('match_id', 'inning')
    .orderBy('over', 'ball')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Calculate match state features (all cumulative up to and including the row)
#   current_score   - running sum of total_runs
#   wickets_in_hand - 10 minus the running sum of is_wicket
#   balls_completed - running count of delivery rows
#   run_rate        - current_score per six balls_completed
# NOTE(review): balls_completed counts every delivery row; if the data contains
# wides/no-balls they are counted as balls here — confirm that is intended.
match_state = (
    deliveries_df
    .withColumn('current_score', F.sum('total_runs').over(match_window))
    .withColumn('wickets_in_hand', 10 - F.sum('is_wicket').over(match_window))
    .withColumn('balls_completed', F.count('*').over(match_window))
    .withColumn('run_rate', F.round(F.col('current_score') * 6 / F.col('balls_completed'), 2))
)

print("\n--- Match State Analysis Sample ---")
match_state.select('match_id', 'inning', 'over', 'current_score',
                   'wickets_in_hand', 'run_rate').show(5)

In [None]:
# Stop Spark Session
# Run this last: the DataFrames above were read through `spark`, so they
# presumably cannot be evaluated again once the session is stopped.
spark.stop()