# IPL Data Visualization Analysis

This notebook focuses on creating insightful visualizations from IPL match data.

In [16]:
# Import required libraries
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window
import pyspark.sql.functions as F

# For visualizations
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Start (or reuse) a Spark session running in local mode for this notebook
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("IPL Visualizations")
    .getOrCreate()
)

# Read both CSV inputs: the first row is the header and column types are inferred
csv_options = dict(header=True, inferSchema=True)
deliveries_df = spark.read.csv('data/deliveries.csv', **csv_options)
matches_df = spark.read.csv('data/matches.csv', **csv_options)

## 1. Match Timeline Analysis

In [17]:
# Match whose innings are charted below; change this id to plot a different match.
TIMELINE_MATCH_ID = 1426312

# Runs scored in each over of the selected match, per innings
runs_timeline = (
    deliveries_df
    .filter(F.col('match_id') == TIMELINE_MATCH_ID)
    .groupBy('inning', 'over')
    .agg(F.sum('total_runs').alias('runs_in_over'))
    .orderBy('inning', 'over')
)

# Convert to Pandas for plotting. The frame is re-sorted on the pandas side so
# the running total below never depends on the row order of the collected result.
runs_timeline_pd = (
    runs_timeline.toPandas()
    .sort_values(['inning', 'over'])
    .reset_index(drop=True)
)

# Fail loudly instead of rendering an empty chart when the id is not in the data
assert not runs_timeline_pd.empty, f'No deliveries found for match_id {TIMELINE_MATCH_ID}'

# Running total of runs within each innings
runs_timeline_pd['cumulative_runs'] = runs_timeline_pd.groupby('inning')['runs_in_over'].cumsum()

# One line per innings: cumulative runs against the over
fig = px.line(
    runs_timeline_pd,
    x='over',
    y='cumulative_runs',
    color='inning',
    title='Match Timeline: Cumulative Runs per Over',
    labels={'over': 'Over Number', 'cumulative_runs': 'Cumulative Runs', 'inning': 'Innings'},
)
fig.show()

## 2. Top Performers Analysis

In [18]:
# Six highest run-scorers, with a per-batter delivery count and strike rate.
# NOTE(review): every row with a non-null 'ball' is counted towards balls_faced;
# if the data contains deliveries that should not count as faced (e.g. wides),
# the strike rate is understated — confirm against the dataset schema.
batter_runs_expr = F.sum('batsman_runs').alias('total_runs')
batter_balls_expr = F.count('ball').alias('balls_faced')
batter_strike_rate_expr = F.round(F.col('total_runs') * 100 / F.col('balls_faced'), 2)

top_batsmen = (
    deliveries_df
    .groupBy('batter')
    .agg(batter_runs_expr, batter_balls_expr)
    .withColumn('strike_rate', batter_strike_rate_expr)
    .orderBy(F.col('total_runs').desc())
    .limit(6)
)

# Bar chart of total runs; each bar is annotated with the batter's strike rate
top_batsmen_pd = top_batsmen.toPandas()
batsmen_fig = px.bar(
    top_batsmen_pd,
    x='batter',
    y='total_runs',
    text='strike_rate',
    title='Top 6 Batsmen by Runs',
    labels={'batter': 'Batsman', 'total_runs': 'Total Runs', 'strike_rate': 'Strike Rate'},
)
batsmen_fig.show()

In [19]:
# Five bowlers with the most wicket-flagged deliveries, plus runs per six balls.
# NOTE(review): 'wickets' sums is_wicket for every delivery bowled and 'economy'
# uses total_runs over all counted balls; whether that matches the conventional
# bowling definitions depends on columns not used here — confirm.
bowler_wickets_expr = F.sum('is_wicket').alias('wickets')
bowler_runs_expr = F.sum('total_runs').alias('runs_conceded')
bowler_balls_expr = F.count('ball').alias('balls_bowled')
bowler_economy_expr = F.round(F.col('runs_conceded') * 6 / F.col('balls_bowled'), 2)

top_bowlers = (
    deliveries_df
    .groupBy('bowler')
    .agg(bowler_wickets_expr, bowler_runs_expr, bowler_balls_expr)
    .withColumn('economy', bowler_economy_expr)
    .orderBy(F.col('wickets').desc())
    .limit(5)
)

# Grouped bars: wickets and economy side by side for each bowler (shared y-axis)
top_bowlers_pd = top_bowlers.toPandas()
bowlers_fig = px.bar(
    top_bowlers_pd,
    x='bowler',
    y=['wickets', 'economy'],
    barmode='group',
    title='Top 5 Bowlers by Wickets',
    labels={'bowler': 'Bowler', 'value': 'Count', 'variable': 'Metric'},
)
bowlers_fig.show()

## 3. Toss Effect Analysis

In [20]:
# Flag each match with 1 when the toss winner also won the match, else 0.
# NOTE(review): a match whose 'winner' is null or a placeholder also gets 0,
# i.e. it counts as a loss for the toss winner — confirm this is intended.
toss_analysis = matches_df.withColumn(
    'toss_winner_is_match_winner',
    F.when(F.col('toss_winner') == F.col('winner'), 1).otherwise(0),
)

# Share of matches won by the toss winner, per toss decision.
# Ordered so the bars appear in the same order on every run.
toss_stats = (
    toss_analysis
    .groupBy('toss_decision')
    .agg(
        F.avg('toss_winner_is_match_winner').alias('win_percentage'),
        F.count('*').alias('total_matches'),
    )
    .orderBy('toss_decision')
)

# Collect once: the bar heights and their text labels must come from the same
# rows in the same order, which two separate toPandas() calls do not guarantee.
toss_stats_pd = toss_stats.toPandas()
win_pct_labels = toss_stats_pd['win_percentage'].apply(lambda x: f'{x*100:.1f}%')

# Create visualization
toss_fig = px.bar(
    toss_stats_pd,
    x='toss_decision',
    y='win_percentage',
    title='Win Percentage Based on Toss Decision',
    text=win_pct_labels,
    labels={'toss_decision': 'Toss Decision',
            'win_percentage': 'Win Percentage'},
)
# win_percentage is a 0-1 fraction; show the axis ticks as percentages to match the labels
toss_fig.update_yaxes(tickformat='.0%')
toss_fig.show()

## 4. Season Trend Analysis

In [21]:
# Innings 1 and 2 are the regular innings of a match. Higher innings numbers are
# assumed to be super-over innings (a handful of balls each), which would drag the
# seasonal average down if included — TODO confirm against the dataset.
MAX_REGULAR_INNING = 2

# Total runs of every regular innings, tagged with the season of its match
innings_scores = (
    deliveries_df
    .join(matches_df, deliveries_df.match_id == matches_df.id)
    .where(deliveries_df['inning'] <= MAX_REGULAR_INNING)
    .groupBy('season', 'match_id', 'inning')
    .agg(F.sum('total_runs').alias('innings_total'))
)

# Mean innings total per season, in season order
season_avg = (
    innings_scores
    .groupBy('season')
    .agg(F.avg('innings_total').alias('avg_score'))
    .orderBy('season')
)

# Create trend visualization
season_avg_pd = season_avg.toPandas()
season_fig = px.line(
    season_avg_pd,
    x='season',
    y='avg_score',
    title='Average Innings Score by Season',
    labels={'season': 'Season',
            'avg_score': 'Average Score'},
)
season_fig.show()

In [22]:
# Shut down the Spark session held in `spark`. Run this cell last: the Spark
# DataFrames defined above need a live session to be evaluated.
spark.stop()