# 🏏 IPL Score Prediction - Data Exploration

This notebook explores the IPL dataset and prepares it for deep learning models.

**Author:** IPL Score Prediction Team  
**Date:** 2024

## 1. Setup and Imports

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import sys

# Add src to path
sys.path.append('../src')

# Settings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

# Display settings
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

print("‚úÖ Libraries imported successfully!")

## 2. Load Data

In [None]:
# IPLDataLoader comes from the project module data_preprocessing, which is
# expected to be reachable through the '../src' entry added to sys.path earlier.
from data_preprocessing import IPLDataLoader

# A single loader, pointed at the shared data directory, supplies both tables.
# The ball-by-ball table is requested first, then the match table.
loader = IPLDataLoader(data_path='../data/')
ball_df, match_df = loader.load_ball_by_ball_data(), loader.load_match_data()

# Report the .shape of each loaded table
shape_report = (
    f"\nüìä Ball-by-ball data shape: {ball_df.shape}",
    f"üìä Match data shape: {match_df.shape}",
)
for report_line in shape_report:
    print(report_line)

## 3. Data Overview

In [None]:
# Structural summary of the ball-by-ball table: heading, a 50-character rule,
# then whatever DataFrame.info() reports.
print("\nüìã Ball-by-Ball Data Info:", "=" * 50, sep="\n")
ball_df.info()

In [None]:
# Preview the top of the ball-by-ball table. The bare last expression is what
# the notebook renders as the cell output.
SAMPLE_ROWS = 10
print("\nüìã Ball-by-Ball Sample:")
ball_df.head(n=SAMPLE_ROWS)

In [None]:
# Structural summary of the match table: heading, a 50-character rule,
# then whatever DataFrame.info() reports.
print("\nüìã Match Data Info:", "=" * 50, sep="\n")
match_df.info()

In [None]:
# Preview the top of the match table (five rows, the pandas default for head()).
print("\nüìã Match Data Sample:")
match_df.head(n=5)

## 4. Statistical Summary

In [None]:
# Descriptive statistics for the ball-by-ball table, rendered as the cell output.
print("\nüìä Ball-by-Ball Statistical Summary:")
ball_summary = ball_df.describe()
ball_summary

In [None]:
# Descriptive statistics for the match table, rendered as the cell output.
print("\nüìä Match Statistical Summary:")
match_summary = match_df.describe()
match_summary

## 5. Missing Values Analysis

In [None]:
# Null tally per column of the ball-by-ball table, both as an absolute count
# and as a percentage of the table's row count.
print("\nüîç Missing Values - Ball-by-Ball Data:")
row_count = len(ball_df)
missing_ball = ball_df.isnull().sum()
missing_pct = missing_ball.div(row_count).mul(100)

missing_df = pd.DataFrame({
    'Missing Count': missing_ball,
    'Missing %': missing_pct
})

# Show only the columns that actually contain nulls
has_missing = missing_df['Missing Count'].gt(0)
missing_df.loc[has_missing]

In [None]:
# Bar chart of missing-value percentages, largest first, restricted to columns
# with at least some nulls. When no column qualifies, the axes carry a text
# message instead of bars.
missing_pct_sorted = (
    missing_pct
    .sort_values(ascending=False)
    .loc[lambda pct: pct > 0]
)

fig, ax = plt.subplots(figsize=(12, 6))

if missing_pct_sorted.empty:
    ax.text(0.5, 0.5, 'No missing values found!', ha='center', va='center', fontsize=14)
    ax.set_title('Missing Values Analysis')
else:
    missing_pct_sorted.plot(kind='bar', ax=ax, color='coral')
    ax.set_ylabel('Missing Percentage (%)')
    ax.set_title('Missing Values by Column')
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 6. Team Analysis

In [None]:
# Distinct values of the batting_team column, counted and then listed in sorted order
teams = ball_df['batting_team'].unique()
team_count = len(teams)
print(f"\nüèè Number of Teams: {team_count}")
print("\nTeams:")
for team_name in sorted(teams):
    print("  -", team_name)

In [None]:
# Team performance - runs scored, matches played and wickets lost per batting team.
#
# Runs are totalled from the per-delivery components runs_off_bat + extras
# (the same two columns this notebook sums per over further down) rather than
# by summing `total_runs`. Summing `total_runs` is only correct if that column
# holds per-delivery runs; if it holds a running innings score the sum is
# heavily inflated. The component sum is correct under either meaning.
# NOTE(review): confirm whether `total_runs` is per-delivery or cumulative.
team_agg = ball_df.groupby('batting_team').agg({
    'runs_off_bat': 'sum',
    'extras': 'sum',
    'match_id': 'nunique',
    'is_wicket': 'sum'
}).reset_index()

# Same column names and order as before, so downstream cells keep working
team_runs = pd.DataFrame({
    'Team': team_agg['batting_team'],
    'Total Runs': team_agg['runs_off_bat'] + team_agg['extras'],
    'Matches': team_agg['match_id'],
    'Wickets Lost': team_agg['is_wicket'],
})
team_runs['Avg Score per Match'] = (team_runs['Total Runs'] / team_runs['Matches']).round(1)

team_runs.sort_values('Avg Score per Match', ascending=False)

In [None]:
# Two horizontal bar panels sharing one team ordering (ascending average score):
# average score per match on the left, matches played on the right.
team_runs_sorted = team_runs.sort_values('Avg Score per Match', ascending=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to plot, bar colour, x-axis label, panel title) for each panel
panel_specs = [
    ('Avg Score per Match', 'steelblue', 'Average Score per Match', 'Team Average Scores'),
    ('Matches', 'coral', 'Number of Matches', 'Matches Played per Team'),
]
for panel_ax, (value_col, bar_color, x_label, panel_title) in zip(axes, panel_specs):
    panel_ax.barh(team_runs_sorted['Team'], team_runs_sorted[value_col], color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 7. Venue Analysis

In [None]:
# Distinct values of the venue column, counted and then listed in sorted order
venues = ball_df['venue'].unique()
venue_count = len(venues)
print(f"\nüèüÔ∏è Number of Venues: {venue_count}")
print("\nVenues:")
for venue_name in sorted(venues):
    print("  -", venue_name)

In [None]:
# Venue statistics
#
# Step 1 - one row per (match, innings, venue) holding that innings' total.
# The total is the sum of the per-delivery components runs_off_bat + extras.
# Taking the max of `total_runs` instead yields an innings total only if that
# column is a running score; if it is per-delivery, the max is merely the
# largest single delivery. The component sum is correct under either meaning.
# The result column keeps the name 'total_runs' because later cells read it.
# NOTE(review): confirm whether `total_runs` is per-delivery or cumulative.
innings_totals = (
    ball_df
    .groupby(['match_id', 'innings', 'venue'])[['runs_off_bat', 'extras']]
    .sum()
    .sum(axis=1)
    .rename('total_runs')
    .reset_index()
)

# Step 2 - spread of innings totals per venue. innings_totals has one row per
# innings, so counting rows gives the number of innings. The earlier `nunique`
# on match_id counted matches while the column was labelled 'Innings'.
venue_stats = innings_totals.groupby('venue').agg({
    'total_runs': ['mean', 'max', 'min', 'std'],
    'match_id': 'count'
}).round(1)

venue_stats.columns = ['Avg Score', 'Max Score', 'Min Score', 'Score Std', 'Innings']
venue_stats.sort_values('Avg Score', ascending=False)

In [None]:
# Horizontal bars of the per-venue average score, lowest at the bottom, coloured
# along a slice of the RdYlGn colormap. The dashed line marks the plain mean of
# the per-venue averages (each venue counts once, regardless of innings played).
venue_avg = venue_stats['Avg Score'].sort_values(ascending=True)
venue_total = len(venue_avg)
bar_slots = range(venue_total)
mean_of_venue_avgs = venue_avg.mean()

fig, ax = plt.subplots(figsize=(12, 6))

colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, venue_total))
ax.barh(bar_slots, venue_avg.values, color=colors)
ax.set_yticks(bar_slots)
ax.set_yticklabels(venue_avg.index)
ax.set_xlabel('Average Score')
ax.set_title('Average Innings Score by Venue')
ax.axvline(mean_of_venue_avgs, color='red', linestyle='--', label=f'Overall Avg: {mean_of_venue_avgs:.1f}')
ax.legend()

plt.tight_layout()
plt.show()

## 8. Score Distribution Analysis

In [None]:
# Take the per-innings totals computed earlier and print their headline statistics.
final_scores = innings_totals['total_runs']

# Mean, median and standard deviation go to one decimal; min and max are printed as stored.
score_lines = [
    f"   Mean: {final_scores.mean():.1f}",
    f"   Median: {final_scores.median():.1f}",
    f"   Std Dev: {final_scores.std():.1f}",
    f"   Min: {final_scores.min()}",
    f"   Max: {final_scores.max()}",
]
print("\nüìä Score Statistics:")
print("\n".join(score_lines))

In [None]:
# Distribution of innings totals: a 30-bin histogram with mean and median
# marked (left) and a vertical box plot of the same values (right).
score_mean = final_scores.mean()
score_median = final_scores.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Histogram with central-tendency markers
hist_ax.hist(final_scores, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
hist_ax.axvline(score_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {score_mean:.1f}')
hist_ax.axvline(score_median, color='green', linestyle='--', linewidth=2, label=f'Median: {score_median:.1f}')
hist_ax.set_xlabel('Final Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Innings Scores')
hist_ax.legend()

# Box plot
box_ax.boxplot(final_scores, vert=True)
box_ax.set_ylabel('Score')
box_ax.set_title('Score Box Plot')

plt.tight_layout()
plt.show()

## 9. Runs Per Over Analysis

In [None]:
# Per-over aggregates across the whole ball-by-ball table.
#   total_runs  = runs_off_bat + extras (summed over the over number)
#   deliveries  = number of rows recorded for that over number
#   avg_runs    = runs per delivery scaled by 6
#   wicket_rate = wickets per delivery, as a percentage
by_over = ball_df.groupby('over')

runs_per_over = (
    by_over
    .agg({'runs_off_bat': 'sum', 'extras': 'sum', 'is_wicket': 'sum'})
    .reset_index()
    .assign(
        total_runs=lambda t: t['runs_off_bat'] + t['extras'],
        # group sizes come back in the same sorted key order as the aggregate above
        deliveries=by_over.size().values,
        avg_runs=lambda t: (t['total_runs'] / t['deliveries']) * 6,
        wicket_rate=lambda t: t['is_wicket'] / t['deliveries'] * 100,
    )
)

runs_per_over

In [None]:
# Two bar charts keyed by over number: scoring rate coloured by phase (left)
# and wicket rate (right).
def _phase_color(over_no):
    """Return the bar colour for an over: green up to 6, blue up to 15, red beyond."""
    if over_no <= 6:
        return '#10b981'
    if over_no <= 15:
        return '#3b82f6'
    return '#ef4444'


over_axis = runs_per_over['over']
mean_avg_runs = runs_per_over['avg_runs'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rate_ax, wicket_ax = axes

# Average runs per over, with the mean across overs as a dashed reference line
colors = [_phase_color(over_no) for over_no in over_axis]
rate_ax.bar(over_axis, runs_per_over['avg_runs'], color=colors, edgecolor='black')
rate_ax.axhline(mean_avg_runs, color='red', linestyle='--', label=f'Avg: {mean_avg_runs:.2f}')
rate_ax.set_xlabel('Over')
rate_ax.set_ylabel('Average Runs')
rate_ax.set_title('Average Runs per Over\n(Green: Powerplay, Blue: Middle, Red: Death)')
rate_ax.legend()

# Wicket rate per over
wicket_ax.bar(over_axis, runs_per_over['wicket_rate'], color='coral', edgecolor='black')
wicket_ax.set_xlabel('Over')
wicket_ax.set_ylabel('Wicket Rate (%)')
wicket_ax.set_title('Wicket Probability per Over')

plt.tight_layout()
plt.show()

## 10. Correlation Analysis

In [None]:
# Pairwise correlations between the candidate numeric columns. Any candidate
# that is absent from ball_df is skipped rather than raising.
numerical_cols = ['over', 'ball', 'runs_off_bat', 'extras', 'total_runs', 'wickets_fallen', 'is_wicket']
available_cols = [name for name in numerical_cols if name in ball_df.columns]

corr_matrix = ball_df[available_cols].corr()

# Hide the upper triangle (diagonal included) so each pair is drawn once
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 11. Key Insights Summary

In [None]:
# Text recap of the figures computed in the cells above. Phase averages are
# the mean of avg_runs over the over-number bands <= 6, 7..15 and > 15.
over_no = runs_per_over['over']
avg_by_over = runs_per_over['avg_runs']
powerplay_avg = avg_by_over[over_no <= 6].mean()
middle_avg = avg_by_over[(over_no > 6) & (over_no <= 15)].mean()
death_avg = avg_by_over[over_no > 15].mean()

rule = "=" * 60
summary_lines = [
    rule,
    "üìä DATA EXPLORATION SUMMARY",
    rule,
    "\nüìÅ Dataset Size:",
    f"   - Ball-by-ball records: {len(ball_df):,}",
    f"   - Match records: {len(match_df):,}",
    f"   - Unique matches: {ball_df['match_id'].nunique():,}",
    "\nüèè Teams:",
    f"   - Number of teams: {len(teams)}",
    "\nüèüÔ∏è Venues:",
    f"   - Number of venues: {len(venues)}",
    "\nüìà Score Statistics:",
    f"   - Average innings score: {final_scores.mean():.1f}",
    f"   - Highest score: {final_scores.max()}",
    f"   - Lowest score: {final_scores.min()}",
    "\n‚ö° Phase Analysis:",
    f"   - Powerplay (1-6): {powerplay_avg:.2f} runs/over",
    f"   - Middle (7-15): {middle_avg:.2f} runs/over",
    f"   - Death (16-20): {death_avg:.2f} runs/over",
    "\n" + rule,
    "‚úÖ Data exploration complete! Ready for feature engineering.",
    rule,
]
print("\n".join(summary_lines))

## 12. Save Processed Data

In [None]:
# Hand off to the project's preprocessing entry point.
# IPLDataCleaner is imported alongside it but is not referenced in this cell.
from data_preprocessing import IPLDataCleaner, preprocess_ipl_data

# Same data directory as the loader; save_processed=True is passed through
# to the pipeline call.
pipeline_options = {'data_path': '../data/', 'save_processed': True}
processed_data = preprocess_ipl_data(**pipeline_options)

print(
    "\n‚úÖ Data preprocessing complete!",
    "\nProcessed files saved to: data/processed/",
    sep="\n",
)