# 🏏 IPL Score Prediction - Feature Engineering

This notebook demonstrates feature engineering techniques for IPL score prediction.

**Author:** IPL Score Prediction Team  
**Date:** 2024

## 1. Setup and Imports

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import sys

# Add src to path so the project modules imported below resolve.
# Guarded so that re-running this cell does not keep stacking duplicate
# entries on sys.path. The path is relative to the kernel's working directory.
if '../src' not in sys.path:
    sys.path.append('../src')

# Import custom modules
from data_preprocessing import IPLDataLoader, IPLDataCleaner
from feature_engineering import (
    MatchStateFeatures, RunRateFeatures, PhaseFeatures,
    PlayerFeatures, TeamFeatures, VenueFeatures,
    EncodingFeatures, FeatureEngineer
)

# Settings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

print("‚úÖ Libraries imported successfully!")

## 2. Load Preprocessed Data

In [None]:
# Read the ball-by-ball records through the project loader
loader = IPLDataLoader(data_path='../data/')
raw_deliveries = loader.load_ball_by_ball_data()

# Pass the loaded frame through the project cleaner; ball_df is the cleaned result
cleaner = IPLDataCleaner()
ball_df = cleaner.clean_data(raw_deliveries)

record_count = len(ball_df)
print(f"‚úÖ Data loaded: {record_count:,} records")
ball_df.head()

## 3. Feature Categories Overview

We will create the following feature categories:

1. **Match State Features**: Current score, wickets, balls remaining
2. **Run Rate Features**: Current, required, and phase run rates
3. **Phase Features**: Powerplay, middle overs, death overs indicators
4. **Player Features**: Batsman/bowler statistics
5. **Team Features**: Historical team performance
6. **Venue Features**: Ground-specific statistics
7. **Encoding Features**: Categorical encoding

## 4. Match State Features

In [None]:
# Run the match-state transform on a copy of the cleaned frame
match_state = MatchStateFeatures()
df_match_state = match_state.create_features(ball_df.copy())

# New features = columns of the transformed frame that ball_df does not have
is_new_column = ~df_match_state.columns.isin(ball_df.columns)
match_state_cols = df_match_state.columns[is_new_column].tolist()

print(f"\nüìä Match State Features Created: {len(match_state_cols)}")
print("\nNew columns:")
for feature_name in match_state_cols:
    print(f"  - {feature_name}")

In [None]:
# Visualize match progression for the match_id found in the first row
# NOTE(review): rows are selected by match_id only; if a match_id covers more
# than one innings the lines below will overlay them — confirm against the data.
first_match_id = df_match_state['match_id'].iloc[0]
sample_match = df_match_state[df_match_state['match_id'] == first_match_id]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One entry per panel: (grid position, y column, line format, y label, title).
# Every panel shares the same x axis: balls played.
panels = [
    ((0, 0), 'cumulative_runs', 'b-', 'Cumulative Runs', 'Score Progression'),
    ((0, 1), 'wickets_fallen', 'r-', 'Wickets', 'Wickets Progression'),
    ((1, 0), 'balls_remaining', 'g-', 'Balls Remaining', 'Balls Remaining'),
    ((1, 1), 'wickets_in_hand', 'm-', 'Wickets in Hand', 'Wickets in Hand'),
]
for grid_pos, y_column, line_format, y_label, panel_title in panels:
    panel = axes[grid_pos]
    panel.plot(sample_match['balls_played'], sample_match[y_column], line_format, linewidth=2)
    panel.set_xlabel('Balls Played')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## 5. Run Rate Features

In [None]:
# Layer the run-rate transform on top of the match-state frame (working on a copy)
run_rate = RunRateFeatures()
df_run_rate = run_rate.create_features(df_match_state.copy())

# Report the columns this step introduced, in frame order
previous_columns = set(df_match_state.columns)
run_rate_cols = [name for name in df_run_rate.columns if name not in previous_columns]

print(f"\nüìä Run Rate Features Created: {len(run_rate_cols)}")
print("\nNew columns:")
if run_rate_cols:
    print("\n".join(f"  - {name}" for name in run_rate_cols))

In [None]:
# Visualize run rates: left panel follows one match, right panel averages
# the phase run rates over all matches.
sample_match = df_run_rate[df_run_rate['match_id'] == df_run_rate['match_id'].iloc[0]]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Current run rate progression (the rolling series is optional)
axes[0].plot(sample_match['over'], sample_match['current_run_rate'], 'b-', linewidth=2, label='Current RR')
if 'run_rate_last_5_overs' in sample_match.columns:
    axes[0].plot(sample_match['over'], sample_match['run_rate_last_5_overs'], 'r--', linewidth=2, label='Last 5 Overs RR')
axes[0].set_xlabel('Over')
axes[0].set_ylabel('Run Rate')
axes[0].set_title('Run Rate Progression')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Phase-wise run rates.
# Each phase column is checked individually: previously only
# 'powerplay_run_rate' was tested, so a frame lacking one of the other two
# columns raised a KeyError inside the groupby selection.
phase_rr_colors = {
    'powerplay_run_rate': 'green',
    'middle_run_rate': 'blue',
    'death_run_rate': 'red',
}
phase_rr_cols = [name for name in phase_rr_colors if name in df_run_rate.columns]
if phase_rr_cols:
    # Last recorded value of each phase rate per match, then averaged over matches
    phase_rr = df_run_rate.groupby('match_id')[phase_rr_cols].last()
    phase_rr.mean().plot(
        kind='bar',
        ax=axes[1],
        color=[phase_rr_colors[name] for name in phase_rr_cols],
        edgecolor='black',
    )
    axes[1].set_ylabel('Average Run Rate')
    axes[1].set_title('Phase-wise Run Rates')
    axes[1].tick_params(axis='x', rotation=45)
else:
    # Nothing to draw: hide the panel rather than show an empty frame
    axes[1].set_visible(False)

plt.tight_layout()
plt.show()

## 6. Phase Features

In [None]:
# Apply the phase transform to a copy of the run-rate frame
phase = PhaseFeatures()
df_phase = phase.create_features(df_run_rate.copy())

# Keep only the column labels that were not already in df_run_rate
added_mask = ~df_phase.columns.isin(df_run_rate.columns)
phase_cols = df_phase.columns[added_mask].tolist()

print(f"\nüìä Phase Features Created: {len(phase_cols)}")
print("\nNew columns:")
for feature_name in phase_cols:
    print(f"  - {feature_name}")

In [None]:
# Visualize how deliveries and scoring split across the three phases
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Indicator columns and their display labels, kept in matching order
phase_flags = ['is_powerplay', 'is_middle_overs', 'is_death_overs']
phase_labels = ['Powerplay', 'Middle Overs', 'Death Overs']
colors = ['#10b981', '#3b82f6', '#ef4444']

# Left: share of deliveries per phase (sum of each indicator column)
phase_counts = df_phase[phase_flags].sum()
phase_counts.index = phase_labels
axes[0].pie(phase_counts, labels=phase_counts.index, autopct='%1.1f%%', colors=colors)
axes[0].set_title('Distribution of Deliveries by Phase')

# Right: mean runs_off_bat over the rows where each indicator equals 1
avg_runs_by_phase = [
    df_phase.loc[df_phase[flag] == 1, 'runs_off_bat'].mean()
    for flag in phase_flags
]
phase_runs = pd.DataFrame({'Phase': phase_labels, 'Avg Runs': avg_runs_by_phase})
axes[1].bar(phase_runs['Phase'], phase_runs['Avg Runs'], color=colors, edgecolor='black')
axes[1].set_ylabel('Average Runs per Ball')
axes[1].set_title('Average Runs by Phase')

plt.tight_layout()
plt.show()

## 7. Team Features

In [None]:
# Apply the team transform to a copy of the phase frame
team = TeamFeatures()
df_team = team.create_features(df_phase.copy())

# Columns that exist after this step but not before it, in frame order
known_columns = set(df_phase.columns)
team_cols = [name for name in df_team.columns if name not in known_columns]

print(f"\nüìä Team Features Created: {len(team_cols)}")
print("\nNew columns:")
if team_cols:
    print("\n".join(f"  - {name}" for name in team_cols))

In [None]:
# Horizontal bar chart of team_avg_score per batting team (skipped if the column is absent)
if 'team_avg_score' in df_team.columns:
    # First team_avg_score value seen for each batting team, sorted ascending
    per_team_score = df_team.groupby('batting_team')['team_avg_score'].first()
    team_stats = per_team_score.sort_values(ascending=True)
    overall_avg = team_stats.mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    team_stats.plot(kind='barh', ax=ax, color='steelblue', edgecolor='black')
    ax.set_xlabel('Average Score')
    ax.set_ylabel('Team')
    ax.set_title('Team Average Scores')
    # Dashed reference line at the mean of the plotted per-team values
    ax.axvline(overall_avg, color='red', linestyle='--', label=f'Overall Avg: {overall_avg:.1f}')
    ax.legend()
    plt.tight_layout()
    plt.show()

## 8. Venue Features

In [None]:
# Apply the venue transform to a copy of the team frame
venue = VenueFeatures()
df_venue = venue.create_features(df_team.copy())

# Labels present in df_venue that df_team did not have
added_mask = ~df_venue.columns.isin(df_team.columns)
venue_cols = df_venue.columns[added_mask].tolist()

print(f"\nüìä Venue Features Created: {len(venue_cols)}")
print("\nNew columns:")
for feature_name in venue_cols:
    print(f"  - {feature_name}")

In [None]:
# Horizontal bar chart of venue_avg_score per venue (skipped if the column is absent)
if 'venue_avg_score' in df_venue.columns:
    # First venue_avg_score value seen for each venue, sorted ascending
    per_venue_score = df_venue.groupby('venue')['venue_avg_score'].first()
    venue_stats = per_venue_score.sort_values(ascending=True)
    overall_avg = venue_stats.mean()

    fig, ax = plt.subplots(figsize=(12, 8))
    # One colour per bar, sampled evenly from the 0.2-0.8 span of RdYlGn
    gradient_positions = np.linspace(0.2, 0.8, len(venue_stats))
    colors = plt.cm.RdYlGn(gradient_positions)
    venue_stats.plot(kind='barh', ax=ax, color=colors, edgecolor='black')
    ax.set_xlabel('Average Score')
    ax.set_ylabel('Venue')
    ax.set_title('Venue Average Scores')
    # Dashed reference line at the mean of the plotted per-venue values
    ax.axvline(overall_avg, color='red', linestyle='--', label=f'Overall Avg: {overall_avg:.1f}')
    ax.legend()
    plt.tight_layout()
    plt.show()

## 9. Feature Encoding

In [None]:
# Apply the encoding transform to a copy of the venue frame
encoding = EncodingFeatures()
df_encoded = encoding.create_features(df_venue.copy())

# Columns introduced by the encoding step, in frame order
known_columns = set(df_venue.columns)
encoded_cols = [name for name in df_encoded.columns if name not in known_columns]

print(f"\nüìä Encoded Features Created: {len(encoded_cols)}")
print("\nEncoded columns (sample):")

# Show at most ten names, then a count of whatever was left out
preview = encoded_cols[:10]
hidden_count = len(encoded_cols) - len(preview)
for name in preview:
    print(f"  - {name}")
if hidden_count > 0:
    print(f"  ... and {hidden_count} more")

## 10. Complete Feature Engineering Pipeline

In [None]:
# Run the whole pipeline in one call through FeatureEngineer
feature_engineer = FeatureEngineer()

# fit_transform receives a copy of the cleaned frame; its result is df_final
df_final = feature_engineer.fit_transform(ball_df.copy())

# Column counts before and after, and their difference
n_original = len(ball_df.columns)
n_final = len(df_final.columns)

print("\nüìä Feature Engineering Complete!")
print(f"   Original features: {n_original}")
print(f"   Final features: {n_final}")
print(f"   New features added: {n_final - n_original}")

In [None]:
# List all engineered features by category
print("\nüìã All Features by Category:")
print("="*60)

original_cols = list(ball_df.columns)
# Set lookup instead of scanning the list once per column
original_lookup = set(original_cols)
new_cols = [col for col in df_final.columns if col not in original_lookup]

# Category -> substrings matched against the lower-cased column name.
# The keyword lists overlap on purpose or by accident (e.g. 'venue_' is in both
# 'Venue' and 'Encoded'), so one column can be counted in several categories.
categories = {
    'Match State': ['cumulative', 'wickets', 'balls', 'partnership'],
    'Run Rate': ['run_rate', '_rr_'],
    'Phase': ['powerplay', 'middle', 'death', 'phase'],
    'Team': ['team_'],
    'Venue': ['venue_'],
    'Encoded': ['_encoded', 'batting_team_', 'bowling_team_', 'venue_']
}


def _print_feature_group(title, cols, limit=5):
    """Print a category header, the first `limit` column names, and a remainder count."""
    print(f"\n{title} Features ({len(cols)}):")
    for col in cols[:limit]:
        print(f"  - {col}")
    if len(cols) > limit:
        print(f"  ... and {len(cols) - limit} more")


categorized = set()
for category, keywords in categories.items():
    # str() so that non-string column labels do not break the keyword match
    category_cols = [col for col in new_cols if any(kw in str(col).lower() for kw in keywords)]
    categorized.update(category_cols)
    _print_feature_group(category, category_cols)

# Columns matching no keyword used to be dropped from the listing without notice;
# report them so that the listing really covers all new features.
uncategorized_cols = [col for col in new_cols if col not in categorized]
if uncategorized_cols:
    _print_feature_group('Uncategorized', uncategorized_cols)

## 11. Feature Importance Analysis (Correlation with Target)

In [None]:
# Calculate correlation with target
target_col = 'total_runs'

# Select only numerical columns
numerical_cols = df_final.select_dtypes(include=[np.number]).columns.tolist()

# Correlate each numeric column with the target directly. Building the full
# N x N matrix with .corr() just to read one column does quadratic work in the
# number of features. The target's own entry is kept in `correlations`.
correlations = (
    df_final[numerical_cols]
    .corrwith(df_final[target_col])
    .abs()
    .sort_values(ascending=False)
)

# Display top correlations
print("\nüìä Top 20 Features Correlated with Target (total_runs):")
print("="*50)
# Drop the target by label, not by position: a feature tied at |r| = 1.0 may
# sort ahead of the target, in which case slicing off row 0 removes the wrong entry.
top_correlations = correlations.drop(labels=target_col, errors='ignore').head(20)
for feature, corr in top_correlations.items():
    print(f"  {feature}: {corr:.4f}")

In [None]:
# Visualize the 20 features with the highest absolute correlation to the target
fig, ax = plt.subplots(figsize=(12, 8))

# Remove the target's self-correlation by label; a positional [1:] slice drops
# the wrong row whenever the target is not sorted first (e.g. a tie at 1.0).
top_20 = correlations.drop(labels=target_col, errors='ignore').head(20)
colors = plt.cm.coolwarm(np.linspace(0.8, 0.2, len(top_20)))

top_20.plot(kind='barh', ax=ax, color=colors, edgecolor='black')
# barh draws the first row at the bottom; flip so the strongest feature is on top
ax.invert_yaxis()
ax.set_xlabel('Absolute Correlation')
ax.set_ylabel('Feature')
ax.set_title('Top 20 Features by Correlation with Target')

plt.tight_layout()
plt.show()

## 12. Feature Selection

In [None]:
# Ask the fitted feature engineer which columns to use for model training
feature_cols = feature_engineer.get_feature_columns()

n_selected = len(feature_cols)
print(f"\nüìä Selected Features for Model: {n_selected}")
print("\nFeatures:")
# Numbered listing, starting at 1, with the index right-aligned to two characters
for position, feature_name in enumerate(feature_cols, start=1):
    print(f"  {position:2d}. {feature_name}")

In [None]:
# Build the model inputs: selected feature columns, with any remaining
# missing values replaced by 0
X = df_final.loc[:, feature_cols].fillna(0)

# Target vector, copied so later edits to y do not touch df_final
y = df_final['total_runs'].copy()

n_samples, n_features = X.shape

print("\nüìä Final Dataset Shape:")
print(f"   X: {X.shape}")
print(f"   y: {y.shape}")
print(f"\n   Features: {n_features}")
print(f"   Samples: {n_samples:,}")

## 13. Save Engineered Features

In [None]:
import joblib

# Destination files; the parent directories are created on demand below
engineer_path = '../models/feature_engineer.pkl'
features_path = '../data/processed/features_engineered.csv'

# Persist the fitted feature engineer
os.makedirs(os.path.dirname(engineer_path), exist_ok=True)
joblib.dump(feature_engineer, engineer_path)

# Persist the engineered frame without the index column
os.makedirs(os.path.dirname(features_path), exist_ok=True)
df_final.to_csv(features_path, index=False)

for message in (
    "\n‚úÖ Feature engineering artifacts saved!",
    "\n   - Feature engineer: models/feature_engineer.pkl",
    "   - Processed data: data/processed/features_engineered.csv",
):
    print(message)

## 14. Summary

In [None]:
# Closing report: dataset size, feature categories, and target statistics
banner = "="*60

print(banner)
print("üìä FEATURE ENGINEERING SUMMARY")
print(banner)

print("\nüìÅ Dataset:")
print(f"   - Original features: {len(ball_df.columns)}")
print(f"   - Engineered features: {len(df_final.columns)}")
print(f"   - Total samples: {len(df_final):,}")

print("\nüìà Feature Categories:")
category_notes = (
    "   - Match State: Score progression, wickets, balls",
    "   - Run Rate: Current RR, phase RR, momentum",
    "   - Phase: Powerplay, middle, death indicators",
    "   - Team: Historical team performance",
    "   - Venue: Ground-specific statistics",
    "   - Encoded: Categorical encodings",
)
for note in category_notes:
    print(note)

# Mean and standard deviation of the target vector y built earlier
print("\nüéØ Target Variable:")
print("   - Name: total_runs (cumulative innings score)")
print(f"   - Mean: {y.mean():.2f}")
print(f"   - Std: {y.std():.2f}")

print("\n" + banner)
print("‚úÖ Feature engineering complete! Ready for model training.")
print(banner)