# IPL Machine Learning Analysis

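This notebook builds two predictive models with scikit-learn: a regressor for the final score and a classifier for the match winner. PySpark is used to load the IPL CSV data and to engineer the score-prediction features.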

In [None]:
# Import required libraries
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window
import pyspark.sql.functions as F

# For ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

# Initialize Spark Session (local master, app name "IPL ML Analysis")
spark = (
    SparkSession.builder
    .appName("IPL ML Analysis")
    .master("local[*]")
    .getOrCreate()
)

# Load the data: both CSVs are read with a header row and inferred column types
deliveries_df = spark.read.options(header=True, inferSchema=True).csv('data/deliveries.csv')
matches_df = spark.read.options(header=True, inferSchema=True).csv('data/matches.csv')

## 1. Feature Engineering

In [None]:
# Create features for score prediction
def create_features_first_innings(df_deliveries, df_matches, inning=1, powerplay_last_over=6):
    """Build per-match powerplay features and the innings total for score prediction.

    Only deliveries whose ``match_id`` appears as an ``id`` in ``df_matches``
    are used. The deliveries are restricted to a single innings so that
    ``final_score`` is that innings' total rather than the sum of both innings.

    Args:
        df_deliveries: Spark DataFrame of ball-by-ball data. Columns read here:
            ``match_id``, ``over``, ``total_runs``, ``is_wicket``,
            ``extras_type`` and, when present, ``inning``.
        df_matches: Spark DataFrame of matches; only its ``id`` column is used.
        inning: Innings number to keep (default 1). Pass ``None`` to aggregate
            over every delivery of the match. The filter is skipped when
            ``df_deliveries`` has no ``inning`` column.
        powerplay_last_over: Largest value of ``over`` that still counts as
            powerplay (default 6, i.e. ``over <= 6``).

    Returns:
        Spark DataFrame with one row per match and the columns ``match_id``,
        ``powerplay_runs``, ``powerplay_wickets``, ``powerplay_extras`` and
        ``final_score``.
    """
    # Keep a single innings; filtering before the join keeps the column lookup simple.
    # NOTE(review): assumes the innings column is called 'inning' — confirm against the CSV.
    if inning is not None and 'inning' in df_deliveries.columns:
        df_deliveries = df_deliveries.filter(F.col('inning') == inning)

    # A semi join keeps only deliveries of known matches without pulling in the
    # (unused) match columns and without duplicating rows if a match id repeats.
    deliveries = df_deliveries.join(
        df_matches, df_deliveries.match_id == df_matches.id, 'left_semi')

    # Powerplay aggregates.
    # NOTE(review): `over <= 6` is six overs only if overs are numbered from 1;
    # if the data numbers them from 0 this covers seven overs — confirm and
    # pass powerplay_last_over=5 in that case.
    powerplay_stats = (
        deliveries
        .filter(F.col('over') <= powerplay_last_over)
        .groupBy('match_id')
        .agg(
            F.sum('total_runs').alias('powerplay_runs'),
            F.sum('is_wicket').alias('powerplay_wickets'),
            # Number of deliveries with a non-null extras_type (a count of
            # deliveries, not of extra runs).
            F.count(F.when(F.col('extras_type').isNotNull(), 1)).alias('powerplay_extras'),
        )
    )

    # Total runs of the selected innings (the regression target).
    final_scores = deliveries.groupBy('match_id') \
        .agg(F.sum('total_runs').alias('final_score'))

    # Inner join on match_id: matches without any powerplay delivery are dropped.
    feature_df = powerplay_stats.join(final_scores, 'match_id')

    return feature_df

# Create features for match winner prediction
def create_features_match_winner(df_matches):
    """Add recent-form features for both teams to the matches table.

    The ``winner`` column holds team names, so a rolling mean cannot be taken
    over it directly. Instead a 0/1 "this team won" flag is built per side and
    averaged over that team's previous three matches in the same column
    (``team1`` or ``team2``).

    Args:
        df_matches: Object exposing ``toPandas()`` (a Spark DataFrame) whose
            pandas form has the columns ``team1``, ``team2`` and ``winner``.

    Returns:
        pandas DataFrame with the original columns plus ``winner_numeric``
        (1 when team1 won), ``team1_last3`` and ``team2_last3``. The form
        columns are NaN until a team has three earlier matches in that column.
    """
    matches_pd = df_matches.toPandas()

    def _last3_win_rate(team_col):
        # 1 when the side listed in `team_col` won; matches with a missing or
        # different winner count as 0.
        won = (matches_pd['winner'] == matches_pd[team_col]).astype(int)
        # shift() excludes the current match so the feature only uses the past.
        # NOTE(review): relies on row order being chronological — confirm.
        return won.groupby(matches_pd[team_col]).transform(
            lambda s: s.shift().rolling(3).mean())

    matches_pd['winner_numeric'] = (matches_pd['winner'] == matches_pd['team1']).astype(int)
    matches_pd['team1_last3'] = _last3_win_rate('team1')
    matches_pd['team2_last3'] = _last3_win_rate('team2')

    return matches_pd

# Create feature datasets
score_features = create_features_first_innings(deliveries_df, matches_df)

# Team form is computed inline on numeric win flags, because a rolling mean
# cannot be taken over the team-name strings stored in 'winner'.
# NOTE(review): shift()/rolling() follow row order, so this assumes the matches
# are already in chronological order — confirm, or sort by match date first.
matches_pd = matches_df.toPandas()
matches_pd['winner_numeric'] = (matches_pd['winner'] == matches_pd['team1']).astype(int)
# team2's form must be based on team2's own wins; reusing 'winner_numeric'
# (team1 won) here would measure how often team2's opponents won instead.
team2_won = (matches_pd['winner'] == matches_pd['team2']).astype(int)

# Win rate over the previous three matches; shift() excludes the current match.
matches_pd['team1_last3'] = matches_pd.groupby('team1')['winner_numeric'].transform(
    lambda x: x.shift().rolling(3).mean())
matches_pd['team2_last3'] = team2_won.groupby(matches_pd['team2']).transform(
    lambda x: x.shift().rolling(3).mean())

winner_features = matches_pd

print("\n--- Score Prediction Features Sample ---")
score_features.show(5)

print("\n--- Match Winner Features Sample ---")
print(winner_features.head())

## 2. Final Score Prediction Model

In [None]:
# Prepare data for score prediction
# Spark gives no row-order guarantee after groupBy/join, so fix the order by
# match_id; otherwise the seeded split below could select different rows per run.
score_data = (
    score_features.toPandas()
    .sort_values('match_id')
    .reset_index(drop=True)
)

X = score_data[['powerplay_runs', 'powerplay_wickets', 'powerplay_extras']]
y = score_data['final_score']

# Split the data (80/20, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: the scaler is fitted on the training split only and then
# applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
score_model = RandomForestRegressor(n_estimators=100, random_state=42)
score_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = score_model.predict(X_test_scaled)

# Evaluate model (RMSE is in the same unit as the target, i.e. runs)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Square Error: {rmse}')

# Feature importance, in the column order of X
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': score_model.feature_importances_
})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))

## 3. Match Winner Prediction Model

In [None]:
# Prepare data for winner prediction
winner_feature_cols = ['team1_last3', 'team2_last3', 'toss_winner']

# Drop only rows that miss a model input or the label. A bare dropna() would
# also discard matches that merely have a NaN in some column the model never uses.
winner_data = winner_features.dropna(subset=winner_feature_cols + ['winner'])

X = winner_data[winner_feature_cols]
y = winner_data['winner']

# Convert categorical variables (one indicator column per toss winner)
X = pd.get_dummies(X, columns=['toss_winner'])

# Split the data (80/20, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
winner_model = RandomForestClassifier(n_estimators=100, random_state=42)
winner_model.fit(X_train, y_train)

# Make predictions
y_pred = winner_model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Stop the Spark session created at the top of the notebook.
# Run this cell last, after all Spark work above has finished.
spark.stop()