# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

# Read the match-level table and the ball-by-ball table
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

# Keep only matches that have a recorded winner and two different sides
has_winner = matches['winner'].notna()
distinct_sides = matches['team1'] != matches['team2']
matches = matches[has_winner & distinct_sides]

# Feature: batting first logic
def identify_bat_first(row):
    """Return the team that batted first in the match described by ``row``.

    ``row`` is any mapping-like record exposing ``toss_decision``,
    ``toss_winner``, ``team1`` and ``team2``.
    """
    toss_winner = row['toss_winner']
    # The toss winner elected to bat, so they open the innings.
    if row['toss_decision'] == 'bat':
        return toss_winner
    # Any other decision sends the opposing side in first.
    if toss_winner == row['team1']:
        return row['team2']
    return row['team1']
# Derive the side that batted first for every match, one row at a time
matches['bat_first'] = matches.apply(identify_bat_first, axis='columns')

# Runs scored by each batting side in each match.
# Long form: one row per (match_id, batting_team) pair.
innings_totals = deliveries.groupby(['match_id', 'batting_team'])['total_runs'].sum()
total_runs = innings_totals.reset_index()

# Wide form: one row per match, one column per team. A team with no
# deliveries in a given match gets 0 instead of NaN.
total_pivot = innings_totals.unstack('batting_team').fillna(0)

def get_team_runs(row, team_col):
    """Look up the runs scored in a match by the team named in ``team_col``.

    Args:
        row: A record exposing ``id`` and the column named by ``team_col``.
        team_col: Name of the field in ``row`` that holds the team to look up.

    Returns:
        The entry of the module-level ``total_pivot`` table at
        (``row['id']``, ``row[team_col]``), or 0 when that match id or team
        is not present in the table.
    """
    try:
        return total_pivot.loc[row['id'], row[team_col]]
    except KeyError:
        # A missing match or team is the only expected miss. Anything else
        # (a NameError, a KeyboardInterrupt, ...) should surface rather than
        # be silently turned into a 0 score.
        return 0

# Attach each side's match total by looking it up row by row
for runs_col, team_col in (('total_runs_team1', 'team1'),
                           ('total_runs_team2', 'team2')):
    matches[runs_col] = matches.apply(get_team_runs, axis=1, args=(team_col,))

# Bowler economy rate: runs per six deliveries, computed per bowler per match
# and then averaged over every bowler appearing in that match.
# NOTE(review): every delivery row counts as a ball and every run in
# total_runs counts against the bowler; confirm that is the intended
# definition if the data records extras.
by_bowler = deliveries.groupby(['match_id', 'bowler'])
bowler_runs = by_bowler['total_runs'].sum()
bowler_balls = by_bowler.size()
overs_bowled = bowler_balls / 6
per_bowler_economy = bowler_runs / overs_bowled
bowler_economy = per_bowler_economy.groupby('match_id').mean()

# Batsman strike rate: runs per 100 deliveries, computed per batsman per
# match and then averaged over every batsman appearing in that match.
by_batsman = deliveries.groupby(['match_id', 'batsman'])
batsman_runs = by_batsman['batsman_runs'].sum()
batsman_balls = by_batsman.size()
per_batsman_strike = batsman_runs / batsman_balls * 100
batsman_strike = per_batsman_strike.groupby('match_id').mean()

# Attach the per-match averages; match ids absent from deliveries map to NaN
matches['economy_rate'] = matches['id'].map(bowler_economy)
matches['strike_rate'] = matches['id'].map(batsman_strike)

# Fill missing values gracefully: a match with no rate takes the mean of the
# matches that have one.
# The filled column is assigned back instead of calling
# fillna(inplace=True) on matches[col]. The in-place form operates on an
# intermediate object, raises a chained-assignment warning in recent pandas,
# and leaves `matches` unchanged under Copy-on-Write.
for rate_col in ('economy_rate', 'strike_rate'):
    matches[rate_col] = matches[rate_col].fillna(matches[rate_col].mean())

# Label encode categorical columns, replacing each column's values with
# integer codes. Every column gets its own independently fitted encoder, so
# the same name is not guaranteed to receive the same code in two columns.
# The fitted encoders are kept so the codes can be reproduced later.
categorical_cols = ['team1', 'team2', 'bat_first', 'venue', 'winner']
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in encoders.items():
    matches[col] = le.fit_transform(matches[col])

# Final feature set: encoded identifiers followed by the numeric match stats.
# NOTE(review): total_runs_team1/2 are summed over all deliveries of the same
# match whose winner is the target; confirm these values are available to
# the caller at prediction time.
features = [
    'team1', 'team2', 'bat_first', 'venue',
    'total_runs_team1', 'total_runs_team2',
    'economy_rate', 'strike_rate',
]
X, y = matches[features], matches['winner']

# Train-test split: 20% of the rows are held out; the fixed seed keeps the
# split reproducible between runs
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Train model: fit() returns the estimator itself, so construction and
# fitting are chained
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Save model and encoders as separate pickle files
artifacts = (
    ('../backend/model/ipl_win_predictor.pkl', model),
    ('../backend/model/encoders.pkl', encoders),
)
for path, obj in artifacts:
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
