# In [4]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib



# Load datasets
matches = pd.read_csv('/content/matches.csv')
balls = pd.read_csv('/content/ball.csv')

# Preprocess data
# Total runs per (match, innings). The column is selected *before* summing so
# only `total_run` is aggregated, not every column of the ball-by-ball table.
inningScores = balls.groupby(['ID', 'innings'])['total_run'].sum().reset_index()
# Keep first-innings totals only. `.copy()` makes this an independent frame so
# the `target` assignment below does not write into a slice of another frame.
inningScores = inningScores[inningScores['innings'] == 1].copy()
# The target is the first-innings total plus one run.
inningScores['target'] = inningScores['total_run'] + 1
# Attach the target to each match; matches without a first-innings total in
# `balls` are dropped by the (default, inner) merge.
matches = matches.merge(inningScores[['ID', 'target']], on='ID')

# Standardize team names
# Maps an older team name (key) to the name used by the rest of this script
# (value).
team_name_changes = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiant': 'Pune Warriors',
    'Rising Pune Supergiants': 'Pune Warriors',
    # NOTE(review): presumably a leftover workaround -- substring replacement
    # of 'Rising Pune Supergiant' inside 'Rising Pune Supergiants' produces
    # exactly this string. Harmless with whole-value replacement; kept so the
    # mapping stays backward-compatible.
    'Pune Warriorss': 'Pune Warriors',
    'Gujarat Lions': 'Gujarat Titans'
}

# `Series.replace` with a dict swaps whole cell values, so a name is renamed
# only when the entire value matches a key (no partial/substring rewrites),
# and each column is processed once instead of once per mapping entry.
# Missing values (e.g. no winner recorded) are left untouched.
for column in ('Team1', 'Team2', 'WinningTeam'):
    matches[column] = matches[column].replace(team_name_changes)
balls['BattingTeam'] = balls['BattingTeam'].replace(team_name_changes)

# Teams that matches and deliveries are restricted to below.
teams2023 = [
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
    'Delhi Capitals',
    'Chennai Super Kings',
    'Gujarat Titans',
    'Lucknow Super Giants',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Mumbai Indians',
]

# A match is kept only when both sides and the winner are in `teams2023`.
is_kept_match = (
    matches['Team1'].isin(teams2023)
    & matches['Team2'].isin(teams2023)
    & matches['WinningTeam'].isin(teams2023)
)
match_columns = ['ID', 'City', 'Team1', 'Team2', 'WinningTeam', 'target']
# Narrow to the columns used later and drop rows with any missing value.
matches = matches.loc[is_kept_match, match_columns].dropna()

# Deliveries are kept only when the batting side is in `teams2023`.
is_kept_delivery = balls['BattingTeam'].isin(teams2023)
balls = balls[is_kept_delivery]

# Merge datasets
# One row per delivery, carrying its match's City/teams/winner/target.
final = matches.merge(balls, on='ID')
# Only second-innings deliveries are used. `.copy()` detaches them from the
# merged frame so the column assignments below write to an independent
# DataFrame instead of a slice.
final = final[final['innings'] == 2].copy()

# Running score within each match, accumulated in row order.
# NOTE(review): assumes deliveries appear in playing order within each match.
final['current_score'] = final.groupby('ID')['total_run'].cumsum()

# Runs still required, clamped at 0 once the target has been passed.
runs_needed = final['target'] - final['current_score']
final['runs_left'] = np.where(runs_needed >= 0, runs_needed, 0)

# Balls remaining out of 120, clamped at 0 when overs * 6 + ballnumber
# exceeds 120.
balls_remaining = 120 - final['overs'] * 6 - final['ballnumber']
final['balls_left'] = np.where(balls_remaining >= 0, balls_remaining, 0)

# Wickets in hand: 10 minus the running count of wicket deliveries per match.
final['wickets_left'] = 10 - final.groupby('ID')['isWicketDelivery'].cumsum()

# Runs per six balls so far; (120 - balls_left) is the number of balls used.
final['current_run_rate'] = (final['current_score'] * 6) / (120 - final['balls_left'])
# Runs per six balls still needed; set to 0 when no balls remain.
final['required_run_rate'] = np.where(final['balls_left'] > 0, final['runs_left'] * 6 / final['balls_left'], 0)

def result(row):
    """Return 1 if the row's batting team is also its winning team, else 0.

    `row` is any mapping-like object indexable by 'BattingTeam' and
    'WinningTeam' (e.g. a DataFrame row).
    """
    batting_side_won = row['BattingTeam'] == row['WinningTeam']
    return int(batting_side_won)

# Label: 1 when the batting side of the row is the match winner, else 0.
# Same values as applying `result` to every row, but computed column-wise,
# which avoids a Python call per delivery and also works on an empty frame.
final['result'] = (final['BattingTeam'] == final['WinningTeam']).astype(int)

# The bowling side is whichever of Team1/Team2 is not batting:
#   batting == Team1 -> Team2, otherwise batting == Team2 -> Team1,
#   otherwise (batting side matches neither) the value is left missing.
batting_side = final['BattingTeam']
final['BowlingTeam'] = final['Team2'].where(
    batting_side == final['Team1'],
    final['Team1'].where(batting_side == final['Team2']),
)

# Feature columns plus the `result` label.
winningPred = final[['BattingTeam', 'BowlingTeam', 'City', 'runs_left', 'balls_left', 'wickets_left', 'current_run_rate', 'required_run_rate', 'target', 'result']]

# Prepare data for training
# Features are every column except the label; the label is `result`.
X = winningPred.drop('result', axis=1)
y = winningPred['result']

# 80/20 split with a fixed seed. Only the training part is used below; the
# held-out part is not scored in this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# One-hot encode the three categorical columns and pass the numeric columns
# through unchanged. `handle_unknown='ignore'` lets the fitted pipeline encode
# a team/city that was absent from the training split as all zeros instead of
# raising at prediction time.
trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), ['BattingTeam', 'BowlingTeam', 'City'])
], remainder='passthrough')

# A fixed `random_state` makes the forest reproducible across runs, matching
# the seeded split above.
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', RandomForestClassifier(random_state=1))
])

pipe.fit(X_train, y_train)

# Save the model
# Persists the whole fitted pipeline (encoder + classifier) to 'pipe.pkl' in
# the current working directory.
joblib.dump(pipe, 'pipe.pkl')


# Out[4]: ['pipe.pkl']