In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pickle

In [None]:
# Loading data: one CSV with match records, one with per-delivery records
MATCHES_CSV = 'matches.csv'
DELIVERIES_CSV = 'deliveries.csv'

matches_data = pd.read_csv(MATCHES_CSV)
deliveries_data = pd.read_csv(DELIVERIES_CSV)

In [None]:
# Total runs scored in each innings of each match: one row per (match_id, inning).
# 'total_runs' is selected *before* aggregating so that only this column is
# summed. Summing every column first is wasteful, and on recent pandas
# (numeric_only defaults to False) it also tries to add up the text columns.
totalrun_df = (
    deliveries_data
    .groupby(['match_id', 'inning'])['total_runs']
    .sum()
    .reset_index()
)

totalrun_df.head()

In [None]:
# Keep only first-innings totals and derive the target the chasing side needs
# (first-innings runs + 1).
# .assign() returns a new frame, so no column is written onto a filtered slice
# (no SettingWithCopyWarning), and the addition is vectorized instead of a
# per-row apply.
totalrun_df = (
    totalrun_df[totalrun_df['inning'] == 1]
    .assign(target_set=lambda first_innings: first_innings['total_runs'] + 1)
)
totalrun_df

In [None]:
# Replacing old team names with new ones
teams_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad'
}

# Apply the renames to every team-name column the notebook uses later, not only
# team1/team2: 'winner' is compared against 'batting_team' to build the label,
# and 'batting_team'/'bowling_team' (from the deliveries file) are model
# features. Renaming only some of them would leave the old and new names as
# separate categories. Rebinding instead of inplace=True keeps the cell
# re-runnable.
matches_data = matches_data.replace(
    {column: teams_mapping for column in ('team1', 'team2', 'winner')}
)
deliveries_data = deliveries_data.replace(
    {column: teams_mapping for column in ('batting_team', 'bowling_team')}
)

In [None]:
# Restrict the matches to fixtures between frequently occurring teams
# (teams such as Kochi Tuskers or Pune Warriors are left out).
frequent_teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
]

# A match is kept only when both sides are in the list.
team1_is_frequent = matches_data['team1'].isin(frequent_teams)
team2_is_frequent = matches_data['team2'].isin(frequent_teams)
filtered_matches_data = matches_data[team1_is_frequent & team2_is_frequent]

In [None]:
# Drop matches where the DL method was applied (they would only confuse the
# model) and keep just the columns needed downstream.
dl_not_applied = filtered_matches_data['dl_applied'] == 0
matches_without_dl = filtered_matches_data.loc[dl_not_applied, ['id', 'city', 'winner']]
matches_without_dl

In [None]:
# Attach each match's target to its row (match 'id' <-> 'match_id').
# NOTE(review): this overwrites matches_without_dl, so running the cell a second
# time merges the target columns again.
first_innings_targets = totalrun_df[['match_id', 'target_set']]
matches_without_dl = pd.merge(
    matches_without_dl,
    first_innings_targets,
    left_on='id',
    right_on='match_id',
)

matches_without_dl

In [None]:
# Join every delivery onto its match row (match 'id' <-> delivery 'match_id').
# Both inputs carry a 'match_id' column, so pandas suffixes them as
# match_id_x / match_id_y in the result.
merged_data = pd.merge(
    matches_without_dl,
    deliveries_data,
    left_on='id',
    right_on='match_id',
)
merged_data

In [None]:
# Inspect the column names produced by the merge (later cells refer to 'match_id_y').
merged_data.columns

In [None]:
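# Disabled (kept for reference only): per-innings run totals are computed in `totalrun_df` instead.
# Assigning the grouped result below would align on the row index rather than per match.
# merged_data['total_runs_inn1'] = merged_data.groupby(['match_id', 'inning']).sum()['total_runs'].reset_index()['total_runs']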

In [None]:
# Filtering second innings data
# .copy() makes this an independent frame: later cells add feature columns to
# it, and writing onto a plain filtered slice triggers SettingWithCopyWarning.
second_innings_data = merged_data[merged_data['inning'] == 2].copy()

In [None]:
# Preview the first rows of the second-innings deliveries.
second_innings_data.head()

In [None]:
# Turn player_dismissed into an integer wicket flag:
#   NaN (no dismissal recorded on the delivery) -> 0, any recorded name -> 1.
#
# The flag is read from merged_data (which the visible cells never modify)
# rather than from the column being overwritten, so re-running this cell gives
# the same answer. The previous fillna("0") / apply / astype chain compared the
# already-converted integers with the string "0" on a second run and turned
# every row into 1.
dismissal_names = merged_data.loc[second_innings_data.index, 'player_dismissed']
second_innings_data['player_dismissed'] = dismissal_names.notna().astype('int')

In [None]:
# Sanity check: list the distinct values now held in the wicket flag column.
second_innings_data['player_dismissed'].unique()

In [None]:
# Per-delivery chase state: score so far, runs / balls / wickets remaining,
# and the current vs. required run rate.

# Running totals restart for every match.
deliveries_by_match = second_innings_data.groupby('match_id_y')
runs_so_far = deliveries_by_match['total_runs'].cumsum()
wickets_fallen = deliveries_by_match['player_dismissed'].cumsum()

# Position of the delivery within the innings, expressed in balls.
ball_position = second_innings_data['over'] * 6 + second_innings_data['ball']

second_innings_data['current_score'] = runs_so_far
second_innings_data['runs_left'] = second_innings_data['target_set'] - runs_so_far
# 126 rather than 120: presumably 'over' is 1-based in this dataset -- TODO confirm
second_innings_data['balls_left'] = 126 - ball_position
second_innings_data['wickets_left'] = 10 - wickets_fallen

# Rates are runs per six balls; 120 - balls_left is the number of balls used.
balls_used = 120 - second_innings_data['balls_left']
second_innings_data['cur_run_rate'] = (runs_so_far * 6) / balls_used
second_innings_data['req_run_rate'] = (
    (second_innings_data['runs_left'] * 6) / second_innings_data['balls_left']
)

In [None]:
# Confirm the derived feature columns are now present on the frame.
second_innings_data.columns

In [None]:
# Creating result column indicating win/lose:
# 1 when the side batting in this (second) innings is the match winner, else 0.
# Built in one plain assignment. The earlier two-step `.loc[:, 'result'] = ...`
# first created a boolean column and then wrote integers into it, which leaves
# the final dtype dependent on how the pandas version handles in-place sets.
batting_side_won = second_innings_data['batting_team'] == second_innings_data['winner']
second_innings_data['result'] = batting_side_won.astype(int)
second_innings_data['result']

In [None]:
# Final DataFrame for modeling: the feature columns followed by the 'result' label
model_columns = [
    'batting_team',
    'bowling_team',
    'city',
    'runs_left',
    'balls_left',
    'wickets_left',
    'target_set',
    'cur_run_rate',
    'req_run_rate',
    'result',
]
final_data = second_innings_data[model_columns]
final_data

In [None]:
# Dropping null values
# Rebind instead of inplace=True: final_data is a column selection of another
# frame, so an in-place drop triggers SettingWithCopyWarning, and rebinding
# keeps the cell safe to re-run.
final_data = final_data.dropna()

In [None]:
# Remove rows with balls_left == 0 (req_run_rate divides by balls_left)
nonzero_balls_left = final_data['balls_left'].ne(0)
final_data = final_data[nonzero_balls_left]

In [None]:
# Separate the label from the features, then hold out 20% of the rows for testing.
# NOTE(review): rows are individual deliveries, so deliveries from one match can
# land in both the train and the test split.
y = final_data['result']
X = final_data.drop(columns=['result'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Text columns that need one-hot encoding; all other columns pass through unchanged.
categorical_columns = ['batting_team', 'bowling_team', 'city']

# drop='first' omits the first category of each encoded column.
one_hot_encoder = OneHotEncoder(drop='first')
column_transformer = ColumnTransformer(
    [('encoder', one_hot_encoder, categorical_columns)],
    remainder='passthrough',
)

In [None]:
# Logistic Regression pipeline: encode the categorical columns, then classify
logistic_steps = [
    ('preprocessor', column_transformer),
    ('classifier', LogisticRegression(solver='liblinear')),
]
logistic_pipeline = Pipeline(steps=logistic_steps)

In [None]:
# Train the Logistic Regression pipeline and report accuracy on the held-out rows
# (fit() returns the fitted pipeline, so the score can be chained).
logistic_accuracy = logistic_pipeline.fit(X_train, y_train).score(X_test, y_test)
print("Logistic Regression Accuracy:", logistic_accuracy)

In [None]:
# Saving Logistic Regression model
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open(...) used previously left the handle open.
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_pipeline, model_file)

In [None]:
# Defining Random Forest Pipeline
# random_state is fixed (same seed as the train/test split) so the forest, its
# reported accuracy and the pickled model are reproducible across runs.
# NOTE(review): this pipeline reuses the same column_transformer object as the
# logistic pipeline; both are fitted on the same training data.
random_forest_pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', RandomForestClassifier(random_state=1))
])

In [None]:
# Train the Random Forest pipeline and report accuracy on the held-out rows
# (fit() returns the fitted pipeline, so the score can be chained).
random_forest_accuracy = random_forest_pipeline.fit(X_train, y_train).score(X_test, y_test)
print("Random Forest Accuracy:", random_forest_accuracy)

In [None]:
# Saving Random Forest model
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open(...) used previously left the handle open.
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(random_forest_pipeline, model_file)