In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pickle

In [2]:
# Loading data
matches_data = pd.read_csv('matches.csv')
deliveries_data = pd.read_csv('deliveries.csv')

In [3]:
totalrun_df = deliveries_data.groupby(['match_id','inning']).sum()['total_runs'].reset_index()

totalrun_df.head()

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
1,1,2,172
2,2,1,184
3,2,2,187
4,3,1,183


In [4]:
totalrun_df = totalrun_df[totalrun_df['inning']==1]
totalrun_df['target_set'] = totalrun_df['total_runs'].apply(lambda x:x+1)
totalrun_df

Unnamed: 0,match_id,inning,total_runs,target_set
0,1,1,207,208
2,2,1,184,185
4,3,1,183,184
6,4,1,163,164
8,5,1,157,158
...,...,...,...,...
1518,11347,1,143,144
1520,11412,1,136,137
1522,11413,1,171,172
1524,11414,1,155,156


In [5]:
# Replacing old team names with new ones
teams_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad'
}
matches_data.replace({'team1': teams_mapping, 'team2': teams_mapping}, inplace=True)

In [6]:
# Filtering frequently occurring teams
# Excluding teams like Kochi Tuskers, Pune Warriors, etc.
frequent_teams = [
    'Sunrisers Hyderabad', 'Mumbai Indians', 'Royal Challengers Bangalore',
    'Kolkata Knight Riders', 'Kings XI Punjab', 'Chennai Super Kings',
    'Rajasthan Royals', 'Delhi Capitals'
]
filtered_matches_data = matches_data[matches_data['team1'].isin(frequent_teams) & matches_data['team2'].isin(frequent_teams)]

In [7]:
# Handling DL method and filtering columns
# We reject matches involving DL method just to avoid confusing our model
matches_without_dl = filtered_matches_data[filtered_matches_data['dl_applied'] == 0]
matches_without_dl = matches_without_dl[['id', 'city', 'winner']]
matches_without_dl

Unnamed: 0,id,city,winner
0,1,Hyderabad,Sunrisers Hyderabad
4,5,Bangalore,Royal Challengers Bangalore
6,7,Mumbai,Mumbai Indians
7,8,Indore,Kings XI Punjab
9,10,Mumbai,Mumbai Indians
...,...,...,...
751,11347,Mumbai,Mumbai Indians
752,11412,Chennai,Mumbai Indians
753,11413,Visakhapatnam,Delhi Capitals
754,11414,Visakhapatnam,Chennai Super Kings


In [8]:
matches_without_dl = matches_without_dl.merge(totalrun_df[['match_id', 'target_set']],
                       left_on='id',right_on='match_id')

matches_without_dl

Unnamed: 0,id,city,winner,match_id,target_set
0,1,Hyderabad,Sunrisers Hyderabad,1,208
1,5,Bangalore,Royal Challengers Bangalore,5,158
2,7,Mumbai,Mumbai Indians,7,179
3,8,Indore,Kings XI Punjab,8,149
4,10,Mumbai,Mumbai Indians,10,159
...,...,...,...,...,...
621,11347,Mumbai,Mumbai Indians,11347,144
622,11412,Chennai,Mumbai Indians,11412,137
623,11413,Visakhapatnam,Delhi Capitals,11413,172
624,11414,Visakhapatnam,Chennai Super Kings,11414,156


In [9]:
# Merging match data with deliveries data
merged_data = matches_without_dl.merge(deliveries_data, left_on='id', right_on='match_id')
merged_data

Unnamed: 0,id,city,winner,match_id_x,target_set,match_id_y,inning,batting_team,bowling_team,over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,Hyderabad,Sunrisers Hyderabad,1,208,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,...,0,0,0,0,0,0,0,,,
1,1,Hyderabad,Sunrisers Hyderabad,1,208,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,...,0,0,0,0,0,0,0,,,
2,1,Hyderabad,Sunrisers Hyderabad,1,208,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,...,0,0,0,0,4,0,4,,,
3,1,Hyderabad,Sunrisers Hyderabad,1,208,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,...,0,0,0,0,0,0,0,,,
4,1,Hyderabad,Sunrisers Hyderabad,1,208,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,...,0,0,0,0,0,2,2,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,Hyderabad,Mumbai Indians,11415,153,11415,2,Chennai Super Kings,Mumbai Indians,20,...,0,0,0,0,1,0,1,,,
149574,11415,Hyderabad,Mumbai Indians,11415,153,11415,2,Chennai Super Kings,Mumbai Indians,20,...,0,0,0,0,2,0,2,,,
149575,11415,Hyderabad,Mumbai Indians,11415,153,11415,2,Chennai Super Kings,Mumbai Indians,20,...,0,0,0,0,1,0,1,SR Watson,run out,KH Pandya
149576,11415,Hyderabad,Mumbai Indians,11415,153,11415,2,Chennai Super Kings,Mumbai Indians,20,...,0,0,0,0,2,0,2,,,


In [11]:
merged_data.columns

Index(['id', 'city', 'winner', 'match_id_x', 'target_set', 'match_id_y',
       'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batsman',
       'non_striker', 'bowler', 'is_super_over', 'wide_runs', 'bye_runs',
       'legbye_runs', 'noball_runs', 'penalty_runs', 'batsman_runs',
       'extra_runs', 'total_runs', 'player_dismissed', 'dismissal_kind',
       'fielder'],
      dtype='object')

In [26]:
# merged_data['total_runs_inn1'] = merged_data.groupby(['match_id', 'inning']).sum()['total_runs'].reset_index()['total_runs']

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
1,1,2,172
2,5,1,157
3,5,2,142
4,7,1,178
...,...,...,...
1262,11413,2,170
1263,11414,1,155
1264,11414,2,162
1265,11415,1,152


In [12]:
# Filtering second innings data
second_innings_data = merged_data[merged_data['inning'] == 2]

In [14]:
second_innings_data.head()

Unnamed: 0,id,city,winner,match_id_x,target_set,match_id_y,inning,batting_team,bowling_team,over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
125,1,Hyderabad,Sunrisers Hyderabad,1,208,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,...,0,0,0,0,1,0,1,,,
126,1,Hyderabad,Sunrisers Hyderabad,1,208,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,...,0,0,0,0,0,0,0,,,
127,1,Hyderabad,Sunrisers Hyderabad,1,208,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,...,0,0,0,0,0,0,0,,,
128,1,Hyderabad,Sunrisers Hyderabad,1,208,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,...,0,0,0,0,2,0,2,,,
129,1,Hyderabad,Sunrisers Hyderabad,1,208,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,...,0,0,0,0,4,0,4,,,


In [19]:
# filling nan values with "0"

second_innings_data['player_dismissed'] = second_innings_data['player_dismissed'].fillna("0")

# now we will convert this player_dismissed col into a boolean col
# if the player is not dismissed then it's 0 else it's 1

second_innings_data['player_dismissed'] = second_innings_data['player_dismissed'].apply(lambda x:x
                                                                      if x=="0" else "1")

# converting string to int

second_innings_data['player_dismissed'] = second_innings_data['player_dismissed'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_innings_data['player_dismissed'] = second_innings_data['player_dismissed'].fillna("0")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_innings_data['player_dismissed'] = second_innings_data['player_dismissed'].apply(lambda x:x
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_innin

In [21]:
second_innings_data['player_dismissed'].unique()

array([0, 1])

In [22]:
# Calculating current score, runs left, balls left, wickets left, current run rate, and required run rate
second_innings_data['current_score'] = second_innings_data.groupby('match_id_y')['total_runs'].cumsum()
second_innings_data['runs_left'] = second_innings_data['target_set'] - second_innings_data['current_score']
second_innings_data['balls_left'] = 126 - (second_innings_data['over'] * 6 + second_innings_data['ball'])
second_innings_data['wickets_left'] = 10 - second_innings_data.groupby('match_id_y')['player_dismissed'].cumsum()
second_innings_data['cur_run_rate'] = (second_innings_data['current_score'] * 6) / (120 - second_innings_data['balls_left'])
second_innings_data['req_run_rate'] = (second_innings_data['runs_left'] * 6) / second_innings_data['balls_left']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_innings_data['current_score'] = second_innings_data.groupby('match_id_y')['total_runs'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_innings_data['runs_left'] = second_innings_data['target_set'] - second_innings_data['current_score']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

In [27]:
second_innings_data.columns

Index(['id', 'city', 'winner', 'match_id_x', 'target_set', 'match_id_y',
       'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batsman',
       'non_striker', 'bowler', 'is_super_over', 'wide_runs', 'bye_runs',
       'legbye_runs', 'noball_runs', 'penalty_runs', 'batsman_runs',
       'extra_runs', 'total_runs', 'player_dismissed', 'dismissal_kind',
       'fielder', 'current_score', 'runs_left', 'balls_left', 'wickets_left',
       'cur_run_rate', 'req_run_rate', 'result'],
      dtype='object')

In [26]:
# Creating result column indicating win/lose
second_innings_data.loc[:, 'result'] = second_innings_data['batting_team'] == second_innings_data['winner']
second_innings_data.loc[:, 'result'] = second_innings_data['result'].astype(int)
second_innings_data['result']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


125       0
126       0
127       0
128       0
129       0
         ..
149573    0
149574    0
149575    0
149576    0
149577    0
Name: result, Length: 72413, dtype: int64

In [28]:
# Final DataFrame with features for modeling
final_data = second_innings_data[['batting_team', 'bowling_team', 'city', 'runs_left',
                                  'balls_left', 'wickets_left', 'target_set', 'cur_run_rate',
                                  'req_run_rate', 'result']]
final_data

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets_left,target_set,cur_run_rate,req_run_rate,result
125,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,119,10,208,6.000000,10.436975,0
126,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,118,10,208,3.000000,10.525424,0
127,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,117,10,208,2.000000,10.615385,0
128,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,205,116,10,208,4.500000,10.603448,0
129,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,201,115,10,208,8.400000,10.486957,0
...,...,...,...,...,...,...,...,...,...,...
149573,Chennai Super Kings,Mumbai Indians,Hyderabad,1,4,5,153,7.862069,1.500000,0
149574,Chennai Super Kings,Mumbai Indians,Hyderabad,-1,3,5,153,7.897436,-2.000000,0
149575,Chennai Super Kings,Mumbai Indians,Hyderabad,-2,2,4,153,7.881356,-6.000000,0
149576,Chennai Super Kings,Mumbai Indians,Hyderabad,-4,1,4,153,7.915966,-24.000000,0


In [29]:
# Dropping null values
final_data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data.dropna(inplace=True)


In [30]:
# Filtering out rows where balls_left = 0
final_data = final_data[final_data['balls_left'] != 0]

In [31]:
# Splitting data into train and test sets
X = final_data.drop('result', axis=1)
y = final_data['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [32]:
# Defining categorical columns for one-hot encoding
categorical_columns = ['batting_team', 'bowling_team', 'city']
# Defining ColumnTransformer
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), categorical_columns)],
    remainder='passthrough'
)

In [33]:
# Defining Logistic Regression Pipeline
logistic_pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', LogisticRegression(solver='liblinear'))
])

In [34]:
# Fitting Logistic Regression model
logistic_pipeline.fit(X_train, y_train)
logistic_accuracy = logistic_pipeline.score(X_test, y_test)
print("Logistic Regression Accuracy:", logistic_accuracy)

Logistic Regression Accuracy: 0.8065036092227906


In [35]:
# Saving Logistic Regression model
pickle.dump(logistic_pipeline, open('logistic_model.pkl', 'wb'))

In [36]:
# Defining Random Forest Pipeline
random_forest_pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', RandomForestClassifier())
])

In [37]:
# Fitting Random Forest model
random_forest_pipeline.fit(X_train, y_train)
random_forest_accuracy = random_forest_pipeline.score(X_test, y_test)
print("Random Forest Accuracy:", random_forest_accuracy)

Random Forest Accuracy: 0.9990889340528418


In [38]:
# Saving Random Forest model
pickle.dump(random_forest_pipeline, open('random_forest_model.pkl', 'wb'))