### Libraries

In [148]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier,XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session so cell output stays
# compact. Note that this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')
# Remove pandas' display truncation: show all columns and all rows of a frame.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

### Importing the data

In [90]:
# Load the match-level dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('clean_data.csv')

In [91]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
0,335982,Bangalore,2008-04-18,BB McCullum,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N,,Asad Rauf,RE Koertzen
1,335983,Chandigarh,2008-04-19,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,N,,MR Benson,SL Shastri
2,335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,N,,Aleem Dar,GA Pratapkumar
3,335985,Mumbai,2008-04-20,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,N,,SJ Davis,DJ Harper
4,335986,Kolkata,2008-04-20,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,N,,BF Bowden,K Hariharan


### info/descriptions on the data

In [92]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812 entries, 0 to 811
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               812 non-null    int64  
 1   city             812 non-null    object 
 2   date             812 non-null    object 
 3   player_of_match  812 non-null    object 
 4   venue            812 non-null    object 
 5   neutral_venue    812 non-null    int64  
 6   team1            812 non-null    object 
 7   team2            812 non-null    object 
 8   toss_winner      812 non-null    object 
 9   toss_decision    812 non-null    object 
 10  winner           812 non-null    object 
 11  result           812 non-null    object 
 12  result_margin    799 non-null    float64
 13  eliminator       812 non-null    object 
 14  method           19 non-null     object 
 15  umpire1          812 non-null    object 
 16  umpire2          812 non-null    object 
dtypes: float64(1), i

In [93]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,812.0,755963.018473,306136.343714,335982.0,501226.75,729296.0,1082626.25,1237181.0
neutral_venue,812.0,0.094828,0.293157,0.0,0.0,0.0,0.0,1.0
result_margin,799.0,17.321652,22.068427,1.0,6.0,8.0,19.5,146.0


### Null values

In [94]:
# Number of missing values per column.
df.isnull().sum()

id                   0
city                 0
date                 0
player_of_match      0
venue                0
neutral_venue        0
team1                0
team2                0
toss_winner          0
toss_decision        0
winner               0
result               0
result_margin       13
eliminator           0
method             793
umpire1              0
umpire2              0
dtype: int64

### Let's now analyse the null values and drop the unnecessary features

In [111]:
# Work on a copy so the raw frame `df` is left untouched by the cleaning steps.
df1 = df.copy()

In [112]:
# Columns kept for the analysis, in the order they should appear;
# 'player_of_match' and 'method' are left out.
kept_columns = [
    'id', 'date', 'city', 'venue', 'neutral_venue',
    'team1', 'team2', 'toss_winner', 'toss_decision', 'winner',
    'result', 'result_margin', 'eliminator', 'umpire1', 'umpire2',
]
df1 = df1[kept_columns]

In [113]:
# df1.head()

In [114]:
# With 'method' dropped, 'result_margin' is the only remaining column with
# missing values (13 rows). Fill just that column rather than blanket-filling
# the whole frame, so a stray null in a text column can never be silently
# turned into the integer 0. Reassign instead of mutating in place: `df1` was
# produced by a column selection, and in-place edits on such frames are what
# trigger pandas' chained-assignment warnings.
df1 = df1.fillna({'result_margin': 0})

In [115]:
# Re-count missing values per column to confirm none remain after the fill.
df1.isnull().sum()

id               0
date             0
city             0
venue            0
neutral_venue    0
team1            0
team2            0
toss_winner      0
toss_decision    0
winner           0
result           0
result_margin    0
eliminator       0
umpire1          0
umpire2          0
dtype: int64

### Understanding the data

In [116]:
# For every column, print how many distinct values it holds, then the distinct
# values themselves, then blank lines as a separator.
for column_name in df1.columns:
    column_values = df1[column_name]
    distinct_count = column_values.nunique()
    print('"',column_name,'" unique values : ',distinct_count)
    print(column_values.unique())
    print('\n')

" id " unique values :  812
[ 335982  335983  335984  335985  335986  335987  335988  335989  335990
  335991  335992  335993  335994  335995  335996  335997  335998  335999
  336000  336001  336002  336003  336004  336005  336006  336007  336008
  336009  336010  336011  336012  336013  336014  336015  336016  336017
  336018  336019  336020  336021  336022  336023  336024  336025  336026
  336027  336028  336029  336031  336032  336033  336034  336035  336036
  336037  336038  336039  336040  392181  392182  392183  392184  392185
  392186  392188  392189  392190  392191  392192  392194  392195  392196
  392197  392198  392199  392200  392201  392202  392203  392204  392205
  392206  392207  392208  392209  392210  392211  392212  392213  392214
  392215  392216  392217  392218  392219  392220  392221  392222  392223
  392224  392225  392226  392227  392228  392229  392230  392231  392232
  392233  392234  392235  392236  392237  392238  392239  419106  419107
  419108  419109  41911

### We can see that teams which have since been renamed still appear under their old names. Let's update all of them!

### Creating new features year and month

In [117]:
# The year is the first four characters of each date string (e.g. '2008-04-18');
# the values stay as text.
year = [stamp[:4] for stamp in df1.date]

In [118]:
# Attach the year values, then move the column from the end to position 3.
df1['year'] = year
df1.insert(3, 'year', df1.pop('year'))

In [119]:
# The month is the second '-'-separated field of each date string (still text here).
mth = [stamp.split('-')[1] for stamp in df1.date]
df1['month'] = mth

In [120]:
# Convert the month strings to integers.
df1['month'] = df1['month'].astype(int)

In [121]:
# Move 'month' from the end of the frame to position 4.
df1.insert(4, 'month', df1.pop('month'))

In [126]:
# Distinct names in 'team1'; some franchises appear under more than one name
# (e.g. 'Delhi Daredevils' and 'Delhi Capitals').
df1.team1.unique()

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Deccan Chargers', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals'], dtype=object)

In [127]:
# Map every historical franchise name to the single name used in the rest of
# the analysis. Names not listed here are left unchanged.
# NOTE(review): 'Pune Warriors' is folded into 'Rising Pune Supergiants' and
# 'Deccan Chargers' into 'Sunrisers Hyderabad'; these look like distinct
# franchises sharing a home city rather than pure renames - confirm this
# merge is intended.
TEAM_RENAMES = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
    'Pune Warriors': 'Rising Pune Supergiants',
}

# Every column that holds a team name must be normalised the same way,
# otherwise comparisons such as toss_winner == winner silently miss matches.
TEAM_COLUMNS = ['team1', 'team2', 'toss_winner', 'winner']

for team_column in TEAM_COLUMNS:
    # Series.replace with a dict performs exact value-for-value substitution.
    df1[team_column] = df1[team_column].replace(TEAM_RENAMES)

In [173]:
# Record 'Bengaluru' as 'Bangalore' so the city appears under a single spelling.
renamed_ven = ['Bangalore' if town == 'Bengaluru' else town for town in df1.city]
df1['city'] = renamed_ven

In [160]:
# Matches where Chennai Super Kings won the toss but Rising Pune Supergiants won the game.
csk_won_toss = df1.toss_winner == 'Chennai Super Kings'
rps_won_match = df1.winner == 'Rising Pune Supergiants'
df1.loc[csk_won_toss & rps_won_match]

Unnamed: 0,id,date,city,year,month,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,umpire1,umpire2
263,548322,2012-04-14,Pune,2012,4,Subrata Roy Sahara Stadium,0,Rising Pune Supergiants,Chennai Super Kings,Chennai Super Kings,bat,Rising Pune Supergiants,wickets,7.0,N,Aleem Dar,BNJ Oxenford


### Let's now analyse the data

### Win the toss and win the match?

In [138]:
# For each toss winner: how many matches each team went on to win, and that
# count as a percentage of the toss winner's matches (2 decimal places).
winner_by_toss = df1.winner.groupby(df1.toss_winner)
tw_winner = winner_by_toss.value_counts().to_frame()
tw_winner['percentage'] = (winner_by_toss.value_counts(normalize=True) * 100).round(2)

In [162]:
# The reverse view: for each match winner, which team had won the toss, as a
# count and as a percentage of that winner's victories (2 decimal places).
toss_by_winner = df1.toss_winner.groupby(df1.winner)
op = toss_by_winner.value_counts().to_frame()
op['percentage'] = (toss_by_winner.value_counts(normalize=True) * 100).round(2)

In [164]:
# Display the table with the count column relabelled; `rename` returns a new
# frame here, so `tw_winner` itself keeps its original column name.
tw_winner.rename(columns={'winner':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percentage
toss_winner,winner,Unnamed: 2_level_1,Unnamed: 3_level_1
Chennai Super Kings,Chennai Super Kings,61,62.89
Chennai Super Kings,Mumbai Indians,7,7.22
Chennai Super Kings,Delhi Capitals,6,6.19
Chennai Super Kings,Kings XI Punjab,5,5.15
Chennai Super Kings,Rajasthan Royals,5,5.15
Chennai Super Kings,Royal Challengers Bangalore,5,5.15
Chennai Super Kings,Kolkata Knight Riders,4,4.12
Chennai Super Kings,Sunrisers Hyderabad,3,3.09
Chennai Super Kings,Rising Pune Supergiants,1,1.03
Delhi Capitals,Delhi Capitals,45,45.45


### We can see that Gujarat Lions have the highest win percentage when they win the toss. CSK are second, and the highest among the currently active teams.

### Venues and results

In [174]:
# For each city: how many matches each team won there, and that count as a
# percentage of all matches played in the city (2 decimal places).
winner_by_city = df1.winner.groupby(df1.city)
ven_winner = winner_by_city.value_counts().to_frame()
ven_winner['percentage'] = (winner_by_city.value_counts(normalize=True) * 100).round(2)

In [175]:
# Show the per-city win counts and percentages.
ven_winner

Unnamed: 0_level_0,Unnamed: 1_level_0,winner,percentage
city,winner,Unnamed: 2_level_1,Unnamed: 3_level_1
Abu Dhabi,Kolkata Knight Riders,6,20.69
Abu Dhabi,Mumbai Indians,6,20.69
Abu Dhabi,Rajasthan Royals,6,20.69
Abu Dhabi,Chennai Super Kings,3,10.34
Abu Dhabi,Delhi Capitals,2,6.9
Abu Dhabi,Kings XI Punjab,2,6.9
Abu Dhabi,Royal Challengers Bangalore,2,6.9
Abu Dhabi,Sunrisers Hyderabad,2,6.9
Ahmedabad,Rajasthan Royals,7,58.33
Ahmedabad,Delhi Capitals,1,8.33


### We can see that CSK have the highest home win percentage, at 70%

In [190]:
# csk = df1.loc[(df1.team1 == 'Chennai Super Kings') | (df1.team2 == 'Chennai Super Kings')]

In [197]:
# For each match winner: how the toss decision (bat / field) was distributed
# across the matches they won, as counts and percentages (2 decimal places).
decision_by_winner = df1.toss_decision.groupby(df1.winner)
td_winner = decision_by_winner.value_counts().to_frame()
td_winner['percentage'] = (decision_by_winner.value_counts(normalize=True) * 100).round(2)

In [198]:
# Show the toss-decision breakdown per match winner.
td_winner

Unnamed: 0_level_0,Unnamed: 1_level_0,toss_decision,percentage
winner,toss_decision,Unnamed: 2_level_1,Unnamed: 3_level_1
Chennai Super Kings,bat,55,51.89
Chennai Super Kings,field,51,48.11
Delhi Capitals,field,52,60.47
Delhi Capitals,bat,34,39.53
Gujarat Lions,field,11,84.62
Gujarat Lions,bat,2,15.38
Kings XI Punjab,field,64,72.73
Kings XI Punjab,bat,24,27.27
Kochi Tuskers Kerala,field,6,100.0
Kolkata Knight Riders,field,61,61.62


In [208]:
# For each match winner: how their wins split by result type, as counts and
# as percentages of that team's wins (2 decimal places).
result_by_winner = df1.result.groupby(df1.winner)
res_winner = result_by_winner.value_counts(normalize=False).to_frame()
res_winner['percentage'] = (result_by_winner.value_counts(normalize=True) * 100).round(2)

In [209]:
# Show the result-type breakdown per match winner.
res_winner

Unnamed: 0_level_0,Unnamed: 1_level_0,result,percentage
winner,result,Unnamed: 2_level_1,Unnamed: 3_level_1
Chennai Super Kings,runs,53,50.0
Chennai Super Kings,wickets,53,50.0
Delhi Capitals,wickets,50,58.14
Delhi Capitals,runs,34,39.53
Delhi Capitals,tie,2,2.33
Gujarat Lions,wickets,12,92.31
Gujarat Lions,runs,1,7.69
Kings XI Punjab,wickets,45,51.14
Kings XI Punjab,runs,40,45.45
Kings XI Punjab,tie,3,3.41


In [214]:
# For each match winner: how many wins fall under each 'neutral_venue' flag
# value, as counts and as percentages of that team's wins (2 decimal places).
neutral_by_winner = df1.neutral_venue.groupby(df1.winner)
nut_winner = neutral_by_winner.value_counts().to_frame()
nut_winner['percentage'] = (neutral_by_winner.value_counts(normalize=True) * 100).round(2)

In [218]:
# Show the neutral-venue breakdown per match winner.
nut_winner

Unnamed: 0_level_0,Unnamed: 1_level_0,neutral_venue,percentage
winner,neutral_venue,Unnamed: 2_level_1,Unnamed: 3_level_1
Chennai Super Kings,0,94,88.68
Chennai Super Kings,1,12,11.32
Delhi Capitals,0,74,86.05
Delhi Capitals,1,12,13.95
Gujarat Lions,0,13,100.0
Kings XI Punjab,0,76,86.36
Kings XI Punjab,1,12,13.64
Kochi Tuskers Kerala,0,6,100.0
Kolkata Knight Riders,0,94,94.95
Kolkata Knight Riders,1,5,5.05


In [165]:
# Preview the cleaned frame with the added 'year' and 'month' columns.
df1.head()

Unnamed: 0,id,date,city,year,month,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,umpire1,umpire2
0,335982,2008-04-18,Bangalore,2008,4,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N,Asad Rauf,RE Koertzen
1,335983,2008-04-19,Chandigarh,2008,4,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,N,MR Benson,SL Shastri
2,335984,2008-04-19,Delhi,2008,4,Feroz Shah Kotla,0,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,bat,Delhi Capitals,wickets,9.0,N,Aleem Dar,GA Pratapkumar
3,335985,2008-04-20,Mumbai,2008,4,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,N,SJ Davis,DJ Harper
4,335986,2008-04-20,Kolkata,2008,4,Eden Gardens,0,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Kolkata Knight Riders,wickets,5.0,N,BF Bowden,K Hariharan
