# Virat Kohli - ODI Analysis

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',None)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder , OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor , XGBRegressor
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score

### Importing Clean Data

In [3]:
# df = pd.read_excel('ViratKohli-ODI.xlsx')
df = pd.read_csv('Data-Regression.csv')

In [4]:
df.head()

Unnamed: 0,Match,Innings,Date,M/Inns,Posn,Versus,Ground,How Dismissed,Runs,B/F,S/R,Progressive-Runs,Progressive-B/F,Progressive-Avg,Progressive-S/R,Captain,Dismissal Type,City,State/Province,Country,ICC Region,Latitude,longitude,SENA
0,1,1.0,2008-08-18,1st,2,Sri Lanka,Rangiri Dambulla International Stadium,lbw b K M D N Kulasekara,12,22,54.55,12,22,12.0,54.55,No,LBW,Dambulla,Central Province,Sri Lanka,Asia,7.86,80.6736,0
1,2,2.0,2008-08-20,2nd,2,Sri Lanka,Rangiri Dambulla International Stadium,c C K Kapugedera b T Thushara,37,67,55.22,49,89,24.5,55.06,No,Caught,Dambulla,Central Province,Sri Lanka,Asia,7.86,80.6736,0
2,3,3.0,2008-08-24,1st,1,Sri Lanka,R Premadasa Stadium,run out,25,38,65.79,74,127,24.67,58.27,No,Run Out,Colombo,Western Province,Sri Lanka,Asia,6.9322,79.8726,0
3,4,4.0,2008-08-27,1st,1,Sri Lanka,R Premadasa Stadium,b T Thushara,54,66,81.82,128,193,32.0,66.32,No,Bowled,Colombo,Western Province,Sri Lanka,Asia,6.9322,79.8726,0
4,5,5.0,2008-08-29,2nd,1,Sri Lanka,R Premadasa Stadium,lbw b K M D N Kulasekara,31,46,67.39,159,239,31.8,66.53,No,LBW,Colombo,Western Province,Sri Lanka,Asia,6.9322,79.8726,0


### Data Description and Information

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Match             308 non-null    object        
 1   Innings           296 non-null    float64       
 2   Date              308 non-null    datetime64[ns]
 3   M/Inns            308 non-null    object        
 4   Posn              308 non-null    int64         
 5   Versus            308 non-null    object        
 6   Ground            308 non-null    object        
 7   How Dismissed     308 non-null    object        
 8   Runs              308 non-null    int64         
 9   B/F               308 non-null    int64         
 10  S/R               308 non-null    float64       
 11  Progressive-Runs  308 non-null    int64         
 12  Progressive-B/F   308 non-null    int64         
 13  Progressive-Avg   308 non-null    float64       
 14  Progressive-S/R   308 non-

In [5]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Match,308.0,154.5,89.056162,1.0,77.75,154.5,231.25,308.0
Innings,296.0,148.5,85.592056,1.0,74.75,148.5,222.25,296.0
Posn,308.0,3.061688,0.909286,0.0,3.0,3.0,3.0,7.0
Runs,308.0,47.262987,43.310924,0.0,8.75,34.0,80.25,183.0
B/F,308.0,50.464286,39.894919,0.0,13.0,43.0,83.0,159.0
S/R,308.0,77.269286,39.081306,0.0,59.905,80.95,100.0,209.09
Progressive-Runs,308.0,6910.480519,4375.897862,12.0,2997.25,6473.0,11105.0,14557.0
Progressive-B/F,308.0,7578.116883,4596.356583,22.0,3581.75,7195.0,11923.5,15543.0
Progressive-Avg,308.0,52.078214,6.768001,12.0,49.165,52.17,57.8175,60.31
Progressive-S/R,308.0,87.948377,6.428469,54.55,83.9075,89.91,92.93,93.85


### Null Values

In [6]:
df.isnull().sum()

Match                0
Innings             12
Date                 0
M/Inns               0
Posn                 0
Versus               0
Ground               0
How Dismissed        0
Runs                 0
B/F                  0
S/R                  0
Progressive-Runs     0
Progressive-B/F      0
Progressive-Avg      0
Progressive-S/R      0
Captain              0
Dismissal Type       0
City                 0
State/Province      13
Country              0
ICC Region           0
Latitude             0
longitude            0
SENA                 0
dtype: int64

### Machine Learning

In [8]:
ml = df.copy()

In [9]:
ml.head()

Unnamed: 0,Match,Innings,Date,M/Inns,Posn,Versus,Ground,How Dismissed,Runs,B/F,S/R,Progressive-Runs,Progressive-B/F,Progressive-Avg,Progressive-S/R,Captain,Dismissal Type,City,State/Province,Country,ICC Region,Latitude,longitude,SENA
0,1,1.0,2008-08-18,1st,2,Sri Lanka,Rangiri Dambulla International Stadium,lbw b K M D N Kulasekara,12,22,54.55,12,22,12.0,54.55,No,LBW,Dambulla,Central Province,Sri Lanka,Asia,7.86,80.6736,0
1,2,2.0,2008-08-20,2nd,2,Sri Lanka,Rangiri Dambulla International Stadium,c C K Kapugedera b T Thushara,37,67,55.22,49,89,24.5,55.06,No,Caught,Dambulla,Central Province,Sri Lanka,Asia,7.86,80.6736,0
2,3,3.0,2008-08-24,1st,1,Sri Lanka,R Premadasa Stadium,run out,25,38,65.79,74,127,24.67,58.27,No,Run Out,Colombo,Western Province,Sri Lanka,Asia,6.9322,79.8726,0
3,4,4.0,2008-08-27,1st,1,Sri Lanka,R Premadasa Stadium,b T Thushara,54,66,81.82,128,193,32.0,66.32,No,Bowled,Colombo,Western Province,Sri Lanka,Asia,6.9322,79.8726,0
4,5,5.0,2008-08-29,2nd,1,Sri Lanka,R Premadasa Stadium,lbw b K M D N Kulasekara,31,46,67.39,159,239,31.8,66.53,No,LBW,Colombo,Western Province,Sri Lanka,Asia,6.9322,79.8726,0


In [10]:
ml = ml[['Match','Date','M/Inns','Versus','Country','SENA','Captain','B/F','Runs']]

In [11]:
ml.head()

Unnamed: 0,Match,Date,M/Inns,Versus,Country,SENA,Captain,B/F,Runs
0,1,2008-08-18,1st,Sri Lanka,Sri Lanka,0,No,22,12
1,2,2008-08-20,2nd,Sri Lanka,Sri Lanka,0,No,67,37
2,3,2008-08-24,1st,Sri Lanka,Sri Lanka,0,No,38,25
3,4,2008-08-27,1st,Sri Lanka,Sri Lanka,0,No,66,54
4,5,2008-08-29,2nd,Sri Lanka,Sri Lanka,0,No,46,31


In [12]:
ml['Country'].fillna('India',axis=0,inplace=True)

In [13]:
ml['Date'] = pd.to_datetime(ml['Date'])

In [14]:
ml['Month'] = ml['Date'].dt.month

In [15]:
ml['Year'] = ml['Date'].dt.year

In [16]:
ml.drop('Date',inplace=True,axis=1)

### Preprocessing

In [17]:
ml['M/Inns'] = ml['M/Inns'].map({'1st':1, '2nd':2,'N/A - No Result':0})

In [18]:
ml['M/Inns'].value_counts()

M/Inns
2    169
1    135
0      4
Name: count, dtype: int64

In [19]:
ml['Captain'] = ml['Captain'].map({'Yes':1,'No':0})

In [20]:
le_versus = LabelEncoder()
le_country = LabelEncoder()

In [21]:
ml['Versus'] = le_versus.fit_transform(ml['Versus'])
ml['Country'] = le_country.fit_transform(ml['Country'])

In [22]:
ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Match    308 non-null    int64
 1   M/Inns   308 non-null    int64
 2   Versus   308 non-null    int32
 3   Country  308 non-null    int32
 4   SENA     308 non-null    int64
 5   Captain  308 non-null    int64
 6   B/F      308 non-null    int64
 7   Runs     308 non-null    int64
 8   Month    308 non-null    int32
 9   Year     308 non-null    int32
dtypes: int32(4), int64(6)
memory usage: 19.4 KB


In [23]:
ml.head()

Unnamed: 0,Match,M/Inns,Versus,Country,SENA,Captain,B/F,Runs,Month,Year
0,1,1,10,10,0,0,22,12,8,2008
1,2,2,10,10,0,0,67,37,8,2008
2,3,1,10,10,0,0,38,25,8,2008
3,4,1,10,10,0,0,66,54,8,2008
4,5,2,10,10,0,0,46,31,8,2008


In [24]:
x = ml.drop('Runs',axis=1)
y = ml[['Runs']]

In [25]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.05,shuffle=False)

In [26]:
lr = LinearRegression()
kn = KNeighborsRegressor()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
ada = AdaBoostRegressor()
gb = GradientBoostingRegressor()
xgb = XGBRegressor()
xgbrf = XGBRFRegressor()
cb = CatBoostRegressor()
lgb = LGBMRegressor()

In [27]:
xtrain.tail()

Unnamed: 0,Match,M/Inns,Versus,Country,SENA,Captain,B/F,Month,Year
287,288,1,10,6,0,0,94,11,2023
288,289,1,9,6,0,0,121,11,2023
289,290,1,6,6,0,0,56,11,2023
290,291,1,7,6,0,0,113,11,2023
291,292,1,1,6,0,0,63,11,2023


In [28]:
lr.fit(xtrain,ytrain)
lr_preds = lr.predict(xtest)
lr_r2 = r2_score(ytest,lr_preds)
lr_mae = mean_absolute_error(ytest,lr_preds)
lr_mse = mean_squared_error(ytest,lr_preds)
lr_rmse = np.sqrt(lr_mse)

print('r2 Score: ',lr_r2)
print('MAE: ',lr_mae)
print('RMSE: ',lr_rmse)

r2 Score:  0.9539172067950424
MAE:  6.710447967373824
RMSE:  9.056170885552273


### Decision Tree

In [29]:
dt.fit(xtrain,ytrain)
dt_preds = dt.predict(xtest)
dt_r2 = r2_score(ytest,dt_preds)
dt_mae = mean_absolute_error(ytest,dt_preds)
dt_mse = mean_squared_error(ytest,dt_preds)
dt_rmse = np.sqrt(dt_mse)

print('r2 Score: ',dt_r2)
print('MAE: ',dt_mae)
print('RMSE: ',dt_rmse)

r2 Score:  0.8649011099478279
MAE:  12.3125
RMSE:  15.506047207460707


### KNN

In [30]:
kn.fit(xtrain,ytrain)
kn_preds = kn.predict(xtest)
kn_r2 = r2_score(ytest,kn_preds)
kn_mae = mean_absolute_error(ytest,kn_preds)
kn_mse = mean_squared_error(ytest,kn_preds)
kn_rmse = np.sqrt(kn_mse)

print('r2 Score: ',kn_r2)
print('MAE: ',kn_mae)
print('RMSE: ',kn_rmse)

r2 Score:  0.8933199884988597
MAE:  9.475
RMSE:  13.778969482512109




### Random Forest

In [31]:
rf.fit(xtrain,ytrain)
rf_preds = rf.predict(xtest)
rf_r2 = r2_score(ytest,rf_preds)
rf_mae = mean_absolute_error(ytest,rf_preds)
rf_mse = mean_squared_error(ytest,rf_preds)
rf_rmse = np.sqrt(rf_mse)

print('r2 Score: ',rf_r2)
print('MAE: ',rf_mae)
print('RMSE: ',rf_rmse)

r2 Score:  0.9191382292194809
MAE:  7.789375
RMSE:  11.996286665047647


  rf.fit(xtrain,ytrain)


### Gradient Boosting

In [32]:
gb.fit(xtrain,ytrain)
gb_preds = gb.predict(xtest)
gb_r2 = r2_score(ytest,gb_preds)
gb_mae = mean_absolute_error(ytest,gb_preds)
gb_mse = mean_squared_error(ytest,gb_preds)
gb_rmse = np.sqrt(gb_mse)

print('r2 Score: ',gb_r2)
print('MAE: ',gb_mae)
print('RMSE: ',gb_rmse)

r2 Score:  0.9112667310640246
MAE:  9.506960123408943
RMSE:  12.566619110159118


  y = column_or_1d(y, warn=True)


### Adaboost

In [33]:
ada.fit(xtrain,ytrain)
ada_preds = ada.predict(xtest)
ada_r2 = r2_score(ytest,ada_preds)
ada_mae = mean_absolute_error(ytest,ada_preds)
ada_mse = mean_squared_error(ytest,ada_preds)
ada_rmse = np.sqrt(ada_mse)

print('r2 Score: ',ada_r2)
print('MAE: ',ada_mae)
print('RMSE: ',ada_rmse)

r2 Score:  0.9315385642402844
MAE:  8.044973843440836
RMSE:  11.038198831603053


  y = column_or_1d(y, warn=True)


### Catboost

In [34]:
# cb.fit(xtrain,ytrain)
# cb_preds = cb.predict(xtest)
# cb_r2 = r2_score(ytest,cb_preds)
# cb_mae = mean_absolute_error(ytest,cb_preds)
# cb_mse = mean_squared_error(ytest,cb_preds)
# cb_rmse = np.sqrt(cb_mse)

# print('r2 Score: ',cb_r2)
# print('MAE: ',cb_mae)
# print('RMSE: ',cb_rmse)

### LightGBM

In [35]:
lgb.fit(xtrain,ytrain)
lgb_preds = lgb.predict(xtest)
lgb_r2 = r2_score(ytest,lgb_preds)
lgb_mae = mean_absolute_error(ytest,lgb_preds)
lgb_mse = mean_squared_error(ytest,lgb_preds)
lgb_rmse = np.sqrt(lgb_mse)

print('r2 Score: ',lgb_r2)
print('MAE: ',lgb_mae)
print('RMSE: ',lgb_rmse)

r2 Score:  0.9221247846617131
MAE:  8.044934496035504
RMSE:  11.77266650754097


### Predictions

In [37]:
lrpreds = []
for i in lr_preds:
    lrpreds.append(i[0])

In [38]:
ytest['Preds'] = np.absolute(lrpreds)

In [40]:
ytest['Preds'] = ytest['Preds'].astype('int64')

In [44]:
ytest['Diffrence'] = ytest['Preds'] - ytest['Runs']

### Post Processing

In [47]:
xtest['Versus'] = le_versus.inverse_transform(xtest['Versus'])
xtest['Country'] = le_country.inverse_transform(xtest['Country'])

In [52]:
xtest['Actual Score'] = ytest['Runs']
xtest['Predicted Score'] = ytest['Preds']

In [54]:
xtest['Prediction Error'] = ytest['Diffrence']

In [55]:
xtest

Unnamed: 0,Match,M/Inns,Versus,Country,SENA,Captain,B/F,Month,Year,Actual Score,Predicted Score,Prediction Error
292,293,2,Sri Lanka,Sri Lanka,0,0,32,8,2024,24,29,5
293,294,2,Sri Lanka,Sri Lanka,0,0,19,8,2024,14,16,2
294,295,2,Sri Lanka,Sri Lanka,0,0,18,8,2024,20,15,-5
295,296,2,England,India,0,0,8,2,2025,5,4,-1
296,297,1,England,India,0,0,55,2,2025,52,53,1
297,298,2,Bangladesh,UAE,0,0,38,2,2025,22,35,13
298,299,2,Pakistan,UAE,0,0,111,2,2025,100,110,10
299,300,1,New Zealand,UAE,0,0,14,3,2025,11,10,-1
300,301,2,Australia,UAE,0,0,98,3,2025,84,97,13
301,302,2,New Zealand,UAE,0,0,2,3,2025,1,1,0
