# IPL Score Prediction

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
#importing the dataset
df=pd.read_csv('ipl2017.csv') 

In [3]:
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [4]:
#checking null values
df.isnull().values.any()

False

In [5]:
df.columns

Index(['mid', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman', 'bowler',
       'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker',
       'non-striker', 'total'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mid             76014 non-null  int64  
 1   date            76014 non-null  object 
 2   venue           76014 non-null  object 
 3   bat_team        76014 non-null  object 
 4   bowl_team       76014 non-null  object 
 5   batsman         76014 non-null  object 
 6   bowler          76014 non-null  object 
 7   runs            76014 non-null  int64  
 8   wickets         76014 non-null  int64  
 9   overs           76014 non-null  float64
 10  runs_last_5     76014 non-null  int64  
 11  wickets_last_5  76014 non-null  int64  
 12  striker         76014 non-null  int64  
 13  non-striker     76014 non-null  int64  
 14  total           76014 non-null  int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 8.7+ MB


In [7]:
df.shape

(76014, 15)

<b> Data Cleaning

Removing unwanted columns

In [8]:
df=df.drop(["mid" , "date" , "venue" , "batsman" , "bowler" , "striker" , "non-striker"] , axis=1 )

In [9]:
df.columns

Index(['bat_team', 'bowl_team', 'runs', 'wickets', 'overs', 'runs_last_5',
       'wickets_last_5', 'total'],
      dtype='object')

Converting categorical columns to numerical columns

In [10]:
df['bat_team'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant'], dtype=object)

In [11]:
consistent_teams = ['Kolkata Knight Riders' , 'Chennai Super Kings' , 'Rajasthan Royals' , 
                    'Mumbai Indians' , 'Kings XI Punjab' , 'Royal Challengers Bangalore' ,
                    'Delhi Daredevils' , 'Sunrisers Hyderabad']

In [12]:
# Keeping only consistent teams
print('Before removing inconsistent teams: {}'.format(df.shape))
df = df[(df['bat_team'].isin(consistent_teams)) & (df['bowl_team'].isin(consistent_teams))]
print('After removing inconsistent teams: {}'.format(df.shape))

Before removing inconsistent teams: (76014, 8)
After removing inconsistent teams: (53811, 8)


In [13]:
df.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,Kolkata Knight Riders,Royal Challengers Bangalore,1,0,0.1,1,0,222
1,Kolkata Knight Riders,Royal Challengers Bangalore,1,0,0.2,1,0,222
2,Kolkata Knight Riders,Royal Challengers Bangalore,2,0,0.2,2,0,222
3,Kolkata Knight Riders,Royal Challengers Bangalore,2,0,0.3,2,0,222
4,Kolkata Knight Riders,Royal Challengers Bangalore,2,0,0.4,2,0,222


In [14]:
#df=pd.get_dummies(df,columns=["bat_team" , "bowl_team"] , drop_first=True)

In [15]:
#df

In [16]:
df.replace(['Kolkata Knight Riders' , 'Chennai Super Kings', 'Rajasthan Royals', 'Mumbai Indians', 
            'Kings XI Punjab', 'Royal Challengers Bangalore', 'Delhi Daredevils', 'Sunrisers Hyderabad']
            ,['KKR','CSK','RR','MI','KXIP','RCB','DD','SRH'],inplace=True)


In [17]:
encode = {'bat_team': {'KKR':1,'CSK':2,'RR':3,'MI':4,'KXIP':5,'RCB':6,'DD':7,'SRH':8},
          'bowl_team': {'KKR':1,'CSK':2,'RR':3,'MI':4,'KXIP':5,'RCB':6,'DD':7,'SRH':8}}
df.replace(encode, inplace=True)

In [18]:
df.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,1,6,1,0,0.1,1,0,222
1,1,6,1,0,0.2,1,0,222
2,1,6,2,0,0.2,2,0,222
3,1,6,2,0,0.3,2,0,222
4,1,6,2,0,0.4,2,0,222


### Modelling

In [19]:
y=df["total"]

In [20]:
X=df.drop("total" , axis=1)

In [21]:
X.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5
0,1,6,1,0,0.1,1,0
1,1,6,1,0,0.2,1,0
2,1,6,2,0,0.2,2,0
3,1,6,2,0,0.3,2,0
4,1,6,2,0,0.4,2,0


In [22]:
type(X)

pandas.core.frame.DataFrame

In [23]:
X.shape

(53811, 7)

<b> Train-Test-Split

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,stratify=y,random_state=42)

<b> Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
scaler = StandardScaler()

In [28]:
X_train = scaler.fit_transform(X_train)

In [29]:
X_test = scaler.transform(X_test)

<b> Random Forest Regression

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
model = RandomForestRegressor(n_estimators=100, max_features=None)

In [32]:
model.fit(X_train,y_train)

RandomForestRegressor(max_features=None)

In [33]:
score = model.score(X_test,y_test)

In [34]:
score

0.7839232443124537

<b> Predicting on a new dataset

In [35]:
df_new = {"bat_team":[6,1,4,2], "bowl_team":[3,5,2,3], "runs":[90,56,110,129], "wickets":[2,2,4,8],
          "overs":[15.2,11.1,18.1,19.6], "runs_last_5":[19,12,36,48], "wickets_last_5":[0,1,0,1]}
df_new = pd.DataFrame(df_new)

In [36]:
df_new.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5
0,6,3,90,2,15.2,19,0
1,1,5,56,2,11.1,12,1
2,4,2,110,4,18.1,36,0
3,2,3,129,8,19.6,48,1


In [37]:
#Predicting on new dataset
model.predict(df_new)

array([234.72, 235.12, 234.72, 234.77])