# IPL Score Prediction using Machine Learning


##Importing libraries

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn

ModuleNotFoundError: No module named 'seaborn'

##Importing dataset

In [None]:
# Load the ball-by-ball IPL data from the CSV next to this notebook.
score_df = pd.read_csv(filepath_or_buffer='ipl_data.csv')

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table; print() would flatten it to wrapped plain text.
score_df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
score_df.info()

In [None]:
# Display summary statistics for the loaded data.
score_df.describe()

In [None]:
# Display the dtype pandas assigned to each column.
score_df.dtypes

##Visualizing data

In [None]:
# Histogram of the 'total' column: 10 bins, no KDE overlay.
total_runs = score_df['total']
sn.displot(total_runs, bins=10, kde=False)
plt.title('Runs Distribution')
plt.show()

##Cleaning the dataset and dropping null values

Here we drop the columns that do not provide any relevant information for training our model, and then remove any rows that contain null values.

In [None]:
# Columns dropped before modelling.
irrelevant = ['mid', 'date', 'venue','batsman', 'bowler', 'striker', 'non-striker']
# Rebind instead of mutating in place, and tolerate already-removed columns
# (errors='ignore'), so re-running this cell does not raise a KeyError.
# Rows containing any null value are then discarded.
score_df = score_df.drop(columns=irrelevant, errors='ignore').dropna(axis=0)

##Keeping only consistent teams

In [None]:
# Teams kept for modelling; the filtering cell retains only rows whose batting
# and bowling sides both appear in this list.
const_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [None]:
# Keep a row only when both the batting and the bowling side are in const_teams.
batting_is_const = score_df['bat_team'].isin(const_teams)
bowling_is_const = score_df['bowl_team'].isin(const_teams)
score_df = score_df[batting_is_const & bowling_is_const]

In [None]:
# Show (rows, columns) remaining after the team filter.
print(score_df.shape)

##Removing the first 5 overs of every match

In [None]:
# Discard every row recorded before the 5.0-over mark.
past_five_overs = score_df['overs'].ge(5.0)
score_df = score_df[past_five_overs]

In [None]:
# Preview the first rows that survive the overs filter.
score_df.head()

In [None]:
# (rows, columns) after removing the early overs.
score_df.shape

##Encoding categorical variables

###Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# score_df at this point is the result of boolean-mask filtering, so assigning
# columns into it directly can trigger pandas' SettingWithCopyWarning. Take an
# explicit copy so the writes below target a frame we own.
score_df = score_df.copy()

le = LabelEncoder()
for col in ['bat_team', 'bowl_team']:
    # The encoder is re-fitted for each column, so after the loop `le` holds
    # the mapping learned from 'bowl_team'.
    score_df[col] = le.fit_transform(score_df[col])
score_df.head()

###One Hot Encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two team columns and pass every other column through
# unchanged (encoded columns come first in the output, then the remainder).
# - The columns are selected by name rather than by position [0, 1], so the
#   encoder cannot silently pick up the wrong columns if the layout changes.
# - sparse_threshold=0 makes the transformer always return a dense array, so
#   the later np.array(...) conversion never receives a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['bat_team', 'bowl_team'])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# Fit the column transformer and apply it in one step.
# NOTE(review): score_df is rebound from a DataFrame to a NumPy array here, so
# this cell is not meant to be re-run without re-running the cells above it.
encoded = ct.fit_transform(score_df)
score_df = np.array(encoded)

Store the NumPy array in a new DataFrame, labelling the one-hot encoded team columns and the remaining feature columns.

In [None]:
# Team names in the order used for the one-hot column labels.
team_names = [
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
]
# Remaining (passed-through) columns, with the target 'total' last.
numeric_cols = ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total']

# Batting-team indicators first, then bowling-team indicators, then the rest.
cols = (
    [f'batting_team_{team}' for team in team_names]
    + [f'bowling_team_{team}' for team in team_names]
    + numeric_cols
)
df = pd.DataFrame(score_df, columns=cols)

In [None]:
# Preview the encoded frame with its new column labels.
df.head()

#Model Building

##Training set and test set

In [None]:
# Every column except the last is a feature; the last column is the target.
feature_frame, target_series = df.iloc[:, :-1], df.iloc[:, -1]
X, y = feature_frame.values, target_series.values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

## ML Algorithms

In [None]:
# Maps a short model label to that model's test-set score (in percent).
models = {}

1. **Multiple Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# Score each split once (the test split was previously scored twice) and
# format the numbers directly: slicing str(score)[:5] garbles negative or
# scientific-notation values.
lr_train_score = lr.score(X_train, y_train) * 100
lr_test_score = lr.score(X_test, y_test) * 100
print(f'Train Score : {lr_train_score:.2f}%\nTest Score : {lr_test_score:.2f}%')
# Stored as a number; the comparison chart's float() conversion accepts it as is.
models['multreg'] = lr_test_score

2. **SVR**

In [None]:
from sklearn.svm import SVR

# Support vector regressor with default hyper-parameters.
# NOTE(review): the features are not scaled before fitting; SVR is sensitive
# to feature scale, so consider adding a scaler - confirm intent.
svr = SVR().fit(X_train, y_train)
svr

In [None]:
# Score each split once (the test split was previously scored twice, which is
# costly for SVR) and format the numbers directly: slicing str(score)[:5]
# garbles negative or scientific-notation values.
svr_train_score = svr.score(X_train, y_train) * 100
svr_test_score = svr.score(X_test, y_test) * 100
print(f'Train Score : {svr_train_score:.2f}%\nTest Score : {svr_test_score:.2f}%')
# Stored as a number; the comparison chart's float() conversion accepts it as is.
models['svm'] = svr_test_score

3. **KNR**

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default hyper-parameters.
knr = KNeighborsRegressor().fit(X_train, y_train)
knr

In [None]:
# Score each split once (the test split was previously scored twice) and
# format the numbers directly: slicing str(score)[:5] garbles negative or
# scientific-notation values.
knr_train_score = knr.score(X_train, y_train) * 100
knr_test_score = knr.score(X_test, y_test) * 100
print(f'Train Score : {knr_train_score:.2f}%\nTest Score : {knr_test_score:.2f}%')
# Stored as a number; the comparison chart's float() conversion accepts it as is.
models['k-nr'] = knr_test_score

4. **Decision Tree Regressor**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix random_state so the fitted tree, and therefore the reported scores, are
# reproducible across kernel restarts (1 matches the train/test split seed).
dtr = DecisionTreeRegressor(random_state=1)
dtr.fit(X_train,y_train)

In [None]:
# Score each split once (the test split was previously scored twice) and
# format the numbers directly: slicing str(score)[:5] garbles negative or
# scientific-notation values.
dtr_train_score = dtr.score(X_train, y_train) * 100
dtr_test_score = dtr.score(X_test, y_test) * 100
print(f'Train Score : {dtr_train_score:.2f}%\nTest Score : {dtr_test_score:.2f}%')
# Stored as a number; the comparison chart's float() conversion accepts it as is.
models['tree'] = dtr_test_score

5. **Random Forest Regression**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix random_state so the bootstrap samples and feature choices, and therefore
# the reported scores and the exported model, are reproducible across runs
# (1 matches the train/test split seed).
rf = RandomForestRegressor(random_state=1)
rf.fit(X_train,y_train)

In [None]:
# Score each split once (the test split was previously scored twice) and
# format the numbers directly: slicing str(score)[:5] garbles negative or
# scientific-notation values.
rf_train_score = rf.score(X_train, y_train) * 100
rf_test_score = rf.score(X_test, y_test) * 100
print(f'Train Score : {rf_train_score:.2f}%\nTest Score : {rf_test_score:.2f}%')
# Stored as a number; the comparison chart's float() conversion accepts it as is.
models['forest'] = rf_test_score

6. **XGBoost**

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with default hyper-parameters.
xgb = XGBRegressor().fit(X_train, y_train)
xgb

In [None]:
# Score each split once (the test split was previously scored twice) and
# format the numbers directly: slicing str(score)[:5] garbles negative or
# scientific-notation values.
xgb_train_score = xgb.score(X_train, y_train) * 100
xgb_test_score = xgb.score(X_test, y_test) * 100
print(f'Train Score : {xgb_train_score:.2f}%\nTest Score : {xgb_test_score:.2f}%')
# Stored as a number; the comparison chart's float() conversion accepts it as is.
models['xgb'] = xgb_test_score

#Best Model

In [None]:
# Bar chart comparing the test-set score recorded for each model.
model_name = list(models.keys())
# float() accepts both numeric and string-encoded scores.
model_accuracy = list(map(float, models.values()))

fig, ax = plt.subplots()
ax.bar(model_name, model_accuracy)
# Label the figure so it can be read on its own; the values are the
# regressors' score() results (R^2) on the test split, times 100.
ax.set(title='Model comparison', xlabel='Model', ylabel='Test R^2 score (%)')
plt.show()

From above, we can see that **Random Forest** performed the best, closely followed by **Decision Tree** and **KNR**

#Export Model

In [None]:
import pickle

filename = "ml_model.pkl"
# Serialise the fitted random forest. The context manager guarantees the file
# handle is flushed and closed even if pickling fails; the bare open() call
# left the handle open.
with open(filename, "wb") as model_file:
    pickle.dump(rf, model_file)