# IPL MATCH PREDICTION

> The main aim of this project is to predict the outcome of an IPL match by considering certain factors that are available in the dataset.

### Importing the required packages

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from pandas import DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw match records from 'matches.csv' (path is relative to the
# working directory) into the DataFrame used by all the visualisation cells.
iplmatches = pd.read_csv('matches.csv')

In [None]:
# Preview the first five rows of the dataset.
iplmatches.head(5)

In [None]:
# Descriptive statistics of the dataset as produced by DataFrame.describe().
iplmatches.describe()

In [None]:
# Pair-wise correlation of all numeric column pairs.
# Text columns are excluded explicitly: recent pandas releases raise on
# non-numeric columns instead of silently dropping them.
iplmatches.select_dtypes(include='number').corr()

### Visualization

In [None]:
# Bar chart of how many matches were played in each season.
season_ax = sns.countplot(data=iplmatches, x='season')
season_ax.set_title('Number of matches each season')
plt.show()

In [None]:
# Bar chart of the number of tosses won by each team over all seasons.
toss_ax = sns.countplot(data=iplmatches, x='toss_winner')
plt.xticks(rotation='vertical')  # team names are long; keep them readable
toss_ax.set_title('Number of tosses won')

In [None]:
def matches_played(str):
    """Return how many rows of ``iplmatches`` list the given team.

    A row counts when the team name appears in either the ``team1`` or the
    ``team2`` column.

    NOTE: the parameter is named ``str`` (shadowing the builtin) and is kept
    unchanged so existing callers are unaffected.
    """
    listed_first = iplmatches['team1'] == str
    listed_second = iplmatches['team2'] == str
    return len(iplmatches[listed_first | listed_second])

# Matches played per franchise. Franchises that appear under two names are
# combined by ADDING the two counts (a bitwise OR of two integers is not a sum).
matches_played_KKR = matches_played('Kolkata Knight Riders')
matches_played_MI = matches_played('Mumbai Indians')
matches_played_KXP = matches_played('Kings XI Punjab')
matches_played_CSK = matches_played('Chennai Super Kings')
matches_played_GL = matches_played('Gujarat Lions')
matches_played_DD = matches_played('Delhi Daredevils') + matches_played('Delhi Capitals')
matches_played_RCB = matches_played('Royal Challengers Bangalore')
matches_played_SRH = matches_played('Sunrisers Hyderabad') + matches_played('Deccan Chargers')
matches_played_RPS = matches_played('Rising Pune Supergiants')
matches_played_RR = matches_played('Rajasthan Royals')

matches_played_RPS

In [None]:
# Rows won by each franchise. SRH and DD are each matched under the two
# names listed for them below.
winner_col = iplmatches['winner']

rps_won_df = iplmatches[winner_col == 'Rising Pune Supergiants']
kkr_won_df = iplmatches[winner_col == 'Kolkata Knight Riders']
kxip_won_df = iplmatches[winner_col == 'Kings XI Punjab']
rcb_won_df = iplmatches[winner_col == 'Royal Challengers Bangalore']
srh_won_df = iplmatches[winner_col.isin(['Sunrisers Hyderabad', 'Deccan Chargers'])]
mi_won_df = iplmatches[winner_col == 'Mumbai Indians']
gl_won_df = iplmatches[winner_col == 'Gujarat Lions']
dd_won_df = iplmatches[winner_col.isin(['Delhi Daredevils', 'Delhi Capitals'])]
csk_won_df = iplmatches[winner_col == 'Chennai Super Kings']
rr_won_df = iplmatches[winner_col == 'Rajasthan Royals']

# Number of wins = number of rows in the corresponding frame.
rps_wins = rps_won_df.shape[0]
kkr_wins = kkr_won_df.shape[0]
kxip_wins = kxip_won_df.shape[0]
rcb_wins = rcb_won_df.shape[0]
srh_wins = srh_won_df.shape[0]
mi_wins = mi_won_df.shape[0]
gl_wins = gl_won_df.shape[0]
dd_wins = dd_won_df.shape[0]
csk_wins = csk_won_df.shape[0]
rr_wins = rr_won_df.shape[0]

rps_wins

In [None]:
# Overlaid bar chart: matches played (background) vs matches won (foreground)
# for each franchise.
team_labels = ('RPS', 'KKR', 'KXIP', 'RCB', 'SRH', 'MI', 'GL', 'DD', 'CSK', 'RR')
n_bins = len(team_labels)
ind = np.arange(n_bins)
width = 0.50

plt.figure(figsize=(10,10))

# Named `played_counts` so the list does not overwrite the matches_played()
# function, which would break that function for any later call.
played_counts = [matches_played_RPS, matches_played_KKR, matches_played_KXP, matches_played_RCB, matches_played_SRH,
                 matches_played_MI, matches_played_GL, matches_played_DD, matches_played_CSK, matches_played_RR]
matches_won = [rps_wins, kkr_wins, kxip_wins, rcb_wins, srh_wins, mi_wins, gl_wins, dd_wins, csk_wins, rr_wins]

p1 = plt.bar(ind, played_counts, width, color='LightSkyBlue')
p2 = plt.bar(ind, matches_won, width, color='Lime')

plt.ylabel('Number of Matches')
plt.xlabel('Teams')
plt.title('Overall performance of the team')
# Bars are centre-aligned on `ind`, so the tick labels go on `ind` too.
plt.xticks(ind, team_labels)
plt.yticks(np.arange(0, 200, 5))
plt.legend((p1[0], p2[0]), ('matches_played', 'matches_won'))

In [None]:
# Pie chart of what toss winners chose to do (share of each toss_decision value).
decision = iplmatches['toss_decision'].value_counts()
pie_options = dict(
    labels=decision.index,
    startangle=90,
    counterclock=False,
    autopct='%1.1f%%',
    shadow=True,
)
plt.pie(decision, **pie_options)
plt.axis('square')
plt.title('Toss winners decision', size=15)

In [None]:
# Did the toss winner also win the match?
# `winneroft` is True where toss_winner equals winner and False otherwise.

winneroft = iplmatches['toss_winner'] == iplmatches['winner']
# Print the True/False totals; a bare expression in the middle of a cell is
# evaluated but never displayed.
print(winneroft.groupby(winneroft).size())
# The Series must be passed as `x=`; the first positional slot of countplot
# is not the plotted variable in current seaborn releases.
sns.countplot(x=winneroft).set_title('Comparision of toss and match winners')

In [None]:
# Season-wise count of wins by teams that won both the toss and the match.
winneroftoss = iplmatches[(iplmatches['toss_winner']) == (iplmatches['winner'])]
# The figure size has to be configured BEFORE the plot is created; setting it
# afterwards only affects later figures.
sns.set(rc={'figure.figsize':(22,18)})
wot = sns.countplot( x = 'winner', hue='season', data=winneroftoss)
plt.xticks(rotation = 'vertical')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("Teams")
plt.ylabel("Number of Wins")
plt.title("Number of Teams who won, given they won the toss")
# plt.show() renders the current figure; it does not take an Axes argument.
plt.show()

In [None]:
# List the distinct values that occur in the 'winner' column.
iplmatches['winner'].unique()

In [None]:
# Share of total wins per franchise.
# One tuple per slice: (label, win count, colour, explode offset).
# Only the RCB slice is pulled out of the pie.
team_slices = [
    ('RPS', rps_wins, '#86EDF9', 0),
    ('KKR', kkr_wins, '#2F2057', 0),
    ('KXIP', kxip_wins, '#FD0064', 0),
    ('RCB', rcb_wins, '#FD0000', 0.25),
    ('SRH', srh_wins, '#FD9200', 0),
    ('MI', mi_wins, '#0049FD', 0),
    ('GL', gl_wins, '#FDB000', 0),
    ('DD', dd_wins, '#0086FD', 0),
    ('CSK', csk_wins, '#FDFD00', 0),
    ('RR', rr_wins, '#1B00FD', 0),
]
labels = [entry[0] for entry in team_slices]
sizes = [entry[1] for entry in team_slices]
colors = [entry[2] for entry in team_slices]
explode = tuple(entry[3] for entry in team_slices)

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90, radius=0.5)
plt.title("Number of matches won by each team through the years")

In [None]:
# Number of times the home team has won the match.
# team1 is treated as the home team, as in the rest of this notebook.

won = iplmatches['team1'] == iplmatches['winner']
# Print the True/False totals for the home-win flag; this cell must use `won`,
# not the toss-based `winneroft` Series from an earlier cell.
print(won.groupby(won).size())
sns.set(rc={'figure.figsize':(10,6)})
sns.countplot(x=won).set_title('Number of matches won by the home team')

In [None]:
# Matches in Bengaluru that RCB has won / lost.

rcb_name = 'Royal Challengers Bangalore'
# The city appears under both spellings in the data.
matches_in_bengaluru = iplmatches[(iplmatches['city'] == 'Bengaluru') | (iplmatches['city'] == 'Bangalore')]
# Keep only decided matches that RCB actually played in; otherwise matches
# between two other teams and no-results would all be counted as RCB losses.
rcb_played = (matches_in_bengaluru['team1'] == rcb_name) | (matches_in_bengaluru['team2'] == rcb_name)
rcb_home_matches = matches_in_bengaluru[rcb_played & matches_in_bengaluru['winner'].notna()]
matches_won_by_RCB = len(rcb_home_matches[rcb_home_matches['winner'] == rcb_name])
matches_lost_by_RCB = len(rcb_home_matches) - matches_won_by_RCB
data = (matches_won_by_RCB, matches_lost_by_RCB)
rcb = pd.DataFrame(data = data, index = ['won','lost'])
# plt.pie needs 1-D input, so pass the single column (named 0 by default)
# rather than the 2-D DataFrame.
plt.pie(rcb[0], labels = rcb.index, startangle = 90, counterclock = False,autopct='%1.1f%%',shadow=True)
plt.axis('square')
plt.title("RCB's home matches results",size = 25)

In [None]:
# Ten players with the most 'player_of_match' entries.

plt.figure(figsize=(10,10))
award_counts = iplmatches['player_of_match'].value_counts()
top_players = award_counts.head(10)
top_players.plot.bar()
plt.title("Players with the most number of 'man of the match' award")
plt.xticks(rotation=90)

In [None]:
# Total wins per team, most successful first (value_counts sorts descending).
# value_counts() returns a Series, which has no columns to rename, so the
# former `team_wins.columns = ...` assignment has been dropped.
team_wins = iplmatches['winner'].value_counts()

plt.plot(team_wins, color = 'red')
plt.title('Total number of wins',size = 20)
plt.xticks(rotation = 90)

In [None]:
# One marker per (team1, team2) fixture, with the marker area proportional to
# how many times that fixture occurs. Sizes come from the data itself, so they
# always match the number of points.
pair_counts = iplmatches.groupby(['team1', 'team2']).size().reset_index(name='matches')

fig, ax = plt.subplots()
ax.scatter(pair_counts['team1'], pair_counts['team2'],
           s=pair_counts['matches'] * 40,  # scale factor chosen for legibility
           c='tab:blue', alpha=0.5, edgecolors='none',
           label='matches played (marker area)')

ax.legend()
ax.grid(True)

plt.title('Number of matches played against each other',size = 20)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# Text columns are excluded explicitly: recent pandas releases raise on
# non-numeric columns instead of silently dropping them.

corr = iplmatches.select_dtypes(include='number').corr()
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_yticklabels(ax.get_yticklabels(), rotation = 0)

#### Importing the second dataset

In [None]:
# Load the second dataset from 'matches_cleaned.csv' (path is relative to the
# working directory) and preview its first five rows.
matches = pd.read_csv('matches_cleaned.csv')
matches.head(5)

### Cleaning the dataset

In [None]:
# Work on a copy so the imported dataset stays untouched.

copy_data = matches.copy()
# Fill missing values by assigning the result back to the column. Calling
# fillna(..., inplace=True) on a column selection may act on a temporary
# object and leave copy_data unchanged.
copy_data['city'] = copy_data['city'].fillna('Dubai')
copy_data['umpire1'] = copy_data['umpire1'].fillna('Aleem Dar')

In [None]:
# Table of columns that still contain missing values, worst first.

null_values_col = (
    copy_data.isnull()
    .sum()
    .loc[lambda counts: counts != 0]
    .sort_values(ascending=False)
    .reset_index()
)
null_values_col.columns = ["variable", "number of missing"]
null_values_col.head()

In [None]:
# Build the modelling frame from copy_data, restricted to the listed columns.
df = DataFrame(copy_data,columns=['team1', 'team2', 'toss_decision','toss_winner','city', 'venue', 'season', 'win_by_runs', 'win_by_wickets', 'umpire1', 'Home win', 'winner'])

In [None]:
# Replace full team names with abbreviations, in every column of df.

team_abbreviations = {
    'Mumbai Indians': 'MI',
    'Kolkata Knight Riders': 'KKR',
    'Royal Challengers Bangalore': 'RCB',
    'Deccan Chargers': 'DC',
    'Chennai Super Kings': 'CSK',
    'Rajasthan Royals': 'RR',
    'Delhi Daredevils': 'DD',
    'Gujarat Lions': 'GL',
    'Kings XI Punjab': 'KXIP',
    'Sunrisers Hyderabad': 'SRH',
    'Rising Pune Supergiants': 'RPS',
    'Kochi Tuskers Kerala': 'KTK',
    'Pune Warriors': 'PW',
    'Delhi Capitals': 'DCP',
}
# Passed as two parallel lists (names, abbreviations) in insertion order.
df.replace(list(team_abbreviations.keys()), list(team_abbreviations.values()), inplace=True)

In [None]:
# Manual integer encoding of the categorical columns.

# Team abbreviation -> code, numbered 1..14 in the order listed here.
_team_order = ['MI', 'KKR', 'RCB', 'DC', 'CSK', 'RR', 'DD', 'GL', 'KXIP', 'SRH', 'RPS', 'KTK', 'PW', 'DCP']
_team_codes = {abbr: code for code, abbr in enumerate(_team_order, start=1)}

# City -> code. 'Bangalore' and 'Bengaluru' share code 5.
_city_codes = {
    'Hyderabad': 1, 'Pune': 2, 'Rajkot': 3, 'Indore': 4, 'Bangalore': 5, 'Mumbai': 6,
    'Kolkata': 7, 'Delhi': 8, 'Chandigarh': 9, 'Kanpur': 10, 'Jaipur': 11, 'Chennai': 12,
    'Cape Town': 16, 'Port Elizabeth': 15, 'Durban': 14, 'Centurion': 13,
    'East London': 17, 'Johannesburg': 18, 'Kimberley': 19, 'Bloemfontein': 20,
    'Ahmedabad': 25, 'Cuttack': 24, 'Nagpur': 23, 'Dharamsala': 22, 'Kochi': 21,
    'Visakhapatnam': 26, 'Raipur': 27, 'Ranchi': 28, 'Abu Dhabi': 29, 'Sharjah': 30,
    'Mohali': 31, 'Bengaluru': 5, 'Dubai': 32,
}

# Column name -> {value: code}. 'winner' has the extra outcome 'Draw'.
encode = {
    'team1': dict(_team_codes),
    'team2': dict(_team_codes),
    'toss_winner': dict(_team_codes),
    'winner': {**_team_codes, 'Draw': 15},
    'city': _city_codes,
}
# Apply the per-column mappings in `encode` to df in place: each top-level
# key is a column name and its inner dict maps a value to its integer code.
df.replace(encode, inplace=True)


In [None]:
# Encode the remaining categorical columns with sklearn's LabelEncoder.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
var_mod = ['toss_decision', 'venue', 'umpire1']
for column in var_mod:
    # The encoder is re-fitted for every column, so each column gets its own codes.
    encoded_values = le.fit_transform(df[column])
    df[column] = encoded_values

df['city'].unique()

In [None]:
# Check for NULL/NaN values, show the affected rows, then drop the rows that
# have no winner.

# These are printed explicitly because a notebook cell only displays its
# final expression.
print(df.isna().any())
row_having_nan = df.isnull().any(axis=1)
rows_with_NaN = df[row_having_nan]
print(rows_with_NaN)
df.dropna(subset = ["winner"], inplace=True)

In [None]:
# Convert the winner and city columns to int.
# astype() returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the column unchanged.

df['winner'] = df['winner'].astype(int)
df['city'] = df['city'].astype(int)
df.dtypes

In [None]:
# Separate the feature matrix (x) from the target (y = 'Home win').
# NOTE(review): 'winner', 'win_by_runs' and 'win_by_wickets' are only known
# after a match has finished; if 'Home win' is derived from the winner, this
# leaks the target into the features. Confirm this is intended.

feature_columns = [
    'team1', 'team2', 'toss_decision', 'toss_winner', 'city', 'venue',
    'season', 'winner', 'win_by_runs', 'win_by_wickets', 'umpire1',
]
target_columns = ['Home win']

x = df[feature_columns]
y = df[target_columns]

In [None]:
# Correlation between the listed columns of copy_data.
corr_columns = ['season', 'win_by_runs', 'win_by_wickets', 'Home win']
data = DataFrame(copy_data, columns=corr_columns)
print(data.corr())

### Fitting regression models

In [None]:
from pandas import DataFrame
from sklearn import linear_model
import statsmodels.api as sm

# scikit-learn linear regression on the full feature set (fit() returns the
# fitted estimator itself).
regr = linear_model.LinearRegression().fit(x, y)

# statsmodels OLS on the same data. No intercept column is added to x here.
ols_features = x.astype(float)
model = sm.OLS(y, ols_features).fit()

print_model = model.summary()
print_model

In [None]:
import warnings

# Silence warnings for the rest of the session. The filter is installed at
# module level because a filter set inside `warnings.catch_warnings()` is
# reverted when the `with` block exits, which made the original cell a no-op.
warnings.simplefilter("ignore")

In [None]:
# Split the data into training and test sets: 30% of the rows go to the test
# set, and random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [None]:
# Fit a LogisticRegression model with default settings and report its accuracy.

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
# y_train is a one-column DataFrame; scikit-learn expects a 1-D target, so
# flatten it rather than relying on an implicit conversion.
logreg.fit(X_train, y_train.values.ravel())

print('Logistic Regression score of train data: {:.2f}'.format(logreg.score(X_train,y_train)))
print('Logistic Regression score of test data: {:.2f}'.format(logreg.score(X_test,y_test)))

# Accuracy computed directly from the predictions: the fraction of test rows
# whose predicted label equals the true label.
y_pred = logreg.predict(X_test)
test_accuracy = np.mean(y_pred == y_test.values.ravel())
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Accumulators for the train/test accuracy of each fitted model.
train_score = []
test_score=[]

In [None]:
import warnings
# Suppress FutureWarning messages from here on; this filter is global.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Sweep the inverse regularisation strength C over 1..79 and record the
# train/test accuracy for each value.

# Reset the accumulators in this cell so that re-running it does not append
# a second batch of scores (which would break the plot of 79 C values).
train_score = []
test_score = []

# Loop-invariant: cast the labels once and flatten them to the 1-D shape
# scikit-learn expects.
y_train = y_train.astype('int')
y_train_flat = y_train.values.ravel()

for i in np.arange(1,80):
    logreg = LogisticRegression(penalty = 'l2', C = float(i), random_state = 0)
    logreg.fit(X_train, y_train_flat)

    train_score.append(logreg.score(X_train, y_train_flat))
    test_score.append(logreg.score(X_test,y_test))

In [None]:
# Training and testing accuracy as a function of C.

c_values = np.arange(1,80)
for scores in (train_score, test_score):
    plt.plot(c_values, scores)
plt.legend(['Training Accuracy','Testing Accuracy'])
plt.title('Logistic Regression Tuning')
plt.xlabel('C')
plt.ylabel('Accuracy')

In [None]:
# Show the dtype of every column in df.
df.dtypes

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Show the team abbreviation -> integer code mapping.
encode['team1']

In [None]:
# Preview the first rows of df again.
df.head()

## Predict the winner of a particular match using Decision Tree Regression

> The input to the model will be the home team, the away team, the toss winner and the city the match is played in.
> This model uses a decision tree from sklearn.tree, trained on the previous matches between the two teams, to predict the outcome.
> The model also outputs the accuracy of the output predicted result.
> The decision tree graph at the end shows how the model predicts the result based on the different parameters.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import sys  # kept at cell level: the later prediction cells call sys.exit()

# Abbreviation -> integer code. The same team codes are used for the team1,
# team2, toss_winner and winner columns.
team_ids = encode['team1']
city_ids = encode['city']
# Reverse lookup for reporting; built from 'winner' so that 'Draw' is covered.
winner_names = {code: name for name, code in encode['winner'].items()}

home_team = input('Enter team1\t')
away_team = input('Enter team2\t')
toss_winner = input('Toss winner\t')

# Reject unknown abbreviations up front (they would otherwise surface as a
# KeyError further down) and a toss winner that is not one of the two teams.
if (home_team not in team_ids or away_team not in team_ids
        or toss_winner not in (home_team, away_team)):
    print("Invalid input")
    sys.exit()

city = input('Enter city\t')
if city not in city_ids:
    print("Invalid input")
    sys.exit()

home_id = team_ids[home_team]
away_id = team_ids[away_team]

# Head-to-head history, regardless of which side was listed as team1.
x_ = df[((df['team1'] == home_id) & (df['team2'] == away_id)) | ((df['team1'] == away_id) & (df['team2'] == home_id))]
if x_.empty:
    print("No previous matches between these teams")
    sys.exit()

x_train = x_[['team1','team2','toss_winner','city']]
y_train = x_['winner'].astype(int)

# The winner code is a nominal label, so a classifier is required. A regressor
# averages the codes in a leaf and can return a value that belongs to neither
# team, or to no team at all.
detreg = DecisionTreeClassifier(random_state=0)
detreg.fit(x_train, y_train)

data = { 'team1' : home_id , 'team2' : away_id, 'toss_winner' : team_ids[toss_winner], 'city' : city_ids[city] }
a = pd.DataFrame(data, index=[0])
result = detreg.predict(a)

print('\nProbable Winner is ', winner_names[int(result[0])])

# Accuracy on the rows the tree was trained on, i.e. an optimistic figure.
print('Decision Tree training accuracy: {}'.format(detreg.score(x_train,y_train)))

tree.plot_tree(detreg)

## Predict the winner of a particular match using Logistic Regression

> The input to the model will be the home team, the away team, the toss winner and the city the match is played in.
> This model will use the sklearn.linear_model's LogisticRegression class for classification and predicts the outcome.
> The model also outputs the accuracy of the output predicted result.

In [None]:
import sys  # imported here so the cell does not depend on another cell's import

from sklearn.linear_model import LogisticRegression

# Abbreviation -> integer code. The same team codes are used for the team1,
# team2, toss_winner and winner columns.
team_ids = encode['team1']
city_ids = encode['city']
# Reverse lookup for reporting; built from 'winner' so that 'Draw' is covered.
winner_names = {code: name for name, code in encode['winner'].items()}

home_team = input('Enter team1\t')
away_team = input('Enter team2\t')
toss_winner = input('Toss winner\t')

# Reject unknown abbreviations up front (they would otherwise surface as a
# KeyError further down) and a toss winner that is not one of the two teams.
if (home_team not in team_ids or away_team not in team_ids
        or toss_winner not in (home_team, away_team)):
    print("Invalid input")
    sys.exit()

print('The cities are\n')

for key in city_ids:
    print(key)
city = input('Enter city\t')
if city not in city_ids:
    print("Invalid input")
    sys.exit()

home_id = team_ids[home_team]
away_id = team_ids[away_team]

# Head-to-head history, regardless of which side was listed as team1.
x_ = df[((df['team1'] == home_id) & (df['team2'] == away_id)) | ((df['team1'] == away_id) & (df['team2'] == home_id))]
if x_.empty:
    print("No previous matches between these teams")
    sys.exit()

x_train = x_[['team1','team2','toss_winner','city']]
y_train = x_['winner'].astype(int)

# LogisticRegression cannot be fitted when the history holds a single
# outcome; in that case the only observed winner is the prediction.
if y_train.nunique() < 2:
    print('\nProbable Winner is ', winner_names[int(y_train.iloc[0])])
    print("Only one outcome in the head-to-head history; no model was fitted")
    sys.exit()

logreg = LogisticRegression()
logreg.fit(x_train, y_train)

data = { 'team1' : home_id , 'team2' : away_id, 'toss_winner' : team_ids[toss_winner], 'city' : city_ids[city] }
a = pd.DataFrame(data, index=[0])
result = logreg.predict(a)

print('\nProbable Winner is ', winner_names[int(result[0])])

# Accuracy on the rows the model was trained on, i.e. an optimistic figure.
print('Logistic Regression score: {}'.format(logreg.score(x_train,y_train)))

## Predict the winner of a particular match using Naive Bayes Classifier

> The input to the model will be the home team, the away team, the toss winner and the city the match is played in.
> This model uses sklearn.naive_bayes's GaussianNB class for classification and predicts the outcome.
> The model also outputs the accuracy of the output predicted result.

In [None]:
import sys  # imported here so the cell does not depend on another cell's import

from sklearn.naive_bayes import GaussianNB

# Abbreviation -> integer code. The same team codes are used for the team1,
# team2, toss_winner and winner columns.
team_ids = encode['team1']
city_ids = encode['city']
# Reverse lookup for reporting; built from 'winner' so that 'Draw' is covered.
winner_names = {code: name for name, code in encode['winner'].items()}

home_team = input('Enter team1\t')
away_team = input('Enter team2\t')
toss_winner = input('Toss winner\t')

# Reject unknown abbreviations up front (they would otherwise surface as a
# KeyError further down) and a toss winner that is not one of the two teams.
if (home_team not in team_ids or away_team not in team_ids
        or toss_winner not in (home_team, away_team)):
    print("Invalid input")
    sys.exit()

print('The cities are\n')

for key in city_ids:
    print(key)
city = input('Enter city\t')
if city not in city_ids:
    print("Invalid input")
    sys.exit()

home_id = team_ids[home_team]
away_id = team_ids[away_team]

# Head-to-head history, regardless of which side was listed as team1.
x_ = df[((df['team1'] == home_id) & (df['team2'] == away_id)) | ((df['team1'] == away_id) & (df['team2'] == home_id))]
if x_.empty:
    print("No previous matches between these teams")
    sys.exit()

x_train = x_[['team1','team2','toss_winner','city']]
y_train = x_['winner'].astype(int)

# Stored under its own name so the logistic-regression model in `logreg` is
# not overwritten by a naive Bayes estimator.
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

data = { 'team1' : home_id , 'team2' : away_id, 'toss_winner' : team_ids[toss_winner], 'city' : city_ids[city] }
a = pd.DataFrame(data, index=[0])
result = nb_model.predict(a)

print('\nProbable Winner is ', winner_names[int(result[0])])

# Accuracy on the rows the model was trained on, i.e. an optimistic figure.
print('Naive Bayes Classifier score: {}'.format(nb_model.score(x_train,y_train)))

## Conclusion



> The logistic regression model has a relatively higher score for most of the data as compared to the decision tree regression model. 

> This model is not highly accurate: it can predict the outcome only with limited accuracy, because the dataset contains few features. Many match-influencing factors, such as the players taking part, the weather conditions and the form of the players, are missing from the dataset. 
So, the future scope of this project is to collect the data needed for a more accurate result and to train the model accordingly.