In [30]:
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle as pc
import seaborn as sns
from sklearn.linear_model import LinearRegression


In [31]:
# Load the IPL dataset and discard the columns listed in `remove_columns`.
df = pd.read_csv('ipl.csv')
remove_columns = ['mid', 'batsman', 'striker', 'non-striker', 'bowler']
# `columns=` names the axis explicitly. The earlier `axis=True` only selected
# columns because True == 1 when pandas resolves the axis argument.
# Reassigning instead of `inplace=True` keeps the step a plain expression.
df = df.drop(columns=remove_columns)
# Keep only the rows where `overs` is at least 5.0.
df = df[df['overs'] >= 5.0]
df.head()

Unnamed: 0,date,venue,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
32,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,61,0,5.1,59,0,222
33,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.2,59,1,222
34,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.3,59,1,222
35,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.4,59,1,222
36,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.5,58,1,222


In [32]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56707 entries, 32 to 76013
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            56707 non-null  object 
 1   venue           56707 non-null  object 
 2   bat_team        56707 non-null  object 
 3   bowl_team       56707 non-null  object 
 4   runs            56707 non-null  int64  
 5   wickets         56707 non-null  int64  
 6   overs           56707 non-null  float64
 7   runs_last_5     56707 non-null  int64  
 8   wickets_last_5  56707 non-null  int64  
 9   total           56707 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 4.8+ MB


In [34]:
# Inspect the raw `date` column; a bare last expression uses the notebook's
# own display instead of going through print().
df['date']

32       2008-04-18
33       2008-04-18
34       2008-04-18
35       2008-04-18
36       2008-04-18
            ...    
76009    2017-05-21
76010    2017-05-21
76011    2017-05-21
76012    2017-05-21
76013    2017-05-21
Name: date, Length: 56707, dtype: object


In [35]:
from datetime import datetime
# Parse the `date` strings (YYYY-MM-DD) into datetime64 values.
# pd.to_datetime is vectorised and, unlike a row-wise datetime.strptime, it is
# safe to re-run: values that are already datetimes pass through unchanged
# instead of raising TypeError.
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
# List the distinct batting teams present in the data.
df['bat_team'].unique()


array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant'], dtype=object)

In [36]:
# Print the dtype of the `date` column after the conversion in the previous cell.
print(df['date'].dtype)

datetime64[ns]


In [37]:
# Teams to keep; a row survives only if BOTH its batting and bowling side are listed.
current_team = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]
batting_is_current = df['bat_team'].isin(current_team)
bowling_is_current = df['bowl_team'].isin(current_team)
df = df[batting_is_current & bowling_is_current]
# Show which batting teams remain after the filter.
df['bat_team'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
       'Delhi Daredevils', 'Sunrisers Hyderabad'], dtype=object)

In [38]:
# One-hot encode the two team columns; every other column of `df` is carried over as-is.
team_columns = ['bat_team', 'bowl_team']
x = pd.get_dummies(df, columns=team_columns)

In [39]:
# Fix the column set and order: match-state columns first, then the batting-team
# dummies, then the bowling-team dummies, and finally the `total` target.
state_columns = ['date', 'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5']
encoded_teams = [
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
]
bat_columns = ['bat_team_' + team for team in encoded_teams]
bowl_columns = ['bowl_team_' + team for team in encoded_teams]
x = x[state_columns + bat_columns + bowl_columns + ['total']]

In [40]:
# Feature matrix = every column except the `total` target.
# Rows dated 2016 or earlier form the training split; 2017 onwards the test split.
feature_frame = x.drop(columns='total')
X_train = feature_frame.loc[x['date'].dt.year <= 2016]
X_test = feature_frame.loc[x['date'].dt.year >= 2017]

In [41]:
# Target arrays, split on the same year boundary as the features.
train_rows = x['date'].dt.year <= 2016
test_rows = x['date'].dt.year >= 2017
y_train = x.loc[train_rows, 'total'].to_numpy()
y_test = x.loc[test_rows, 'total'].to_numpy()

In [42]:
# `date` was only needed to build the year-based split; remove it from the features.
# `columns=` states the axis explicitly (the earlier `axis=True` worked only
# because True == 1), and reassigning avoids mutating the frames in place.
X_train = X_train.drop(columns='date')
X_test = X_test.drop(columns='date')

In [49]:
# Fit a linear regression on the training split and predict the test split.
model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
predicted = model.predict(X_test)
p_df = pd.DataFrame(data=predicted)

# Disabled plot of actual vs. predicted totals.
# NOTE(review): recent seaborn releases expect x/y as keyword arguments - confirm before re-enabling.
# sns.regplot(x=y_test, y=predicted)

In [51]:
from sklearn.metrics import r2_score

In [52]:
# R^2 of the predictions against the held-out targets.
score = r2_score(y_test, predicted)
# End the cell with the value so the metric is actually shown to the reader.
score

In [53]:
from sklearn import metrics

In [54]:
# Mean squared error on the test split and its square root, computed once and reused.
mse = metrics.mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 251.00792310417265
RMSE: 15.84322956673205


In [57]:
# Wrap the test targets and the predictions in DataFrames.
df2 = pd.DataFrame(y_test)
df3 = pd.DataFrame(predicted)

# Serialise the fitted model to disk. The `with` block closes the file handle
# (flushing the pickle) even if dump() raises; the earlier bare open() call
# never closed it.
filename = 'IPL_Score_prediction.pkl'
with open(filename, 'wb') as model_file:
    pc.dump(model, model_file)