<a href="https://colab.research.google.com/github/SRIKARREDDY-dotorg/Hand-Written-hindi-script-recognition/blob/master/Major_Project_POCHANA_SRIKAR_REDDY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

  ### Import the necessary libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns

### Read the ipl csv data from the drive, and view the head of dataset 

In [2]:
df = pd.read_csv("/content/drive/MyDrive/FYP/ipl2017.csv")
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


### Dataset has some rows and columns

In [3]:
df.shape

(76014, 15)

### No null values are present in dataset

In [4]:
df.isna().sum()

mid               0
date              0
venue             0
bat_team          0
bowl_team         0
batsman           0
bowler            0
runs              0
wickets           0
overs             0
runs_last_5       0
wickets_last_5    0
striker           0
non-striker       0
total             0
dtype: int64

In [5]:
df.columns

Index(['mid', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman', 'bowler',
       'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker',
       'non-striker', 'total'],
      dtype='object')

### Date is unnecessary for the prediction so removed from dataset

In [6]:
df.drop(['date'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,mid,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


### There is a duplicate category i.e., Rising Pune supergaints has been classified as two teams due to spell mistake.

In [8]:
df['bat_team'].value_counts()

Mumbai Indians                 10213
Kings XI Punjab                 9219
Chennai Super Kings             9142
Royal Challengers Bangalore     8331
Kolkata Knight Riders           8033
Delhi Daredevils                7152
Rajasthan Royals                6643
Deccan Chargers                 5280
Sunrisers Hyderabad             5216
Pune Warriors                   2448
Gujarat Lions                   1726
Rising Pune Supergiant           994
Kochi Tuskers Kerala             876
Rising Pune Supergiants          741
Name: bat_team, dtype: int64

In [9]:
df['bowl_team'].value_counts()

Delhi Daredevils               10245
Royal Challengers Bangalore     9661
Mumbai Indians                  9271
Kolkata Knight Riders           9211
Kings XI Punjab                 8405
Rajasthan Royals                7773
Chennai Super Kings             6497
Deccan Chargers                 3984
Sunrisers Hyderabad             3589
Pune Warriors                   3083
Gujarat Lions                   1970
Rising Pune Supergiant           962
Kochi Tuskers Kerala             736
Rising Pune Supergiants          627
Name: bowl_team, dtype: int64

### Replace the above mentioned team to form a single team

In [10]:
df['bowl_team']=df['bowl_team'].replace('Rising Pune Supergiant','Rising Pune Supergiants')

In [11]:

df['bat_team']=df['bat_team'].replace('Rising Pune Supergiant','Rising Pune Supergiants')

### All the features should be numeric but there are categorical features

In [12]:
df.dtypes

mid                 int64
venue              object
bat_team           object
bowl_team          object
batsman            object
bowler             object
runs                int64
wickets             int64
overs             float64
runs_last_5         int64
wickets_last_5      int64
striker             int64
non-striker         int64
total               int64
dtype: object

### There is duplicate category for the single stadium., Punjab cricket Association stadium

In [13]:
df['venue'].value_counts()

M Chinnaswamy Stadium                                   7443
Feroz Shah Kotla                                        7068
Eden Gardens                                            7049
Wankhede Stadium                                        7048
MA Chidambaram Stadium, Chepauk                         5972
Rajiv Gandhi International Stadium, Uppal               5827
Punjab Cricket Association Stadium, Mohali              4247
Sawai Mansingh Stadium                                  4110
Dr DY Patil Sports Academy                              2088
Subrata Roy Sahara Stadium                              2086
Maharashtra Cricket Association Stadium                 1843
Kingsmead                                               1731
Sardar Patel Stadium, Motera                            1484
Brabourne Stadium                                       1380
SuperSport Park                                         1377
Punjab Cricket Association IS Bindra Stadium, Mohali    1342
Saurashtra Cricket Assoc

### Therefore the duplicate stadium has been replaced with same stadium

In [14]:
df['venue'] = df['venue'].replace('Punjab Cricket Association IS Bindra Stadium, Mohali','Punjab Cricket Association Stadium, Mohali')

In [15]:
df['venue'].value_counts()

M Chinnaswamy Stadium                                  7443
Feroz Shah Kotla                                       7068
Eden Gardens                                           7049
Wankhede Stadium                                       7048
MA Chidambaram Stadium, Chepauk                        5972
Rajiv Gandhi International Stadium, Uppal              5827
Punjab Cricket Association Stadium, Mohali             5589
Sawai Mansingh Stadium                                 4110
Dr DY Patil Sports Academy                             2088
Subrata Roy Sahara Stadium                             2086
Maharashtra Cricket Association Stadium                1843
Kingsmead                                              1731
Sardar Patel Stadium, Motera                           1484
Brabourne Stadium                                      1380
SuperSport Park                                        1377
Saurashtra Cricket Association Stadium                 1229
Himachal Pradesh Cricket Association Sta

### Dataset is in Dataframe type

In [16]:
type(df)

pandas.core.frame.DataFrame

In [17]:
df.head()

Unnamed: 0,mid,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


### Performing onehot encoding, since it is a regressor one-hot encoding is applied

In [18]:
Data = pd.get_dummies(columns=['venue','bat_team','bowl_team','batsman','bowler'],drop_first=False,data = df)

In [19]:
Data.head()

Unnamed: 0,mid,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total,venue_Barabati Stadium,venue_Brabourne Stadium,venue_Buffalo Park,venue_De Beers Diamond Oval,venue_Dr DY Patil Sports Academy,venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,venue_Dubai International Cricket Stadium,venue_Eden Gardens,venue_Feroz Shah Kotla,venue_Green Park,venue_Himachal Pradesh Cricket Association Stadium,venue_Holkar Cricket Stadium,venue_JSCA International Stadium Complex,venue_Kingsmead,venue_M Chinnaswamy Stadium,"venue_MA Chidambaram Stadium, Chepauk",venue_Maharashtra Cricket Association Stadium,venue_Nehru Stadium,venue_New Wanderers Stadium,venue_Newlands,venue_OUTsurance Oval,"venue_Punjab Cricket Association Stadium, Mohali","venue_Rajiv Gandhi International Stadium, Uppal","venue_Sardar Patel Stadium, Motera",venue_Saurashtra Cricket Association Stadium,venue_Sawai Mansingh Stadium,venue_Shaheed Veer Narayan Singh International Stadium,venue_Sharjah Cricket Stadium,venue_Sheikh Zayed Stadium,venue_St George's Park,venue_Subrata Roy Sahara Stadium,...,bowler_Shivam Sharma,bowler_Shoaib Ahmed,bowler_Shoaib Malik,bowler_Sohail Tanvir,bowler_Sunny Gupta,bowler_Swapnil Singh,bowler_T Henderson,bowler_T Natarajan,bowler_T Shamsi,bowler_T Thushara,bowler_TA Boult,bowler_TG Southee,bowler_TL Suman,bowler_TM Dilshan,bowler_TM Head,bowler_TP Sudhindra,bowler_TS Mills,bowler_UT Yadav,bowler_Umar Gul,bowler_V Kohli,bowler_V Pratap Singh,bowler_V Sehwag,bowler_V Shankar,bowler_VR Aaron,bowler_VRV Singh,bowler_VS Malik,bowler_VS Yeligati,bowler_VY Mahesh,bowler_WA Mota,bowler_WD Parnell,bowler_WPUJC Vaas,bowler_Washington Sundar,bowler_Y Gnaneswara Rao,bowler_Y Nagar,bowler_Y Venugopal Rao,bowler_YA Abdulla,bowler_YK Pathan,bowler_YS Chahal,bowler_Yuvraj Singh,bowler_Z Khan
0,1,1,0,0.1,1,0,0,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0.2,1,0,0,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,2,0,0.2,2,0,0,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,2,0,0.3,2,0,0,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,2,0,0.4,2,0,0,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Seperate the features and target from the dataset

In [20]:
X = Data.drop('total',axis=1)
y = Data['total']

### Split the train and test dataset of 0.3 test_size

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

### Perform scaling using minmaxscaler

*   Train features has to be fit and transformed
*   Test features has to be transformed




In [22]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Randomregressor is called from sklearn models,with oob score is true

In [26]:
model = RandomForestRegressor(n_estimators=250,oob_score=True,verbose=1)

### Train the model with training dataset

In [27]:
model.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  9.0min finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=250, n_jobs=None, oob_score=True,
                      random_state=None, verbose=1, warm_start=False)

### Check the model performace with the R2-score on testdataset

In [28]:
model.score(X_test,y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    2.2s finished


0.9729209318187646

In [29]:
y_pred = model.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    2.1s finished


### Performance Measures

In [31]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score,r2_score
print("The Mean squared error of this model is: ",mean_squared_error(y_test,y_pred))
print("The Mean absolute error of this model is: ",mean_absolute_error(y_test,y_pred))
print("The variance score of this model is: ",explained_variance_score(y_test,y_pred))
print("The R2-score of this model is: ",r2_score(y_test,y_pred))

The Mean squared error of this model is:  22.838042893049767
The Mean absolute error of this model is:  2.6213768910326682
The variance score of this model is:  0.9729210072749195
The R2-score of this model is:  0.9729209318187645
