# MLR with Linear Regression

## Importing pandas and numpy libraries:

In [226]:
import pandas as pd
import numpy as np
# Setting pandas print option to print decimal values upto 4 decimal places.
np.set_printoptions(precision=4, linewidth=100)

## Loading data set:

In [227]:
ipl_auction_df = pd.read_csv("IPL2013.csv")
ipl_auction_df.head(5)

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,...,SR-B,SIXERS,RUNS-C,WKTS,AVE-BL,ECON,SR-BL,AUCTION YEAR,BASE PRICE,SOLD PRICE
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0,...,0.0,0,307,15,20.47,8.9,13.93,2009,50000,50000
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41,...,0.0,0,29,0,0.0,14.5,0.0,2008,50000,50000
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62,...,121.01,5,1059,29,36.52,8.81,24.9,2008,200000,350000
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56,...,76.32,0,1125,49,22.96,6.23,22.14,2011,100000,850000
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93,...,120.71,28,0,0,0.0,0.0,0.0,2011,100000,800000


In [228]:
ipl_auction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   PLAYER NAME    130 non-null    object 
 2   AGE            130 non-null    int64  
 3   COUNTRY        130 non-null    object 
 4   TEAM           130 non-null    object 
 5   PLAYING ROLE   130 non-null    object 
 6   T-RUNS         130 non-null    int64  
 7   T-WKTS         130 non-null    int64  
 8   ODI-RUNS-S     130 non-null    int64  
 9   ODI-SR-B       130 non-null    float64
 10  ODI-WKTS       130 non-null    int64  
 11  ODI-SR-BL      130 non-null    float64
 12  CAPTAINCY EXP  130 non-null    int64  
 13  RUNS-S         130 non-null    int64  
 14  HS             130 non-null    int64  
 15  AVE            130 non-null    float64
 16  SR-B           130 non-null    float64
 17  SIXERS         130 non-null    int64  
 18  RUNS-C    

In [229]:
ipl_auction_df.iloc[0:5, 0:10]

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93


In [230]:
# Note: T-RUNS = Test runs
X_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL']

- Let us try to address the problem of how to convert categorical variable into a number.

In [231]:
ipl_auction_df['PLAYING ROLE'].unique()

array(['Allrounder', 'Bowler', 'Batsman', 'W. Keeper'], dtype=object)

- Converting the different characters of categorical variables into some unit number is called **encoding**.
- Example:
    - Considering all characters of 'PLAYING ROLE' feature,
    - it is looking to be:  
        - Allrounder
        - Bowler
        - Batsman
        - W. Keeper
    - After performing encoding operation on playing role, the playing roles are encoded as follows:

In [232]:
pd.get_dummies(ipl_auction_df['PLAYING ROLE'])[0:5]

Unnamed: 0,Allrounder,Batsman,Bowler,W. Keeper
0,1,0,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,1,0,0


- get_dummies are used to convert categorical variable into a number.

In [233]:
categorical_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'CAPTAINCY EXP']

In [234]:
ipl_auction_encoded_df = pd.get_dummies(ipl_auction_df[X_features], columns = categorical_features, drop_first = True)
ipl_auction_encoded_df

Unnamed: 0,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,ODI-WKTS,ODI-SR-BL,RUNS-S,HS,AVE,SR-B,...,COUNTRY_NZ,COUNTRY_PAK,COUNTRY_SA,COUNTRY_SL,COUNTRY_WI,COUNTRY_ZIM,PLAYING ROLE_Batsman,PLAYING ROLE_Bowler,PLAYING ROLE_W. Keeper,CAPTAINCY EXP_1
0,0,0,0,0.00,0,0.0,0,0,0.00,0.00,...,0,0,1,0,0,0,0,0,0,0
1,214,18,657,71.41,185,37.6,0,0,0.00,0.00,...,0,0,0,0,0,0,0,1,0,0
2,571,58,1269,80.62,288,32.9,167,39,18.56,121.01,...,0,0,0,0,0,0,0,1,0,0
3,284,31,241,84.56,51,36.8,58,11,5.80,76.32,...,0,0,0,0,0,0,0,1,0,0
4,63,0,79,45.93,0,0.0,1317,71,32.93,120.71,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,0,0,0,0.00,0,0.0,49,16,9.80,125.64,...,0,0,0,0,0,0,1,0,0,0
126,6398,7,6814,75.78,3,86.6,3,3,3.00,42.85,...,0,1,0,0,0,0,1,0,0,1
127,1775,9,8051,87.58,109,44.3,1237,66,26.32,131.88,...,0,0,0,0,0,0,1,0,0,1
128,1114,288,790,73.55,278,35.4,99,23,9.90,91.67,...,0,0,0,0,0,0,0,1,0,0


In [235]:
X_features = ipl_auction_encoded_df.columns

In [236]:
import statsmodels.api as sm
X = sm.add_constant( ipl_auction_encoded_df )
Y = ipl_auction_df['SOLD PRICE']

In [237]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, Y, train_size=0.8, random_state=42)

## 1. Fit a linear regression model on the training set

## Train the model

In [238]:
from sklearn.linear_model import LinearRegression

In [239]:
model = LinearRegression().fit(train_X, train_y)
print(model)

LinearRegression()


In [240]:
pred_y = model.predict( test_X )
print(pred_y)

[ 631256. -350836. 1298126.  154516.  196808.   51522.  523352.  794306. 1398710. 1951888. 1428340.
 1330898. -469860.  994766.  965882. 2457054.  641544.   -6544. -229876.  799190.  384708. 1061282.
  507130.  245580.  -99250.  552470.]


## Calculate the R2 Score

In [241]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

3.3529130230294006

## Calculate Mean Squared Error:

In [242]:
import numpy 
np.sqrt(mean_squared_error(test_y, pred_y))

769994.3004817034

## 2. Fit a Decision Tree regression model on the training set

In [243]:
# Train the model
from sklearn.tree import DecisionTreeRegressor

# Fit a linear regression model on the training set
model = DecisionTreeRegressor().fit(train_X, train_y)
print(model)

DecisionTreeRegressor()


## Evaluate the Trained Model by predicting on validation set


In [244]:
pred_y = model.predict(test_X)
pred_y

array([ 850000.,  100000.,  300000.,  800000.,  675000.,  175000.,  875000.,  875000.,  950000.,
       1550000.,  850000., 1600000.,  450000.,  675000.,  300000.,  100000.,  700000.,  450000.,
        625000.,  175000.,  525000., 1600000.,  850000.,  400000.,  125000.,  450000.])

In [245]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

0.7851615530817271

- so the model only explains 78.5% of the variance in the validation set.

## 3. MLR using Ensemble algorithm with bagging (Random forest)

## Fitting a RandomForestRegressor model as ensemble algorithm on the training set

In [246]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor().fit(train_X, train_y)
print(model)



RandomForestRegressor()


In [247]:
pred_y = model.predict( test_X )
print(pred_y)

[ 574950.  263180.  902700.  586250.  336550.  405150.  500200.  542700.  939950. 1238750.  564350.
  903700.  525640.  556340.  779250.  212750.  688250.  541300.  982000.  476700.  728000. 1001250.
  624700.  358100.  345290.  333200.]


In [248]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

0.20651372751344

- so the model only explains 20.6% of the variance in the validation set.

## 4. MLR using Ensemble algorithm with boosting

## Gradient boosting

In [249]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, Y, train_size = 0.8, random_state=100)


## Train the model

In [250]:
from sklearn.ensemble import GradientBoostingRegressor

## Fit a GradientBoostingRegressor algorithm model on the training set
## TRy : ADAboost.XGboost

In [251]:
model = GradientBoostingRegressor().fit(train_X, train_y)
print(model)

GradientBoostingRegressor()


In [252]:
pred_y = model.predict(test_X)
pred_y

array([ 323879.9144,  609524.8714,  560259.0613,  464672.7573,  159024.1761,  223987.0265,
       1117041.9484,  496577.3794,  693636.8779,  835741.9335,  804918.6461, 1075632.5027,
        657252.1519,   71851.7457,  324263.1909,  629739.2822,  412240.7384,  350080.556 ,
        646369.1492,  553912.4162,  865049.6315,  881841.1094,  341551.5318, 1112198.6528,
       1122416.4131,  299298.8122])

In [253]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))


0.3863212664183344

- so the model only explains 38.6% of the variance in the validation set.