# 1. SLR with Linear Regression

## Importing pandas and numpy libraries:

In [114]:
import pandas as pd
import numpy as np
# Setting pandas print option to print decimal values upto 4 decimal places.
np.set_printoptions(precision=4, linewidth=100)

## Loading data set:

In [115]:
mba_salary_df = pd.read_csv('MBA Salary.csv')
mba_salary_df.head(10)

Unnamed: 0,S. No.,Percentage in Grade 10,Salary
0,1,62.0,270000
1,2,76.33,200000
2,3,72.0,240000
3,4,60.0,250000
4,5,61.0,180000
5,6,55.0,300000
6,7,70.0,260000
7,8,68.0,235000
8,9,82.8,425000
9,10,59.0,240000


In [116]:
mba_salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   S. No.                  50 non-null     int64  
 1   Percentage in Grade 10  50 non-null     float64
 2   Salary                  50 non-null     int64  
dtypes: float64(1), int64(2)
memory usage: 1.3 KB


In [117]:
import statsmodels.api as sm
X = sm.add_constant( mba_salary_df['Percentage in Grade 10'] )
Y = mba_salary_df['Salary']
X.head(5)

Unnamed: 0,const,Percentage in Grade 10
0,1.0,62.0
1,1.0,76.33
2,1.0,72.0
3,1.0,60.0
4,1.0,61.0


## Splitting of data

In [118]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, Y, train_size = 0.8, random_state = 100)


## Building the model

In [119]:
# Train the model
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training set
model = LinearRegression().fit(train_X, train_y)
print(model)


LinearRegression()


## Evaluate the Trained Model by predicting on validation set


In [120]:
pred_y = model.predict(test_X)
pred_y

array([279828.4025, 272707.2277, 215737.8296, 237101.3539, 295851.0457, 247070.9985, 226419.5917,
       308313.1015, 254904.2908, 295494.9869])

In [121]:
test_y

6     260000
36    177600
37    236000
28    360000
43    250000
49    300000
5     300000
33    330000
20    120000
42    300000
Name: Salary, dtype: int64

In [122]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

0.156645849742304

- so the model only explains 15.6% of the variance in the validation set.

In [123]:
import numpy 
np.sqrt(mean_squared_error(test_y, pred_y))

73458.04348346895

## 2. SLR using Decision tree

In [124]:
# Train the model
from sklearn.tree import DecisionTreeRegressor

# Fit a linear regression model on the training set
model = DecisionTreeRegressor().fit(train_X, train_y)
print(model)

DecisionTreeRegressor()


## Evaluate the Trained Model by predicting on validation set


In [125]:
pred_y = model.predict(test_X)
pred_y

array([231000.    , 238333.3333, 160000.    , 250000.    , 218000.    , 180000.    , 300000.    ,
       252000.    , 270000.    , 218000.    ])

In [126]:
test_y

6     260000
36    177600
37    236000
28    360000
43    250000
49    300000
5     300000
33    330000
20    120000
42    300000
Name: Salary, dtype: int64

- This should result in improvement of R2 score:

In [127]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

0.5676966322534405

- so the model only explains 56.7% of the variance in the validation set.

# 3. SLR using Ensemble algorithm with bagging (Random forest)

## Fitting a RandomForestRegressor model as ensemble algorithm on the training set

In [128]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor().fit(train_X, train_y)
print(model)



RandomForestRegressor()


In [129]:
pred_y = model.predict( test_X )
print(pred_y)

[251833.3333 243513.5714 164366.6667 234005.     235910.     196435.4762 285516.6667 257770.
 266700.     235910.    ]


In [130]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

0.44464580885529625

- so the model only explains 44.4% of the variance in the validation set.

## 4. SLR using Ensemble algorithm with boosting

## Gradient boosting

In [131]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, Y, train_size = 0.8, random_state=100)


## Train the model

In [132]:
from sklearn.ensemble import GradientBoostingRegressor

## Fit a GradientBoostingRegressor algorithm model on the training set
## TRy : ADAboost.XGboost

In [133]:
model = GradientBoostingRegressor().fit(train_X, train_y)
print(model)

GradientBoostingRegressor()


In [134]:
pred_y = model.predict(test_X)
pred_y

array([238033.7493, 240168.4107, 165742.8069, 235818.4884, 228541.8045, 194081.9988, 296015.3136,
       253750.2981, 259849.0594, 228541.8045])

In [135]:
from sklearn.metrics import r2_score, mean_squared_error
np.abs(r2_score(test_y, pred_y))

0.43458391343482794

- so the model only explains 43.4% of the variance in the validation set.