## **Author - Nirmal Maheta**
Here I have trained all regression models to know which model should we select for our Dataset except Simple Linear Regression cause the dataset has multiple independent Variables.

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Data.csv')

In [3]:
dataset.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


here PE is the dependent variable while rest are features.


In [4]:
dataset.isnull().sum() #Hence there are no null values and no further data preprocessing required.

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

## Splitting the dataset into the Training set and Test set

In [5]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# **1.Training the Multiple Linear Regression model**

In [7]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train , y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting test set results.

In [8]:
y_pred = lr.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[431.43 431.23]
 [458.56 460.01]
 [462.75 461.14]
 ...
 [469.52 473.26]
 [442.42 438.  ]
 [461.88 463.28]]


## Evaluating Multiple Linear Regression model

In [9]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9325315554761302

# **2.Training the Polynomial Regression model on the training set.**

In [10]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=4)
X_poly= poly.fit_transform(X_train)
lr2 = LinearRegression()
lr2.fit(X_poly , y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [11]:
y_pred = lr2.predict(poly.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[433.94 431.23]
 [457.9  460.01]
 [460.52 461.14]
 ...
 [469.53 473.26]
 [438.27 438.  ]
 [461.66 463.28]]


## Evaluating Polynomial Linear Regression model

In [12]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9458192606428238

# **3.Support Vector Regression (SVR)**

In [13]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [14]:
y = y.reshape(len(y),1)

## Splitting the dataset into the Training set and Test set

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

## Training the SVR model on the Training set

In [17]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Predicting the Test set results

In [18]:
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[434.05 431.23]
 [457.94 460.01]
 [461.03 461.14]
 ...
 [470.6  473.26]
 [439.42 438.  ]
 [460.92 463.28]]


## Evaluating the SVR Model Performance

In [19]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9480784049986258

# **4.Decision Tree Regression**

In [20]:
X = dataset.iloc[:, :-1].values # I have performed this step again because svr model which is build above would have changed the value cause we applied standardization.
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Decision Tree Regression model on the Training set

In [22]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

## Predicting the Test set results

In [23]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[431.28 431.23]
 [462.81 460.01]
 [460.06 461.14]
 ...
 [471.46 473.26]
 [437.76 438.  ]
 [462.55 463.28]]


## Evaluating the Decision Tree Regression Model Performance

In [24]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9226091050550043

# **5.Random Forest Regression**

## Training the Random Forest Regression model on the whole dataset

In [25]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 10 , random_state = 0)
rfr.fit(X_train,y_train) 

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

## Predicting the Test set results

In [26]:
y_pred = rfr.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[433.78 431.23]
 [457.99 460.01]
 [463.14 461.14]
 ...
 [470.16 473.26]
 [439.51 438.  ]
 [460.32 463.28]]


## Evaluating the Random Forest Regression Model Performance

In [27]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9615980699813017

# **Comparing all the regression models.**

In [28]:
Dictionary = {'Multiple Linear Regression Model' : 0.93 , 'Polynomial Regression Model' : 0.94 , 'Support Vector Regression Model': 0.94  , 'Decision Tree Regression Model' : 0.92 , 'Random Forest Regression Model' : 0.96}

In [29]:
df = pd.DataFrame(list(Dictionary.items()),columns = ['Model','R2 Score'])

In [30]:
df

Unnamed: 0,Model,R2 Score
0,Multiple Linear Regression Model,0.93
1,Polynomial Regression Model,0.94
2,Support Vector Regression Model,0.94
3,Decision Tree Regression Model,0.92
4,Random Forest Regression Model,0.96


# CONCLUSIONS :

1. Among all the regression model trained above, Random Forest Regression Model gives best R2-Score of 0.96.So it's best model for current Dataset.

2. Feature Scaling is to be applied only on SVR Model Because in Multiple , Simple and Polynomial Regression, the coeffients of features already takes care of that.
Decision trees and ensemble methods(Random Forest Regression) do not require feature scaling to be performed on them as they are not sensitive to the the variance in the data.