# Model Selection Process

## Importing the libraries

In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [24]:
# dataset = pd.read_csv('ENTER_THE_NAME_OF_YOUR_DATASET_HERE.csv')
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Training all Regression models on the Training set and measuring R2 values

## Linear Regression

In [26]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Getting Prediction
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Evaluating Model Performance
from sklearn.metrics import r2_score
Linear_R2 = r2_score(y_test, y_pred)
print("\nLinear reg R2 score:",r2_score(y_test, y_pred))

[[431.43 431.23]
 [458.56 460.01]
 [462.75 461.14]
 ...
 [469.52 473.26]
 [442.42 438.  ]
 [461.88 463.28]]

Linear reg R2 score: 0.9325315554761302


## Polynomial Regression

In [27]:
# import the PolynomialFeatures library from scikit-learn
from sklearn.preprocessing import PolynomialFeatures

# Creating our polynomial features and store them in X_poly
polyfeats = PolynomialFeatures(degree=4) #degree specifies the order 
# Store the polynomial features in X_poly
X_poly = polyfeats.fit_transform(X_train)


# import the linear regression library from scikit-learn
from sklearn.linear_model import LinearRegression

# creating lr model
Linregressor2 = LinearRegression() 

# fit the lr model on polynomial features X_poly not the actual X
Linregressor2.fit(X_poly, y_train)

# Getting Prediction
y_pred = Linregressor2.predict(polyfeats.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Evaluating Model Performance
from sklearn.metrics import r2_score
Poly_R2 = r2_score(y_test, y_pred)
print("\nPolynomial reg R2 score:",r2_score(y_test, y_pred))

[[433.94 431.23]
 [457.9  460.01]
 [460.52 461.14]
 ...
 [469.53 473.26]
 [438.27 438.  ]
 [461.66 463.28]]

Polynomial reg R2 score: 0.9458192606428147


## Support Vector Machine

*Note: SVR needs addition feature scaling process, in order to perform regression.

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling Process
from sklearn.preprocessing import StandardScaler
sc = StandardScaler() # for independent variables
sc2 = StandardScaler() # for dependent variable
X_train = sc.fit_transform(X_train) # Scale input values
y_train = y_train.reshape(len(y_train), 1)
y_train = sc2.fit_transform(y_train) # scale output values

In [29]:
#import support vector regression
from sklearn.svm import SVR 
#read more here: https://data-flair.training/blogs/svm-kernel-functions/
support_reg = SVR(kernel = 'rbf') 

#fit the regressor
support_reg.fit(X_train,y_train)

# Getting Prediction
y_pred = sc2.inverse_transform(support_reg.predict(sc.transform(X_test)))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Evaluating Model Performance
from sklearn.metrics import r2_score
SVR_R2 = r2_score(y_test, y_pred)
print("\nSupport vector reg R2 score:",r2_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)


[[434.05 431.23]
 [457.94 460.01]
 [461.03 461.14]
 ...
 [470.6  473.26]
 [439.42 438.  ]
 [460.92 463.28]]

Support vector reg R2 score: 0.9480784049986258


## Decision Tree

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

# Getting Prediction
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Evaluating Model Performance
from sklearn.metrics import r2_score
Dec_tree_R2 = r2_score(y_test, y_pred)
print("\nDecision Tree reg R2 score:",r2_score(y_test, y_pred))

[[431.28 431.23]
 [462.81 460.01]
 [460.06 461.14]
 ...
 [471.46 473.26]
 [437.76 438.  ]
 [462.55 463.28]]

Decision Tree reg R2 score: 0.9226091050550043


## Random Forest 

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X_train, y_train)

# Getting Prediction
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Evaluating Model Performance
from sklearn.metrics import r2_score
Random_forest_R2 = r2_score(y_test, y_pred)
print("\nRandom forest reg R2 score:",r2_score(y_test, y_pred))

[[434.44 431.23]
 [456.31 460.01]
 [463.72 461.14]
 ...
 [469.58 473.26]
 [439.76 438.  ]
 [461.6  463.28]]

Random forest reg R2 score: 0.9611835323992642


# So, Which Model to select? Let's see...

In [32]:
print("Linear reg R2 score:",Linear_R2)
print("Polynomial reg R2 score:",Poly_R2)
print("Support vector reg R2 score:",SVR_R2)
print("Decision Tree reg R2 score:",Dec_tree_R2)
print("Random forest reg R2 score:",Random_forest_R2)

Linear reg R2 score: 0.9325315554761302
Polynomial reg R2 score: 0.9458192606428147
Support vector reg R2 score: 0.9480784049986258
Decision Tree reg R2 score: 0.9226091050550043
Random forest reg R2 score: 0.9611835323992642


Using random forest, we are getting the best r2 score, so after trying all of the models on this dataset, we can conclude that Random forest is the best model "for this dataset".