In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

import pandasql as psql

# pip install pandasql

In [2]:
# Load the Boston housing data

boston = pd.read_csv(r"D:\00 Datasets\Others\Data-04\Boston.csv", header=0)
boston.head()

Unnamed: 0,Rec_ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [3]:
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rec_ID   506 non-null    int64  
 1   crim     506 non-null    float64
 2   zn       506 non-null    float64
 3   indus    506 non-null    float64
 4   chas     506 non-null    int64  
 5   nox      506 non-null    float64
 6   rm       506 non-null    float64
 7   age      506 non-null    float64
 8   dis      506 non-null    float64
 9   rad      506 non-null    int64  
 10  tax      506 non-null    int64  
 11  ptratio  506 non-null    float64
 12  black    506 non-null    float64
 13  lstat    506 non-null    float64
 14  medv     506 non-null    float64
dtypes: float64(11), int64(4)
memory usage: 59.4 KB


In [4]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in boston.columns:
    if col != 'medv':
        IndepVar.append(col)

TargetVar = 'medv'

x = boston[IndepVar]
y = boston[TargetVar]

In [5]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_test_F1 = x_test.copy()

In [6]:
# Deleting the 'Rec_ID' variable which is not influence on target variable

del x_train['Rec_ID']
del x_test['Rec_ID']

In [8]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 354 entries, 5 to 102
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     354 non-null    float64
 1   zn       354 non-null    float64
 2   indus    354 non-null    float64
 3   chas     354 non-null    int64  
 4   nox      354 non-null    float64
 5   rm       354 non-null    float64
 6   age      354 non-null    float64
 7   dis      354 non-null    float64
 8   rad      354 non-null    int64  
 9   tax      354 non-null    int64  
 10  ptratio  354 non-null    float64
 11  black    354 non-null    float64
 12  lstat    354 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 38.7 KB


In [9]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 173 to 447
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     152 non-null    float64
 1   zn       152 non-null    float64
 2   indus    152 non-null    float64
 3   chas     152 non-null    int64  
 4   nox      152 non-null    float64
 5   rm       152 non-null    float64
 6   age      152 non-null    float64
 7   dis      152 non-null    float64
 8   rad      152 non-null    int64  
 9   tax      152 non-null    int64  
 10  ptratio  152 non-null    float64
 11  black    152 non-null    float64
 12  lstat    152 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 16.6 KB


In [10]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

# Multi-Regression Algorithm

In [11]:
# Train the algorithm and build the model with train dataset

from sklearn.linear_model import LinearRegression

bostonREG = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=None, positive=False)  

bostonREG.fit(x_train, y_train) 

LinearRegression()

In [12]:
# Predict the model with test dataset

y_pred = bostonREG.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred), 3) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 3.412
Mean Squared Error (MSE): 22.537
Root Mean Squared Error (RMSE): 4.747
Mean Absolute Percentage Error (MAPE): 18.099999999999998 %
R2_score: 0.697546
Adj R Square:  0.688922


# Decision Treee Regressor

In [13]:
# Build the Decision Tree Regressor model

from sklearn.tree import DecisionTreeRegressor

bostonDTR = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, 
                                  min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, 
                                  random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
                                  min_impurity_split=None, ccp_alpha=0.0)

bostonDTR.fit(x_train, y_train)

DecisionTreeRegressor()

In [14]:
# Predict the model with test dataset

y_pred1 = bostonDTR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred1),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred1),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred1)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred1),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred1),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred1),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 3.264
Mean Squared Error (MSE): 24.101
Root Mean Squared Error (RMSE): 4.909
Mean Absolute Percentage Error (MAPE): 17.560000000000002 %
R2_score: 0.676554
Adj R Square:  0.667332


In [15]:
# Creating dataframe with actual vs predicted

Pred_Output2 = pd.DataFrame({'medv':y_test, 'medv_Pred':y_pred1})

# Merge two Dataframes on index of both the dataframes

ResultsFinal2 = x_test_F1.merge(Pred_Output2, left_index=True, right_index=True)
ResultsFinal2.head()

Unnamed: 0,Rec_ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,medv_Pred
173,174,0.09178,0.0,4.05,0,0.51,6.416,84.1,2.6463,5,296,16.6,395.5,9.04,23.6,27.5
274,275,0.05644,40.0,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53,32.4,35.1
491,492,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6,13.3
72,73,0.09164,0.0,10.81,0,0.413,6.065,7.8,5.2873,4,305,19.2,390.91,5.52,22.8,24.7
452,453,5.09017,0.0,18.1,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27,16.1,19.4


# Random Forest Regressor

In [16]:
# build a random forest regression model with optimized hyper parameters

from sklearn.ensemble import RandomForestRegressor

bostonRFR = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2,
                                  min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                                  max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                  bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
                                  verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

bostonRFR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred2 = bostonRFR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred2),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred2),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred2)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred2),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred2),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred2),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2.371
Mean Squared Error (MSE): 10.383
Root Mean Squared Error (RMSE): 3.222
Mean Absolute Percentage Error (MAPE): 12.57 %
R2_score: 0.860657
Adj R Square:  0.856684


# SVR - Gaussian kernel

In [17]:
# Build the SVR model

from sklearn.svm import SVR

bostonSVR = SVR(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001,
                epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=- 1)

bostonSVR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred3 = bostonSVR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred3),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred3),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred3)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred3),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred3),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred3),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 3.235
Mean Squared Error (MSE): 27.347
Root Mean Squared Error (RMSE): 5.229
Mean Absolute Percentage Error (MAPE): 15.58 %
R2_score: 0.632984
Adj R Square:  0.622519


# KNN Regressior

In [18]:
# Build the algorithm with KNN

from sklearn import neighbors

bostonKNN = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto',
                                          leaf_size=30, p=2, metric='minkowski',
                                          metric_params=None, n_jobs=None)
bostonKNN.fit(x_train, y_train)

# Predict the model with test dataset

y_pred4 = bostonKNN.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred4),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred4),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred4)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred4),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred4),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred4),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2.919
Mean Squared Error (MSE): 21.029
Root Mean Squared Error (RMSE): 4.586
Mean Absolute Percentage Error (MAPE): 13.96 %
R2_score: 0.717785
Adj R Square:  0.709738


# Hyperparameter Tuning for RF Regressor

In [19]:
# build a random forest regression model with optimized hyper parameters

from sklearn.ensemble import RandomForestRegressor

bostonRFR = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2,
                                  min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                                  max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                  bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
                                  verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

bostonRFR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred2 = bostonRFR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred2),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred2),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred2)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred2),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred2),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred2),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2.419
Mean Squared Error (MSE): 10.35
Root Mean Squared Error (RMSE): 3.217
Mean Absolute Percentage Error (MAPE): 12.85 %
R2_score: 0.861101
Adj R Square:  0.857141


In [20]:
# Implementation of random forest regression model using GridSearchCV 

from sklearn.model_selection import GridSearchCV

param_grid_RFR = {'bootstrap': [True], 'max_depth': [5, 10, None], 'max_features': ['auto', 'log2'],
              'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'criterion':['mae']}

# build a random forest regression model in this case and initialize the GridSearchCV

from sklearn.ensemble import RandomForestRegressor

bostonRFR = RandomForestRegressor(random_state = 1)

g_search = GridSearchCV(estimator = bostonRFR, param_grid = param_grid_RFR, cv = 3,
                        n_jobs = 1, verbose = 0, return_train_score=True)

In [21]:
g_search.fit(x_train, y_train);

print(g_search.best_params_)

{'bootstrap': True, 'criterion': 'mae', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 300}


In [22]:
# build a random forest regression model with optimized hyper parameters
# {'bootstrap': True, 'criterion': 'mae', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 300}

from sklearn.ensemble import RandomForestRegressor

bostonRFR = RandomForestRegressor(n_estimators=300, criterion='mae', max_depth=None, min_samples_split=2,
                                  min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='log2',
                                  max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                  bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
                                  verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

bostonRFR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred2 = bostonRFR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred2),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred2),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred2)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred2),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred2),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred2),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2.348
Mean Squared Error (MSE): 10.699
Root Mean Squared Error (RMSE): 3.271
Mean Absolute Percentage Error (MAPE): 12.1 %
R2_score: 0.85641
Adj R Square:  0.852316


In [23]:
# Implementation of Model using RandomizedSearchCV 

import numpy as np

from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100 , stop = 1000, num = 10)] # returns 10 numbers 

max_features = ['auto', 'log2']

max_depth = [int(x) for x in np.linspace(3, 10, num = 8)] 

max_depth.append(None)

bootstrap = [True, False]

r_grid = {'n_estimators': n_estimators, 'max_features': max_features,
          'max_depth': max_depth, 'bootstrap': bootstrap}

print(r_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'log2'], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, None], 'bootstrap': [True, False]}


In [24]:
# Hyperparameter Tuning for Random Forest modl by using RandomizedSearchCV

from sklearn.ensemble import RandomForestRegressor

bostonRFR = RandomForestRegressor(random_state = 1)

bostonRFR_R = RandomizedSearchCV(estimator=bostonRFR, param_distributions=r_grid,
                                 n_iter = 20, scoring='neg_mean_absolute_error',
                                 cv = 3, verbose=2, random_state=42, n_jobs=-1,
                                 return_train_score=True)

bostonRFR_R.fit(x_train, y_train);

print(bostonRFR_R.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'n_estimators': 1000, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': False}


In [26]:
# Build a random forest regression model with optimized hyper parameters
# {'n_estimators': 1000, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': False}

from sklearn.ensemble import RandomForestRegressor

bostonRFR = RandomForestRegressor(n_estimators=1000, criterion='mae', max_depth=10, min_samples_split=2,
                                  min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='log2',
                                  max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                  bootstrap=False, oob_score=False, n_jobs=None, random_state=None,
                                  verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

bostonRFR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred2 = bostonRFR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred2),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred2),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred2)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred2),4) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred2),6))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred2),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2.29
Mean Squared Error (MSE): 9.699
Root Mean Squared Error (RMSE): 3.114
Mean Absolute Percentage Error (MAPE): 11.78 %
R2_score: 0.869837
Adj R Square:  0.866126


In [None]:
# print(bostonRFR.score(x_test,y_test))
# print(best_grid.score(x_test, y_test))
# print(best_random.score(x_test , y_test))

In [None]:
# What Is Hyperparameter Tuning?
# Hyperparameter tuning is the process of tuning the parameters present as the tuples while we build machine learning models. 
# These parameters are defined by us which can be manipulated according to programmer wish. Machine learning algorithms never 
# learn these parameters. These are tuned so that we could get good performance by the model. Hyperparameter tuning aims to 
# find such parameters where the performance of the model is highest or where the model performance is best and the error rate 
# is least.

In [None]:
# GridSearchCV - We define the combinations in "grid search" and do training of the model
# GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None,
#             verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)

In [None]:
# RandomizedSearchCV - The model selects the combinations randomly 
#RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None,
#                   n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
#                   random_state=None, error_score=nan, return_train_score=False)