In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Import Transformed Dataset

In [2]:
# Relative paths of the transformed train / test CSV files read in the next cell.
filepath_train, filepath_test = (
    r"S3_Part3_pipeline_pipeline_Train_DS.csv",
    r"S3_Part3_pipeline_pipeline_Test_DS.csv",
)

In [3]:
# Read both CSV files the same way: first row is the header, first column becomes the row index.
d_train, d_test = (
    pd.read_csv(csv_path, index_col=0, header=0)
    for csv_path in (filepath_train, filepath_test)
)

In [4]:
# Preview the first three rows of the training frame.
d_train.head(3)

Unnamed: 0,crime_rate,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,dist,airport_NO,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_None,waterbody_River,price
0,1.750941,1.03528,1.019549,-0.074765,1.128863,-0.813975,1.194251,-0.019931,-0.356733,0.562485,-1.174428,1.0,0.0,0.0,1.0,0.0,0.0,10.2
1,0.283472,-0.161802,-0.077387,-0.218185,-0.552953,0.039554,-0.958566,0.209591,0.350918,0.562485,-0.56321,0.0,1.0,0.0,0.0,0.0,1.0,22.1
2,0.489127,1.251338,0.449488,0.947639,1.03622,1.794031,-1.399167,0.198726,0.03186,1.457145,-0.93483,0.0,1.0,0.0,0.0,1.0,0.0,41.3


In [5]:
# Preview the first three rows of the test frame.
d_test.head(3)

Unnamed: 0,crime_rate,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,dist,airport_NO,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_None,waterbody_River,price
0,-1.148833,-1.385161,-1.286608,1.417661,-1.069612,2.789815,-0.789224,1.533755,0.195398,0.318486,1.721522,1.0,0.0,1.0,0.0,0.0,0.0,34.6
1,0.009692,-0.701948,-0.422879,0.392417,-1.671787,0.513737,-1.731375,-0.409711,0.363912,0.643817,-0.203814,0.0,1.0,0.0,0.0,1.0,0.0,31.5
2,-1.062466,0.134549,0.173094,-0.221025,0.298645,-1.193321,-0.262952,-0.897276,0.003244,-1.552167,-0.734351,0.0,1.0,0.0,1.0,0.0,0.0,20.6


## STEP 2 : Separate Features (X) and Target (y) for the Train and Test Sets

In [6]:
# For each frame: features are every column except 'price', the target is 'price' itself.
X_train, y_train = d_train.drop(columns=['price']), d_train['price']
X_test, y_test = d_test.drop(columns=['price']), d_test['price']

In [7]:
# Display the shapes of the training features and the training target.
X_train.shape , y_train.shape

((404, 17), (404,))

In [8]:
# Display the shapes of the test features and the test target.
X_test.shape , y_test.shape

((102, 17), (102,))

## STEP 3 : Training models

In [9]:
from sklearn.neighbors import KNeighborsRegressor

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [11]:
from sklearn.metrics import mean_squared_error,r2_score

### Various Algorithms for Regression Task : 

- Linear Regression, Polynomial Regression, Ridge Regression, Lasso Regression, Elastic Net Regression
- Support Vector Regression (SVR)
- Decision Tree Regression
- Random Forest Regression
- Gradient Boosting Regression

In [12]:
# Candidate regressors, all with default hyper-parameters.
# The tree-based estimators are randomized, so they are seeded to make the
# reported scores reproducible across kernel restarts.
models = [
    LinearRegression(),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
]

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
def ModelTraining(ML_models , X_train ,y_train ,X_test,y_test):
    """Fit each model and print its MSE, RMSE and R2 on the train and test sets.

    Parameters
    ----------
    ML_models : iterable
        Estimator objects exposing ``fit(X, y)`` and ``predict(X)``.
        Each one is fitted in place.
    X_train, X_test : pandas.DataFrame or array-like
        Feature matrices. They are converted with ``np.asarray`` before being
        handed to the model, so DataFrames and plain arrays are both accepted.
    y_train, y_test : array-like
        Target values, passed unchanged to ``fit`` and to the metric functions.

    Returns
    -------
    None
        The report is printed only.
    """

    def _print_report(title, y_true, y_pred):
        # MSE is computed once and reused for the RMSE line.
        mse = mean_squared_error(y_true, y_pred)
        print(f"\33[1m{title} Report :\33[0m \n")
        print("Mean Squared Error : ",round(mse,2))
        print("RMSE : ",round(np.sqrt(mse),2))
        print("R2 Square : ",r2_score(y_true,y_pred))

    # Convert once, outside the loop; for a DataFrame this matches `.values`.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    for model in ML_models :

        model.fit(train_features, y_train)
        y_hat_train = model.predict(train_features)
        y_hat_test = model.predict(test_features)

        print(f"******************* \33[1m{str(model)}\33[0m *******************\n")

        _print_report("Training", y_train, y_hat_train)

        print()

        _print_report("Testing", y_test, y_hat_test)

        print()

In [15]:
# Fit and score every candidate regressor on the prepared train / test data.
ModelTraining(
    ML_models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

******************* [1mLinearRegression()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  18.34
RMSE :  4.28
R2 Square :  0.7735816921822523

[1mTesting Report :[0m 

Mean Squared Error :  21.34
RMSE :  4.62
R2 Square :  0.7788496168668546

******************* [1mSVR()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  25.22
RMSE :  5.02
R2 Square :  0.6885310921865826

[1mTesting Report :[0m 

Mean Squared Error :  30.35
RMSE :  5.51
R2 Square :  0.6854907633125435

******************* [1mKNeighborsRegressor()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  15.61
RMSE :  3.95
R2 Square :  0.8072790749977554

[1mTesting Report :[0m 

Mean Squared Error :  21.36
RMSE :  4.62
R2 Square :  0.7786733229670815

******************* [1mDecisionTreeRegressor()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  0.0
RMSE :  0.0
R2 Square :  1.0

[1mTesting Report :[0m 

Mean Squared 

In [17]:
# Summary statistics of the test-set target column ('price').
y_test.describe()

count    102.000000
mean      22.129412
std        9.871397
min        6.300000
25%       15.575000
50%       20.900000
75%       24.650000
max       50.000000
Name: price, dtype: float64

## HyperParameter Tuning

###  Choosing best models

- Random forest regressor

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Base estimator handed to the grid search. It is seeded so that the selected
# hyper-parameters and the reported scores are reproducible between runs.
random_forest = RandomForestRegressor(random_state=42)

In [25]:
# Search space for the random forest: one grid of tree depths x number of trees.
Param_grid = [
    dict(
        max_depth=[7, 9, 11, 13],
        n_estimators=[150, 175, 200, 250],
    )
]

#### Various Scoring Parameters for Regression

    'neg_mean_absolute_error'   :     metrics.mean_absolute_error

    'neg_mean_squared_error'    :     metrics.mean_squared_error

    'neg_root_mean_squared_error'  :  metrics.root_mean_squared_error  (metrics.mean_squared_error with squared=False in scikit-learn < 1.4)

    'neg_mean_squared_log_error'  :   metrics.mean_squared_log_error

    'neg_median_absolute_error'   :   metrics.median_absolute_error

    'r2'                          :   metrics.r2_score

In [21]:
def GridCV(Model , Param_Grid ,Scoring , CV ,X_train ,y_train ,X_test ,y_test):
    """Run a grid search on ``Model`` and print train/test metrics of the best estimator.

    Parameters
    ----------
    Model
        Estimator forwarded to ``GridSearchCV`` as ``estimator``.
    Param_Grid
        Forwarded to ``GridSearchCV`` as ``param_grid``.
    Scoring
        Forwarded to ``GridSearchCV`` as ``scoring``.
    CV
        Forwarded to ``GridSearchCV`` as ``cv``.
    X_train, X_test : pandas.DataFrame or array-like
        Feature matrices. They are converted with ``np.asarray`` before use,
        so DataFrames and plain arrays are both accepted.
    y_train, y_test : array-like
        Target values, passed unchanged to ``fit`` and to the metric functions.

    Returns
    -------
    None
        Best parameters, best score and the metric reports are printed only.
    """

    def _print_report(title, y_true, y_pred):
        # MSE is computed once and reused for the RMSE line.
        mse = mean_squared_error(y_true, y_pred)
        print(f"\33[1m{title} Report :\33[0m \n")
        print("Mean Squared Error : ",round(mse,2))
        print("RMSE : ",round(np.sqrt(mse),2))
        print("R2 Square : ",r2_score(y_true,y_pred))

    # Convert once; for a DataFrame this matches `.values`.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    # Creating object of GridSearchCV class
    grid_search = GridSearchCV(estimator = Model, param_grid =  Param_Grid,
                               scoring = Scoring , cv = CV)

    # Fitting to the object
    grid_search.fit(train_features , y_train)

    # Printing the Best parameters
    print("Grid Search CV Best Parameters : ",grid_search.best_params_)

    # Mean cross-validated score of the best_estimator
    print("Grid Search CV Best Score : ",grid_search.best_score_,"\n")

    # Best Fitted Model
    best_model = grid_search.best_estimator_

    # Prediction : Train and Test
    y_hat_GSCV_train = best_model.predict(train_features)
    y_hat_GSCV_test = best_model.predict(test_features)

    print("\33[1mResults after HyperParameter Tuning\33[0m \n")
    _print_report("Training", y_train, y_hat_GSCV_train)
    print("\n")

    _print_report("Testing", y_test, y_hat_GSCV_test)

In [26]:
# Tune the random forest with 3-fold cross-validation, ranking candidates by R2.
GridCV(
    Model=random_forest,
    Param_Grid=Param_grid,
    Scoring="r2",
    CV=3,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Grid Search CV Best Parameters :  {'max_depth': 11, 'n_estimators': 150}
Grid Search CV Best Score :  0.802177461187763 

[1mResults after HyperParameter Tuning[0m 

[1mTraining Report :[0m 

Mean Squared Error :  1.78
RMSE :  1.33
R2 Square :  0.9779954870423716


[1mTesting Report :[0m 

Mean Squared Error :  11.14
RMSE :  3.34
R2 Square :  0.8845372248713267
