In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Import Transformed Dataset

In [2]:
# Locations of the transformed train / test CSV files, relative to the
# notebook's working directory.
filepath_train, filepath_test = (
    r"S3_Part3_test_pipeline_Train_DS.csv",
    r"S3_Part3_test_pipeline_Test_DS.csv",
)

In [3]:
# Read both files with identical settings: the first row supplies the column
# names and the first column is used as the row index.
d_train, d_test = (
    pd.read_csv(csv_path, index_col=0, header=0)
    for csv_path in (filepath_train, filepath_test)
)

In [4]:
# Preview the first three rows of the training frame.
d_train.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-1.554442,1.636779,-1.240082,-0.970473,-1.064046,-0.249954,1.262319,-0.07829,0.031965,0.0,1.0,0.0,0.0,0.0,108300.0
1,-0.623047,-0.148644,-1.398615,0.278105,0.334906,-0.339735,-0.199874,0.040216,-0.029367,0.0,0.0,0.0,0.0,1.0,183200.0
2,-1.360194,2.267753,-0.84375,0.633399,0.262335,-0.272571,0.113653,-0.597804,-0.054939,0.0,1.0,0.0,0.0,0.0,118400.0


In [5]:
# Preview the first three rows of the test frame.
d_test.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,0.736691,-0.8544,0.583043,-0.704691,-0.862947,-0.086419,0.340263,-0.815927,-0.050302,1.0,0.0,0.0,0.0,0.0,245300.0
1,-1.340271,1.019826,1.851305,-0.466451,-0.235167,-1.380994,-0.540981,1.14067,-0.015714,0.0,0.0,0.0,1.0,0.0,104200.0
2,0.228657,-0.125275,-1.240082,0.863376,0.646173,0.505882,0.528296,-0.955074,0.012981,0.0,1.0,0.0,0.0,0.0,146200.0


## STEP 2 : Separate Features and Target (Train and Test Sets)

In [6]:
# For each split: every column except the target becomes the feature matrix,
# and the 'median_house_value' column becomes the target vector.
X_train, y_train = d_train.drop(columns=["median_house_value"]), d_train["median_house_value"]
X_test, y_test = d_test.drop(columns=["median_house_value"]), d_test["median_house_value"]

In [7]:
# Shapes of the training feature matrix and training target.
X_train.shape , y_train.shape

((16512, 14), (16512,))

In [8]:
# Shapes of the test feature matrix and test target.
X_test.shape , y_test.shape

((4128, 14), (4128,))

## STEP 3 : Training models

In [9]:
from sklearn.neighbors import KNeighborsRegressor

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [11]:
from sklearn.metrics import mean_squared_error,r2_score

### Various Algorithms for Regression Task : 

- Linear Regression, Polynomial Regression, Ridge Regression, Lasso Regression, Elastic Net Regression.
- Support Vector Regression (SVR),
- Decision Tree Regression, 
- Random Forest Regression, and 
- Gradient Boosting Regression.

In [12]:
# Baseline regressors, all with library-default hyperparameters.
# random_state is pinned on the two tree-based estimators because their
# fitting is randomised; without a seed the reported metrics change from one
# run of the notebook to the next.
models = [
    LinearRegression(),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
]

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
def ModelTraining(ML_models , X_train ,y_train ,X_test,y_test):
    """Fit each model and print its regression metrics on the train and test data.

    For every estimator in ``ML_models`` the function fits it on the training
    data, predicts on both splits, and prints MSE, RMSE and R2 for each split
    under a bold header containing ``str(model)``.

    Parameters
    ----------
    ML_models : iterable
        Estimator objects exposing ``fit(X, y)`` and ``predict(X)``. Each one
        is fitted in place.
    X_train, X_test : DataFrame or array-like
        Feature matrices. They are converted with ``np.asarray`` before being
        handed to the models, so DataFrames and plain arrays both work.
    y_train, y_test : array-like
        Target values matching ``X_train`` and ``X_test`` respectively.

    Returns
    -------
    None
        Results are only printed.
    """
    # Convert once, outside the loop. The models are given plain arrays
    # (not DataFrames), exactly as before; np.asarray additionally accepts
    # inputs that are already ndarrays.
    X_train_arr = np.asarray(X_train)
    X_test_arr = np.asarray(X_test)

    def _print_report(title, y_true, y_pred):
        # Shared by the train and test sections. MSE is computed once and
        # reused for the RMSE line.
        mse = mean_squared_error(y_true, y_pred)
        print(f"\33[1m{title} Report :\33[0m \n")
        print("Mean Squared Error : ", round(mse, 2))
        print("RMSE : ", round(np.sqrt(mse), 2))
        print("R2 Square : ", r2_score(y_true, y_pred))
        print()

    for model in ML_models:
        model.fit(X_train_arr, y_train)
        y_hat_train = model.predict(X_train_arr)
        y_hat_test = model.predict(X_test_arr)

        # \33[1m ... \33[0m are ANSI escape sequences that switch bold on/off.
        print(f"******************* \33[1m{model!s}\33[0m *******************\n")

        _print_report("Training", y_train, y_hat_train)
        _print_report("Testing", y_test, y_hat_test)

In [15]:
# Fit every estimator in `models` and print its train / test metrics.
ModelTraining(ML_models = models ,  X_train = X_train ,y_train = y_train ,
              X_test = X_test, y_test = y_test)

******************* [1mLinearRegression()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  4778382221.77
RMSE :  69125.84
R2 Square :  0.6400583546029388

[1mTesting Report :[0m 

Mean Squared Error :  4596215761.45
RMSE :  67795.4
R2 Square :  0.6588819117445954

******************* [1mSVR()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  13893902333.46
RMSE :  117872.4
R2 Square :  -0.046587283056869655

[1mTesting Report :[0m 

Mean Squared Error :  14011270917.79
RMSE :  118369.21
R2 Square :  -0.03987675896157272

******************* [1mKNeighborsRegressor()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  2417600642.17
RMSE :  49169.1
R2 Square :  0.8178891698761604

[1mTesting Report :[0m 

Mean Squared Error :  3659240718.47
RMSE :  60491.66
R2 Square :  0.7284215399934539

******************* [1mDecisionTreeRegressor()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error

In [18]:
# Summary statistics of the test-set target, e.g. to compare its spread
# (std) with the RMSE values reported for the models.
y_test.describe()

count      4128.000000
mean     205656.130814
std      116091.504411
min       22500.000000
25%      118175.000000
50%      177750.000000
75%      262500.000000
max      500001.000000
Name: median_house_value, dtype: float64

## HyperParameter Tuning

###  Choosing best models

- Random forest regressor

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
# Base estimator handed to the grid search. random_state is fixed so that the
# forest's randomised fitting is repeatable; otherwise cross-validation scores
# (and possibly the selected parameters) differ between runs.
random_forest = RandomForestRegressor(random_state=42)

In [41]:
# Hyperparameter search space: 3 tree depths x 3 ensemble sizes = 9 candidates.
Param_grid = [
    dict(
        max_depth=[10, 11, 12],
        n_estimators=[200, 250, 300],
    )
]

#### Various Scoring Parameters for Regression

    'neg_mean_absolute_error'   :     metrics.mean_absolute_error

    'neg_mean_squared_error'    :     metrics.mean_squared_error

    'neg_root_mean_squared_error'  :  metrics.root_mean_squared_error   (before scikit-learn 1.4: metrics.mean_squared_error with squared=False)

    'neg_mean_squared_log_error'  :   metrics.mean_squared_log_error

    'neg_median_absolute_error'   :   metrics.median_absolute_error

    'r2'                          :   metrics.r2_score

In [42]:
def GridCV(Model , Param_Grid ,Scoring , CV ,X_train ,y_train ,X_test ,y_test):
    """Tune ``Model`` with a grid search and print metrics of the best estimator.

    A ``GridSearchCV`` object is built from the arguments and fitted on the
    training data. The best parameters and best cross-validated score are
    printed, then ``best_estimator_`` predicts on both splits and MSE, RMSE
    and R2 are printed for each.

    Parameters
    ----------
    Model : estimator
        Passed to ``GridSearchCV`` as ``estimator``.
    Param_Grid : dict or list of dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    Scoring : str or callable
        Passed to ``GridSearchCV`` as ``scoring``.
    CV : int or cross-validation splitter
        Passed to ``GridSearchCV`` as ``cv``.
    X_train, X_test : DataFrame or array-like
        Feature matrices. They are converted with ``np.asarray`` before use,
        so DataFrames and plain arrays both work.
    y_train, y_test : array-like
        Target values matching ``X_train`` and ``X_test`` respectively.

    Returns
    -------
    None
        Results are only printed.
    """
    # Convert once; the search and the predictions all receive plain arrays,
    # exactly as before. np.asarray additionally accepts existing ndarrays.
    X_train_arr = np.asarray(X_train)
    X_test_arr = np.asarray(X_test)

    def _print_report(title, y_true, y_pred):
        # Shared by the train and test sections. MSE is computed once and
        # reused for the RMSE line.
        mse = mean_squared_error(y_true, y_pred)
        print(f"\33[1m{title} Report :\33[0m \n")
        print("Mean Squared Error : ", round(mse, 2))
        print("RMSE : ", round(np.sqrt(mse), 2))
        print("R2 Square : ", r2_score(y_true, y_pred))

    # Build the search object and fit it on the training data.
    grid_search = GridSearchCV(estimator=Model, param_grid=Param_Grid,
                               scoring=Scoring, cv=CV)
    grid_search.fit(X_train_arr, y_train)

    # Best parameter combination and its mean cross-validated score.
    print("Grid Search CV Best Parameters : ", grid_search.best_params_)
    print("Grid Search CV Best Score : ", grid_search.best_score_, "\n")

    # Best fitted model, used for the train and test predictions.
    best_model = grid_search.best_estimator_
    y_hat_GSCV_train = best_model.predict(X_train_arr)
    y_hat_GSCV_test = best_model.predict(X_test_arr)

    # \33[1m ... \33[0m are ANSI escape sequences that switch bold on/off.
    print("\33[1mResults after HyperParameter Tuning\33[0m \n")
    _print_report("Training", y_train, y_hat_GSCV_train)
    print("\n")
    _print_report("Testing", y_test, y_hat_GSCV_test)

In [43]:
# Grid-search the random forest over Param_grid, scoring candidates by R2
# with cv=4, then print the best estimator's train / test metrics.
GridCV(Model = random_forest , Param_Grid =Param_grid ,Scoring = "r2" , CV = 4 ,
       X_train = X_train ,y_train = y_train ,X_test = X_test ,y_test = y_test)

Grid Search CV Best Parameters :  {'max_depth': 12, 'n_estimators': 250}
Grid Search CV Best Score :  0.7960574152629797 

[1mResults after HyperParameter Tuning[0m 

[1mTraining Report :[0m 

Mean Squared Error :  1174843647.05
RMSE :  34275.99
R2 Square :  0.9115024425048256


[1mTesting Report :[0m 

Mean Squared Error :  2515148051.48
RMSE :  50151.25
R2 Square :  0.8133328504296471
