In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Import Transformed Dataset

In [2]:
# Relative paths to the transformed train / test CSV files; they are resolved
# against the kernel's working directory.
filepath_train, filepath_test = (
    "S1_Part2_Pipeline_Train_DS.csv",
    "S1_Part2_Pipeline_Test_DS.csv",
)

In [3]:
# Read both splits with identical options: the first CSV row supplies the
# column names and the first CSV column becomes the row index.
d_train, d_test = (
    pd.read_csv(csv_path, index_col=0, header=0)
    for csv_path in (filepath_train, filepath_test)
)

In [4]:
# Preview the first three rows of the training frame.
d_train.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-122.7,39.14,13.0,532.0,111.0,214.0,62.0,3.3929,0.0,1.0,0.0,0.0,0.0,108300.0
1,-120.83,35.32,11.0,3252.0,701.0,1814.0,660.0,3.2226,0.0,0.0,0.0,0.0,1.0,183200.0
2,-122.31,40.49,18.0,4026.0,718.0,1731.0,705.0,3.35,0.0,1.0,0.0,0.0,0.0,118400.0


In [5]:
# Preview the first three rows of the test frame.
d_test.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-118.1,33.81,36.0,1111.0,184.0,444.0,177.0,3.7031,1.0,0.0,0.0,0.0,0.0,245300.0
1,-122.27,37.82,52.0,1630.0,456.0,1162.0,400.0,1.2475,0.0,0.0,0.0,1.0,0.0,104200.0
2,-119.12,35.37,13.0,4527.0,713.0,2170.0,671.0,4.8266,0.0,1.0,0.0,0.0,0.0,146200.0


## STEP 2 : Separate Features and Target for the Train / Test Sets

In [6]:
# Separate each frame into its feature matrix (every column except the target)
# and the target column itself.
X_train, y_train = (
    d_train.drop(columns=['median_house_value']),
    d_train['median_house_value'],
)
X_test, y_test = (
    d_test.drop(columns=['median_house_value']),
    d_test['median_house_value'],
)

In [7]:
# Sanity check: the training features and target should have the same row count.
X_train.shape , y_train.shape

((16512, 13), (16512,))

In [8]:
# Sanity check: the test features and target should have the same row count.
X_test.shape , y_test.shape

((4128, 13), (4128,))

## STEP 3 : Training models

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.neighbors import KNeighborsRegressor

In [10]:
from sklearn.metrics import mean_squared_error,r2_score

### Various Algorithms for Regression Task : 

- Linear Regression, Polynomial Regression, Ridge Regression, Lasso Regression, Elastic Net Regression
- Support Vector Regression (SVR)
- K-Nearest Neighbors Regression
- Decision Tree Regression
- Random Forest Regression
- Gradient Boosting Regression

In [11]:
# Candidate regressors, all with library-default hyperparameters.
# The tree-based estimators are randomised, so they are given a fixed seed;
# without it their scores change on every Restart & Run All.
models = [
    LinearRegression(),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
]

In [12]:
import warnings
# Installs a catch-all 'ignore' filter: every warning raised after this cell
# runs is suppressed for the rest of the kernel session.
# NOTE(review): presumably added to keep the training report readable, but it
# also hides deprecation and numerical warnings from the estimators. Consider
# narrowing it to a specific category or wrapping only the noisy call in
# `warnings.catch_warnings()`.
warnings.filterwarnings('ignore')

In [13]:
def _print_split_report(title, y_true, y_pred):
    """Print MSE, RMSE and R2 for one data split under a bold ``title`` header."""
    # The MSE is computed once and reused for the RMSE line.
    mse = mean_squared_error(y_true, y_pred)

    print(f"\33[1m{title} Report :\33[0m \n")
    print("Mean Squared Error : ", round(mse, 2))
    print("RMSE : ", round(np.sqrt(mse), 2))
    print("R2 Square : ", r2_score(y_true, y_pred))
    print()


def ModelTraining(ML_models , X_train ,y_train ,X_test,y_test):
    """Fit every model on the training split and print train / test metrics.

    For each estimator the function fits on the training data, predicts on
    both splits, and prints the mean squared error, its square root (RMSE)
    and the R2 score, first for the training split and then for the test
    split.

    Parameters
    ----------
    ML_models : iterable
        Estimator objects exposing ``fit(X, y)`` and ``predict(X)``. Each one
        is fitted in place.
    X_train, X_test : array-like
        Feature matrices (e.g. DataFrame, ndarray or nested list). They are
        converted to plain arrays before being passed to the estimators.
    y_train, y_test : array-like
        Target values for the corresponding feature matrix.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Convert once, outside the loop. `np.asarray` yields the same array that
    # `.values` gives for a DataFrame, but also accepts ndarrays and lists.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    for model in ML_models:
        model.fit(train_features, y_train)
        y_hat_train = model.predict(train_features)
        y_hat_test = model.predict(test_features)

        # "\33[1m" / "\33[0m" are the ANSI codes that switch bold on and off.
        print(f"******************* \33[1m{str(model)}\33[0m *******************\n")

        _print_split_report("Training", y_train, y_hat_train)
        _print_split_report("Testing", y_test, y_hat_test)

In [14]:
# Fit every candidate model and print its training / testing report.
ModelTraining(
    ML_models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

******************* [1mLinearRegression()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  4775955255.43
RMSE :  69108.29
R2 Square :  0.6402411709241336

[1mTesting Report :[0m 

Mean Squared Error :  4506155081.56
RMSE :  67127.9
R2 Square :  0.6655659597845212

******************* [1mSVR()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  13969690049.44
RMSE :  118193.44
R2 Square :  -0.05229615144043831

[1mTesting Report :[0m 

Mean Squared Error :  14090324114.3
RMSE :  118702.67
R2 Square :  -0.04574386282800047

******************* [1mKNeighborsRegressor()[0m *******************

[1mTraining Report :[0m 

Mean Squared Error :  6479280187.55
RMSE :  80493.98
R2 Square :  0.5119346541451519

[1mTesting Report :[0m 

Mean Squared Error :  10226267558.05
RMSE :  101125.01
R2 Square :  0.24103544732367232

******************* [1mDecisionTreeRegressor()[0m *******************

[1mTraining Report :[0m 

Mean Squared Er

In [15]:
# Summary statistics of the test target (count, mean, std, quartiles, min/max).
# NOTE(review): presumably shown so the RMSE values can be judged against the
# scale and spread of median_house_value.
y_test.describe()

count      4128.000000
mean     205656.130814
std      116091.504411
min       22500.000000
25%      118175.000000
50%      177750.000000
75%      262500.000000
max      500001.000000
Name: median_house_value, dtype: float64