### Problem Statement
* Predict the Price of houses based on available Dataset

- Linear Regression Model will be used since we are predicting a continuos output value

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (LinearRegression, LassoCV, RidgeCV)
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score)

In [2]:
# Load Dataset
df = pd.read_csv("../DATASET/house_price_data.csv")
df.head()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
0,3,2.0,150.0,1672.0,3,8971.291866
1,3,3.0,149.0,1750.0,3,8514.285714
2,3,2.0,150.0,1750.0,3,8571.428571
3,2,2.0,40.0,1250.0,2,3200.0
4,2,2.0,83.0,1200.0,2,6916.666667


##### Perform Exploratory Data Analysis

In [3]:
# Check dimension of dataset
df.shape

(7120, 6)

In [4]:
# check columns
df.columns

Index(['bath', 'balcony', 'price', 'total_sqft_int', 'bhk', 'price_per_sqft'], dtype='object')

In [5]:
# Check summary info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7120 entries, 0 to 7119
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bath            7120 non-null   int64  
 1   balcony         7120 non-null   float64
 2   price           7120 non-null   float64
 3   total_sqft_int  7120 non-null   float64
 4   bhk             7120 non-null   int64  
 5   price_per_sqft  7120 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 333.9 KB


In [6]:
# Check summary statistics
df.describe()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
count,7120.0,7120.0,7120.0,7120.0,7120.0,7120.0
mean,2.390871,1.572759,96.454991,1479.729806,2.465169,5923.806855
std,0.876822,0.770583,116.185034,913.779769,0.84152,2556.650935
min,1.0,0.0,10.0,350.0,1.0,1250.0
25%,2.0,1.0,49.23,1100.0,2.0,4416.761042
50%,2.0,2.0,69.0,1255.0,2.0,5417.855612
75%,3.0,2.0,104.0,1640.25,3.0,6618.285651
max,9.0,3.0,2912.0,30400.0,9.0,35000.0


In [7]:
# Check null values
df.isna().any()

bath              False
balcony           False
price             False
total_sqft_int    False
bhk               False
price_per_sqft    False
dtype: bool

In [8]:
# Check for duplicated values or row
df.loc[df.duplicated(subset=['bath', 'balcony', 'price', 'total_sqft_int', 'bhk', 'price_per_sqft'])]

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
58,3,2.0,90.00,1480.0,3,6081.081081
69,2,2.0,47.25,1350.0,3,3500.000000
72,2,2.0,47.25,1350.0,3,3500.000000
73,2,1.0,40.25,1150.0,2,3500.000000
111,2,1.0,62.80,1256.0,2,5000.000000
...,...,...,...,...,...,...
7073,2,2.0,45.60,1200.0,2,3800.000000
7075,2,2.0,44.08,1160.0,2,3800.000000
7107,3,3.0,140.00,1855.0,3,7547.169811
7118,2,1.0,55.00,1000.0,2,5500.000000


In [9]:
# drop duplicates
new_df = df.drop_duplicates(keep='first')
new_df.head()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
0,3,2.0,150.0,1672.0,3,8971.291866
1,3,3.0,149.0,1750.0,3,8514.285714
2,3,2.0,150.0,1750.0,3,8571.428571
3,2,2.0,40.0,1250.0,2,3200.0
4,2,2.0,83.0,1200.0,2,6916.666667


In [10]:
# Check for duplicate rows on new dataframe
new_df.duplicated(subset=['bath', 'balcony', 'price', 'total_sqft_int', 'bhk', 'price_per_sqft'])

0       False
1       False
2       False
3       False
4       False
        ...  
7113    False
7114    False
7115    False
7116    False
7117    False
Length: 6282, dtype: bool

In [11]:
# check summaary statistics
new_df.describe()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
count,6282.0,6282.0,6282.0,6282.0,6282.0,6282.0
mean,2.422159,1.577981,100.468874,1510.115153,2.496975,6036.373686
std,0.906067,0.786136,122.3535,959.170042,0.867467,2644.838512
min,1.0,0.0,10.0,350.0,1.0,1250.0
25%,2.0,1.0,50.0,1100.0,2.0,4491.202321
50%,2.0,2.0,70.0,1280.0,2.0,5500.0
75%,3.0,2.0,110.0,1680.0,3.0,6696.384953
max,9.0,3.0,2912.0,30400.0,9.0,35000.0


In [12]:
# Check correlation
new_df.corr().T

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
bath,1.0,0.272286,0.517204,0.64655,0.882972,0.33781
balcony,0.272286,1.0,0.132035,0.190504,0.25913,0.060735
price,0.517204,0.132035,1.0,0.795581,0.490886,0.715079
total_sqft_int,0.64655,0.190504,0.795581,1.0,0.640613,0.367155
bhk,0.882972,0.25913,0.490886,0.640613,1.0,0.309464
price_per_sqft,0.33781,0.060735,0.715079,0.367155,0.309464,1.0


#### Model Training

In [13]:
# Separate dataset
x = new_df.drop(columns=['price'], axis=1)
y = new_df['price']

In [14]:
# Split Dataset
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [15]:
# feature scaling
st = StandardScaler()
x_train = st.fit_transform(x_train)
x_test = st.fit_transform(x_test)

In [16]:
# train model
regressor = LinearRegression()
regressor.fit(x_train,y_train)

In [17]:
# print coefficient and intercept
print("Intercept: ",regressor.intercept_)
print("Coefficient: ",regressor.coef_)

Intercept:  101.69266268656715
Coefficient:  [-1.65302160e+00 -6.27136933e-02  8.70992146e+01 -1.19574495e+01
  6.46652453e+01]


In [18]:
# predict with trained model
y_pred = regressor.predict(x_test)

In [19]:
# Compare Actual vs Predicted Value
pred_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
pred_df.head()

Unnamed: 0,Actual,Predicted
3443,110.0,185.489253
4775,65.0,55.409725
3763,35.0,-9.608763
6992,175.0,214.796746
3015,45.0,18.495092


#### Model Evaluation

In [20]:
# Model evaluation based on testing data
t_mean_absolute_err = mean_absolute_error(y_test,y_pred)
t_mean_squared_err = mean_squared_error(y_test,y_pred)
t_root_mean_sqr_err = np.sqrt(mean_squared_error(y_test,y_pred))
t_r_sqrd_score = r2_score(y_test,y_pred)

print("Mean Absolute Error: ",t_mean_absolute_err)
print("Mean Squared Error: ",t_mean_squared_err)
print("Root Mean Squared Error: ",t_root_mean_sqr_err)
print("R Squared Score: ",t_r_sqrd_score)

Mean Absolute Error:  27.605609611157806
Mean Squared Error:  1738.740995922244
Root Mean Squared Error:  41.6982133420875
R Squared Score:  0.773472404694659


In [21]:
# Model Evaluation based on training data
train_pred = regressor.predict(x_train)

x_mean_absolute_err = mean_absolute_error(y_train,train_pred)
x_mean_squared_err = mean_squared_error(y_train,train_pred)
x_root_mean_sqr_err = np.sqrt(mean_squared_error(y_train,train_pred))
x_r_sqrd_score = r2_score(y_train,train_pred)

print("Mean Absolute Error: ",x_mean_absolute_err)
print("Mean Squared Error: ",x_mean_squared_err)
print("Root Mean Squared Error: ",x_root_mean_sqr_err)
print("R Squared Score: ",x_r_sqrd_score)

Mean Absolute Error:  20.393584322755856
Mean Squared Error:  2680.191956629029
Root Mean Squared Error:  51.77057037187275
R Squared Score:  0.8403192671576583


In [22]:
# Variance = (Bais of Trianing) - (Bais of Testing)
variance = x_root_mean_sqr_err - t_root_mean_sqr_err
print("Variance: ", variance)

Variance:  10.072357029785252


#### Perform Regularization using LassoCV and RidgeCV

- LassocV

In [23]:
# Implementing LassoCV

alphas = [0.01, 0.1, 1.0, 10]
lasso = LassoCV(alphas=alphas)

In [24]:
# train model
lasso.fit(x_train,y_train)

In [28]:
# Predict with model
lasso_pred = lasso.predict(x_test)

In [29]:
# Model Evaluation Metrics
l_mean_abs_err = mean_absolute_error(y_test,lasso_pred)
l_mean_sqrd_error = mean_squared_error(y_test,lasso_pred)
l_root_mean_sqrd_err = np.sqrt(mean_squared_error(y_test,lasso_pred))
l_r_squared = r2_score(y_test,lasso_pred)b

print("Mean Abs Error :", l_mean_abs_err)
print("Mean Sqrd Error :", l_mean_sqrd_error)
print("Root Mean Sqrd Error :", l_root_mean_sqrd_err)
print("R Squared :", l_r_squared)

Mean Abs Error : 27.19143168167381
Mean Sqrd Error : 1637.0829258506017
Root Mean Sqrd Error : 40.46088142701048
R Squared : 0.786716676389361


In [31]:
# Model Evaluation metrics on Training Data

# Predict with model on training dataset
lasso_t_pred = lasso.predict(x_train)

lt_mean_abs_err = mean_absolute_error(y_train,lasso_t_pred)
lt_mean_sqrd_error = mean_squared_error(y_train,lasso_t_pred)
lt_root_mean_sqrd_err = np.sqrt(mean_squared_error(y_train,lasso_t_pred))
lt_r_squared = r2_score(y_train,lasso_t_pred)

print("Mean Abs Error :", lt_mean_abs_err)
print("Mean Sqrd Error :", lt_mean_sqrd_error)
print("Root Mean Sqrd Error :", lt_root_mean_sqrd_err)
print("R Squared :", lt_r_squared)

Mean Abs Error : 20.083617431810715
Mean Sqrd Error : 2687.1040209924267
Root Mean Sqrd Error : 51.83728408194653
R Squared : 0.8399074595256448


In [32]:
# Variance on LassoCV model
variance = lt_root_mean_sqrd_err - l_root_mean_sqrd_err
print("LassoCV Variance :", variance)

LassoCV Variance : 11.37640265493605


- RidgeCV

In [33]:
alphas = [0.01, 0.1, 1.0, 10]
ridge = RidgeCV(alphas=alphas)

# train model
ridge.fit(x_train,y_train)

In [35]:
# predict on testing data
ridge_predict = ridge.predict(x_test)

# Model Evaluation
ridge_t_mean_abs_err = mean_absolute_error(y_test,ridge_predict)
ridge_t_mean_sqrd_error = mean_squared_error(y_test,ridge_predict)
ridge_t_root_mean_sqrd_err = np.sqrt(mean_squared_error(y_test,ridge_predict))
ridge_t_r_squared = r2_score(y_test,ridge_predict)

print("Mean Abs Error :", ridge_t_mean_abs_err)
print("Mean Sqrd Error :", ridge_t_mean_sqrd_error)
print("Root Mean Sqrd Error :", ridge_t_root_mean_sqrd_err)
print("R Squared :", ridge_t_r_squared)

Mean Abs Error : 27.52777162987131
Mean Sqrd Error : 1724.8041376035492
Root Mean Sqrd Error : 41.53076134148698
R Squared : 0.7752881340117047


In [36]:
# predict on training data
ridge_train_predict = ridge.predict(x_train)

# model evaluation
ridge_lt_mean_abs_err = mean_absolute_error(y_train,ridge_train_predict)
ridge_lt_mean_sqrd_error = mean_squared_error(y_train,ridge_train_predict)
ridge_lt_root_mean_sqrd_err = np.sqrt(mean_squared_error(y_train,ridge_train_predict))
ridge_lt_r_squared = r2_score(y_train,ridge_train_predict)

print("Mean Abs Error :", ridge_lt_mean_abs_err)
print("Mean Sqrd Error :", ridge_lt_mean_sqrd_error)
print("Root Mean Sqrd Error :", ridge_lt_root_mean_sqrd_err)
print("R Squared :", ridge_lt_r_squared)

Mean Abs Error : 20.332209607580964
Mean Sqrd Error : 2680.2601629351207
Root Mean Sqrd Error : 51.77122910396392
R Squared : 0.8403152035558278


In [37]:
# Variance on RidgeCV model
variance = ridge_lt_root_mean_sqrd_err - ridge_t_root_mean_sqrd_err
print("RidgeCV Variance :", variance)

RidgeCV Variance : 10.24046776247694
