In [3]:
import os 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error
from math import sqrt


In [4]:
#loading the data 
raw_data = pd.read_csv("C:\\Users\\raunak\\OneDrive\\Desktop\\jupyter project\\Housing.csv")
raw_data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [5]:
#checking for null values 
raw_data.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [6]:
raw_data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [7]:
# Making categorical variables into numeric representation

new_raw_data = pd.get_dummies(raw_data, columns = ['prefarea','mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning','furnishingstatus'])
new_raw_data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,prefarea_no,prefarea_yes,mainroad_no,mainroad_yes,...,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,False,True,False,True,...,False,True,False,True,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,True,...,False,True,False,True,False,False,True,True,False,False
2,12250000,9960,3,2,2,2,False,True,False,True,...,False,False,True,True,False,True,False,False,True,False
3,12215000,7500,4,2,2,3,False,True,False,True,...,False,False,True,True,False,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,False,False,True,...,True,False,True,True,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,True,False,False,True,...,False,False,True,True,False,True,False,False,False,True
541,1767150,2400,3,1,1,0,True,False,True,False,...,False,True,False,True,False,True,False,False,True,False
542,1750000,3620,2,1,1,0,True,False,False,True,...,False,True,False,True,False,True,False,False,False,True
543,1750000,2910,3,1,1,0,True,False,True,False,...,False,True,False,True,False,True,False,True,False,False


In [8]:
#feature importance (correlation matrix)
hm = new_raw_data[['price', 'area', 'bedrooms', 'bathrooms', 'stories']].corr()
hm

Unnamed: 0,price,area,bedrooms,bathrooms,stories
price,1.0,0.535997,0.366494,0.517545,0.420712
area,0.535997,1.0,0.151858,0.19382,0.083996
bedrooms,0.366494,0.151858,1.0,0.37393,0.408564
bathrooms,0.517545,0.19382,0.37393,1.0,0.326165
stories,0.420712,0.083996,0.408564,0.326165,1.0


In [9]:
# Split the data into X & y
x = new_raw_data.drop('price', axis = 1).values
y = new_raw_data['price']
print(x.shape)
print(y.shape)


(545, 20)
(545,)


In [10]:
dt = DecisionTreeClassifier(random_state=15, criterion  = 'entropy', max_depth = 10)
dt.fit(x,y)


In [11]:
# Running Feature Importance

for i, column in enumerate(new_raw_data.drop('price', axis = 1)):
    print('The feature importance for {} is: {:.3f}'.format(column, dt.feature_importances_[i]))

The feature importance for area is: 0.427
The feature importance for bedrooms is: 0.064
The feature importance for bathrooms is: 0.048
The feature importance for stories is: 0.075
The feature importance for parking is: 0.079
The feature importance for prefarea_no is: 0.014
The feature importance for prefarea_yes is: 0.018
The feature importance for mainroad_no is: 0.006
The feature importance for mainroad_yes is: 0.015
The feature importance for guestroom_no is: 0.023
The feature importance for guestroom_yes is: 0.010
The feature importance for basement_no is: 0.066
The feature importance for basement_yes is: 0.009
The feature importance for hotwaterheating_no is: 0.005
The feature importance for hotwaterheating_yes is: 0.002
The feature importance for airconditioning_no is: 0.012
The feature importance for airconditioning_yes is: 0.025
The feature importance for furnishingstatus_furnished is: 0.038
The feature importance for furnishingstatus_semi-furnished is: 0.054
The feature import

In [12]:
#Splitting the Raw Data - Hold-out validation
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.80, test_size = 0.2, random_state=15)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(436, 20)
(109, 20)
(436,)
(109,)


In [13]:
#linear regression model
lm = LinearRegression(fit_intercept = True)
lm.fit(x_train, y_train)


In [14]:
y_pred = lm.predict(x_train)

In [15]:
# Model Accuracy on training dataset
print('The Accuracy  on the training dataset is: ', lm.score(x_train, y_train) )

The Accuracy  on the training dataset is:  0.689698620524936


In [16]:
# Model Accuracy on testing dataset
print('The Accuracy  on the testing dataset is: ', lm.score(x_test, y_test) )


The Accuracy  on the testing dataset is:  0.6327202871541675


In [17]:
# Coefficients
print('Coefficients: ', lm.coef_ )


Coefficients:  [ 2.59728974e+02  1.39579178e+05  8.23295648e+05  5.04832842e+05
  2.18852821e+05 -3.16810047e+05  3.16810047e+05 -2.20542760e+05
  2.20542760e+05 -6.69201716e+04  6.69201716e+04 -1.98533933e+05
  1.98533933e+05 -4.10661021e+05  4.10661021e+05 -4.85835453e+05
  4.85835453e+05  1.50456849e+05  1.36284717e+05 -2.86741566e+05]


In [18]:
# The Intercept
print('Intercept: ', lm.intercept_)

Intercept:  1545218.6568504442


In [19]:
x2 = new_raw_data.drop('price', axis = 1)
x2.shape

(545, 20)

In [21]:
#Passing Coeff into a table
Coeff = lm.coef_
print(Coeff.shape) 


(20,)


In [23]:
Coeff = lm.coef_
Coeff_df = pd.DataFrame(Coeff.reshape(1, -1), columns=x2.columns)
Coeff_df

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,prefarea_no,prefarea_yes,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,259.728974,139579.178157,823295.648053,504832.842355,218852.820691,-316810.046857,316810.046857,-220542.760181,220542.760181,-66920.171626,66920.171626,-198533.932718,198533.932718,-410661.020906,410661.020906,-485835.453019,485835.453019,150456.848922,136284.717061,-286741.565983


In [24]:
x2

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,prefarea_no,prefarea_yes,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,7420,4,2,3,2,False,True,False,True,True,False,True,False,True,False,False,True,True,False,False
1,8960,4,4,4,3,True,False,False,True,True,False,True,False,True,False,False,True,True,False,False
2,9960,3,2,2,2,False,True,False,True,True,False,False,True,True,False,True,False,False,True,False
3,7500,4,2,2,3,False,True,False,True,True,False,False,True,True,False,False,True,True,False,False
4,7420,4,1,2,2,True,False,False,True,False,True,False,True,True,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,2,True,False,False,True,True,False,False,True,True,False,True,False,False,False,True
541,2400,3,1,1,0,True,False,True,False,True,False,True,False,True,False,True,False,False,True,False
542,3620,2,1,1,0,True,False,False,True,True,False,True,False,True,False,True,False,False,False,True
543,2910,3,1,1,0,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False


In [29]:
#prediction and the final model 
area = 200
bedrooms = 4
bathrooms = 2
stories = 3
parking = 2
pred1 = (lm.intercept_)+(Coeff_df['area'].values[0] * area)+(Coeff_df['bedrooms'].values[0] * bedrooms)+(Coeff_df['bathrooms'].values[0] * bathrooms)+(Coeff_df['stories'].values[0] * stories)+(Coeff_df['parking'].values[0] * parking) +(Coeff_df['prefarea_yes'].values[0] * 1)+(Coeff_df['mainroad_no'].values[0] * 1)+(Coeff_df['guestroom_no'].values[0] * 1)+(Coeff_df['basement_no'].values[0] * 1)+(Coeff_df['basement_yes'].values[0] * 1)+(Coeff_df['hotwaterheating_no'].values[0] * 1)+(Coeff_df['hotwaterheating_yes'].values[0] * 1)+(Coeff_df['airconditioning_no'].values[0] * 1)+(Coeff_df['airconditioning_yes'].values[0] * 1)+(Coeff_df['furnishingstatus_furnished'].values[0] * 1)+(Coeff_df['furnishingstatus_semi-furnished'].values[0] * 1)+(Coeff_df['furnishingstatus_unfurnished'].values[0] * 1)

In [30]:
print('The Price of the house is:', pred1)

The Price of the house is: 5783623.743785078
