In [1]:
# imprting necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import pickle

In [2]:
data = pd.read_csv('bengaluru_house_prices.csv')

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
data.dtypes

area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object

In [7]:
data['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

In [8]:
data['availability'].value_counts()

Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64

In [9]:
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [10]:
data['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: size, dtype: int64

In [11]:
data['society'].value_counts()

GrrvaGr    80
PrarePa    76
Sryalan    59
Prtates    59
GMown E    56
           ..
Amionce     1
JaghtDe     1
Jauraht     1
Brity U     1
RSntsAp     1
Name: society, Length: 2688, dtype: int64

In [12]:
area_type = data['area_type'].values.reshape(-1,1)

In [13]:
# Encoding "area_type"

oe = OrdinalEncoder(categories=[['Super built-up  Area','Built-up  Area','Plot  Area','Carpet  Area']])
data['area_type']=oe.fit_transform(area_type)
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0.0,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2.0,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1.0,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0.0,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0.0,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [14]:
data['size'] = data['size'].fillna('2 BHK')

In [15]:
data['size'] = data['size'].str.split().str.get(0).astype(int)
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0.0,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,2.0,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0
2,1.0,Ready To Move,Uttarahalli,3,,1440,2.0,3.0,62.0
3,0.0,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0
4,0.0,Ready To Move,Kothanur,2,,1200,2.0,1.0,51.0


In [16]:
def preprocess_sqft(value):
    if '-' in value:
        start,end = value.split('-')
        start = float(start.strip())
        end = float(end.strip())
        return (start+end)/2
    else :
        try:
            return float(value)
        except ValueError:
            return None

In [17]:
data['total_sqft']=data['total_sqft'].apply(lambda x: preprocess_sqft(x))

In [18]:
data.dtypes

area_type       float64
availability     object
location         object
size              int32
society          object
total_sqft      float64
bath            float64
balcony         float64
price           float64
dtype: object

In [19]:
data['total_sqft'] = data['total_sqft'].fillna(data['bath'].mean())
data['bath'] = data['bath'].fillna(data['bath'].mean())
data['balcony'] = data['balcony'].fillna(data['balcony'].mean())

In [20]:
X = data.drop(columns=['availability','location','society','price'])
y = data['price']

In [21]:
# splitting dataset for training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [22]:
# Linear Regression model

lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [23]:
lr_pred = lr.predict(X_test)

In [24]:
print('Root mean squared value:',np.sqrt(mean_squared_error(lr_pred, y_test)))

Root mean squared value: 110.05293350500133


In [25]:
# Decision Tree Regressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

DecisionTreeRegressor()

In [26]:
dtr_pred = dtr.predict(X_test)

In [27]:
print('Root mean squared value:',np.sqrt(mean_squared_error(dtr_pred, y_test)))

Root mean squared value: 121.25087973469887


In [28]:
# SVR

svr = SVR()
svr.fit(X_train, y_train)

SVR()

In [29]:
svr_pred = svr.predict(X_test)

In [30]:
print('Root mean squared value:',np.sqrt(mean_squared_error(svr_pred, y_test)))

Root mean squared value: 126.82383555405417


In [31]:
# Loading model into pickle
pickle.dump(lr,open('model.pkl','wb'))