In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df=pd.read_csv("bengaluru_house_prices.csv")
df.head(10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
6,Super built-up Area,18-May,Old Airport Road,4 BHK,Jaades,2732,4.0,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4.0,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6.0,,370.0


In [4]:
df.shape

(13320, 9)

### Data Preprocessing and Cleaning

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [7]:
df.society.unique()

array(['Coomee ', 'Theanmp', nan, ..., 'SJovest', 'ThhtsV ', 'RSntsAp'],
      shape=(2689,), dtype=object)

In [8]:
df.area_type.unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [9]:
df.location.unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], shape=(1306,), dtype=object)

In [10]:
#Dropping Unessacery Columns
df.drop(['availability','society','area_type'],axis='columns',inplace=True)
df.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0
5,Whitefield,2 BHK,1170,2.0,1.0,38.0
6,Old Airport Road,4 BHK,2732,4.0,,204.0
7,Rajaji Nagar,4 BHK,3300,4.0,,600.0
8,Marathahalli,3 BHK,1310,3.0,1.0,63.25
9,Gandhi Bazar,6 Bedroom,1020,6.0,,370.0


### Handling NA values

In [11]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12710 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    12710 non-null  object 
 1   size        12710 non-null  object 
 2   total_sqft  12710 non-null  object 
 3   bath        12710 non-null  float64
 4   balcony     12710 non-null  float64
 5   price       12710 non-null  float64
dtypes: float64(3), object(3)
memory usage: 695.1+ KB


## Feature Engineering

In [12]:
#Size Column
df['size']=df['size'].apply(lambda x: int(str(x).split(' ')[0]))
df.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4,2600,5.0,3.0,120.0
2,Uttarahalli,3,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3,1521,3.0,1.0,95.0
4,Kothanur,2,1200,2.0,1.0,51.0
5,Whitefield,2,1170,2.0,1.0,38.0
8,Marathahalli,3,1310,3.0,1.0,63.25
10,Whitefield,3,1800,2.0,2.0,70.0
11,Whitefield,4,2785,5.0,3.0,295.0
12,7th Phase JP Nagar,2,1000,2.0,1.0,38.0


In [13]:
df['size'].unique()

array([ 2,  4,  3,  1,  6,  8,  7,  5, 11,  9, 27, 43, 14, 12, 10, 13])

In [14]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [15]:
df[~df['total_sqft'].apply(is_float)].count()

location      186
size          186
total_sqft    186
bath          186
balcony       186
price         186
dtype: int64

In [16]:
df[~df['total_sqft'].apply(is_float)].head(10)


Unnamed: 0,location,size,total_sqft,bath,balcony,price
30,Yelahanka,4,2100 - 2850,4.0,0.0,186.0
122,Hebbal,4,3067 - 8156,4.0,0.0,477.0
137,8th Phase JP Nagar,2,1042 - 1105,2.0,0.0,54.005
165,Sarjapur,2,1145 - 1340,2.0,0.0,43.49
188,KR Puram,2,1015 - 1540,2.0,0.0,56.8
410,Kengeri,1,34.46Sq. Meter,1.0,0.0,18.5
549,Hennur Road,2,1195 - 1440,2.0,0.0,63.77
661,Yelahanka,2,1120 - 1145,2.0,0.0,48.13
672,Bettahalsoor,4,3090 - 5002,4.0,0.0,445.0
772,Banashankari Stage VI,2,1160 - 1195,2.0,0.0,59.935


In [17]:
df[~df['total_sqft'].apply(is_float)].sample(10)


Unnamed: 0,location,size,total_sqft,bath,balcony,price
927,Thanisandra,2,1000 - 1285,2.0,0.0,43.415
13265,Hoodi,2,1133 - 1384,2.0,0.0,59.135
8724,Varthur Road,1,540 - 565,1.0,0.0,13.26
12161,Kanakapura,1,712 - 938,1.0,0.0,35.475
10569,Devanahalli,2,1230 - 1490,2.0,0.0,62.425
12652,Billamaranahalli,2,300Sq. Yards,2.0,2.0,150.0
9183,Hormavu,2,943 - 1220,2.0,0.0,38.665
1484,Hebbal,2,547.34 - 827.31,2.0,0.0,42.72
5887,Rajaji Nagar,4,2563 - 2733,5.0,0.0,251.5
12955,Thanisandra,3,1437 - 1629,3.0,0.0,75.885


In [18]:
def transform1(x):
    x=str(x)
    try:
        if is_float(x)==True:
            x=float(x)
        if 'Sq. Yards' in x :
            x=float(x.replace('Sq. Yards',""))*9.0
        if 'Sq. Meter' in x :
            x=float(x.replace('Sq. Meter',""))*10.7639
        if 'Acres' in x :
            x=float(x.replace('Acres',""))*43560
        if 'Grounds' in x :
            x=float(x.replace('Grounds',""))*2400
        if 'Cents' in x :
            x=float(x.replace('Cents',""))*435.56
        if 'Guntha' in x :
            x=float(x.replace('Guntha',""))*1089
        if '-' in x:
            lst=x.split()
            x=float(lst[0])+float(lst[2])
        return x
    except:
        return x

In [19]:
df['total_sqft']=df['total_sqft'].apply(transform1)
df.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0
2,Uttarahalli,3,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,95.0
4,Kothanur,2,1200.0,2.0,1.0,51.0
5,Whitefield,2,1170.0,2.0,1.0,38.0
8,Marathahalli,3,1310.0,3.0,1.0,63.25
10,Whitefield,3,1800.0,2.0,2.0,70.0
11,Whitefield,4,2785.0,5.0,3.0,295.0
12,7th Phase JP Nagar,2,1000.0,2.0,1.0,38.0


In [20]:
df[~df['total_sqft'].apply(is_float)].count()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [21]:
df['Price_per_sqft']=df.price*100000/df.total_sqft
df.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,Price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,1.0,39.07,3699.810606
1,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0,4615.384615
2,Uttarahalli,3,1440.0,2.0,3.0,62.0,4305.555556
3,Lingadheeranahalli,3,1521.0,3.0,1.0,95.0,6245.890861
4,Kothanur,2,1200.0,2.0,1.0,51.0,4250.0
5,Whitefield,2,1170.0,2.0,1.0,38.0,3247.863248
8,Marathahalli,3,1310.0,3.0,1.0,63.25,4828.244275
10,Whitefield,3,1800.0,2.0,2.0,70.0,3888.888889
11,Whitefield,4,2785.0,5.0,3.0,295.0,10592.459605
12,7th Phase JP Nagar,2,1000.0,2.0,1.0,38.0,3800.0


In [22]:
len(df.location.unique())

1265

In [24]:
df.location=df.location.apply(lambda x:x.strip())
locs=df.groupby('location')['location'].agg('count').sort_values(ascending=False)
locs

location
Whitefield             515
Sarjapur  Road         372
Electronic City        302
Kanakpura Road         261
Thanisandra            234
                      ... 
Whietfield,              1
Whitefield ECC Road      1
Williams Town            1
Xavier Layout            1
Viviani Road             1
Name: location, Length: 1254, dtype: int64

In [25]:
len(locs[locs <= 10])

1017

In [26]:
locs_10=locs[locs <= 10]
locs_10

location
Ganga Nagar              10
1st Block Koramangala    10
Basapura                 10
Nagappa Reddy Layout     10
Naganathapura            10
                         ..
Whietfield,               1
Whitefield ECC Road       1
Williams Town             1
Xavier Layout             1
Viviani Road              1
Name: location, Length: 1017, dtype: int64

In [27]:
len(df.location.unique())

1254

In [28]:
df['location']=df['location'].apply(lambda x: 'Others' if x in locs_10 else x)
len(df.location.unique())

238

From Knowledge about homes and housing market, we can find outliers 
1. A BHK must have more than 300 sq foot area 

In [29]:
df[df['total_sqft']/df['size'] < 300].head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,Price_per_sqft
58,Murugeshpalya,6,1407.0,4.0,1.0,150.0,10660.98081
68,Devarachikkanahalli,8,1350.0,7.0,0.0,85.0,6296.296296
70,Others,3,500.0,3.0,2.0,100.0,20000.0
78,Kaval Byrasandra,2,460.0,1.0,0.0,22.0,4782.608696
89,Rajaji Nagar,6,710.0,6.0,3.0,160.0,22535.211268
119,Hennur Road,2,276.0,3.0,3.0,23.0,8333.333333
129,Vishwapriya Layout,7,950.0,7.0,0.0,115.0,12105.263158
170,Others,6,1300.0,6.0,0.0,99.0,7615.384615
176,Kumaraswami Layout,5,600.0,3.0,2.0,85.0,14166.666667
193,Others,7,1800.0,7.0,1.0,250.0,13888.888889


In [30]:
df[df['total_sqft']/df['size'] < 300].count()

location          659
size              659
total_sqft        659
bath              659
balcony           659
price             659
Price_per_sqft    659
dtype: int64

In [31]:
df.shape

(12710, 7)

In [32]:
df2=df[df['total_sqft']/df['size'] > 300]
df2.shape

(11885, 7)

In [33]:
df2.Price_per_sqft.describe()

count     11885.000000
mean       6083.190757
std        3912.375558
min           2.257423
25%        4137.492043
50%        5208.333333
75%        6736.068585
max      176470.588235
Name: Price_per_sqft, dtype: float64

In [36]:
def remove_outliers(df):
    df_new=pd.DataFrame()
    for key , subdf in df.groupby('location'):
        m=np.mean(subdf.Price_per_sqft)
        std=np.std(subdf.Price_per_sqft)
        out_df=subdf[(subdf.Price_per_sqft > (m-std)) & (subdf.Price_per_sqft < (m+std))]
        df_new=pd.concat([df_new,out_df],ignore_index=True)
    return df_new
df3=remove_outliers(df2)
df3.shape

(9692, 7)