In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

In [2]:
df1 = pd.read_csv("pune_home_prices.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df1.shape

(13320, 9)

In [4]:
df1.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [5]:
df1.area_type.unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [6]:
df1.count()

area_type       13320
availability    13320
location        13319
size            13304
society          7818
total_sqft      13320
bath            13247
balcony         12711
price           13320
dtype: int64

In [7]:

df1['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

In [8]:

df1['balcony'].value_counts()

2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: balcony, dtype: int64

In [9]:
##Drop features that are not required to build our model

In [10]:
df2 = df1.drop(['area_type','society','balcony','availability'],axis=1)
df2.shape

(13320, 5)

In [11]:
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [12]:
#Data Cleaning: Handle NA values

In [13]:
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [14]:
df2[df2.location.isnull()]               ## this is null value in location that we need to drop

Unnamed: 0,location,size,total_sqft,bath,price
568,,3 BHK,1600,3.0,86.0


In [15]:
df2[df2['size'].isnull()]                   ## this is null value in size that we need to drop

Unnamed: 0,location,size,total_sqft,bath,price
579,Sarjapur Road,,1200 - 2400,,34.185
1775,IVC Road,,2000 - 5634,,124.0
2264,Banashankari,,2400,,460.0
2809,Sarjapur Road,,1200 - 2400,,28.785
2862,Devanahalli,,1500 - 2400,,46.8
5333,Devanahalli,,2100 - 5405,,177.115
6423,Whitefield,,2324,,26.73
6636,Jigani,,1500,,25.49
6719,Hoskote,,800 - 2660,,28.545
7680,Kasavanhalli,,5000,,400.0


In [16]:
df3 = df2.dropna()
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [17]:
df3.shape

(13246, 5)

In [18]:
#Feature Engineering
##Add new feature(integer) for bhk (Bedrooms Hall Kitchen)

In [19]:
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [21]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [22]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.bhk.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [27]:
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [25]:
df3[df3.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [None]:
##first need to explore the total_sqft value then we will takel bhk column

In [26]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [28]:
df3['total_sqft'].value_counts()

1200       843
1100       221
1500       204
2400       195
600        180
          ... 
2406         1
3025         1
2282         1
1915         1
1623.29      1
Name: total_sqft, Length: 2067, dtype: int64

In [29]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [30]:
df3[~df3['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [42]:
df3[~df3['total_sqft'].apply(lambda x : False if len(x)>4 else True)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
44,Kanakpura Road,2 BHK,1330.74,2.0,91.79,2
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
142,Kasavanhalli,3 BHK,1563.05,3.0,105.0,3
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
373,Gopalapura,3 BHK,2023.71,3.0,275.0,3
393,Electronics City Phase 1,2 BHK,1113.27,2.0,53.0,2
408,Rajaji Nagar,7 BHK,12000,6.0,2200.0,7


In [43]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [47]:
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.head(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3


In [48]:
df4.loc[30]

location      Yelahanka
size              4 BHK
total_sqft         2475
bath                  4
price               186
bhk                   4
Name: 30, dtype: object

In [59]:
df4.iloc[44]

location      Kanakpura Road
size                   2 BHK
total_sqft           1330.74
bath                       2
price                  91.79
bhk                        2
Name: 44, dtype: object

In [60]:
#Feature Engineering
##Add new feature called price per square feet

In [123]:
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']       ## convert the price into price per sqft
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [124]:
df5_stats = df5['price_per_sqft'].describe()
df5_stats

count    1.320000e+04
mean     7.920759e+03
std      1.067272e+05
min      2.678298e+02
25%      4.267701e+03
50%      5.438331e+03
75%      7.317073e+03
max      1.200000e+07
Name: price_per_sqft, dtype: float64

In [125]:
df5_stats.count()

8

In [126]:
##Examine locations which is a categorical variable. We need to apply dimensionality 
##reduction technique here to reduce number of locations

In [127]:
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [128]:
loc_remove_space=df5.location.apply(lambda x: x.strip())
loc_remove_space

0        Electronic City Phase II
1                Chikka Tirupathi
2                     Uttarahalli
3              Lingadheeranahalli
4                        Kothanur
                   ...           
13315                  Whitefield
13316               Richards Town
13317       Raja Rajeshwari Nagar
13318             Padmanabhanagar
13319                Doddathoguru
Name: location, Length: 13200, dtype: object

In [129]:
loc_val = []
for i in loc_remove_space.unique():
    loc_val.append(i)

In [130]:
print(len(loc_val))
print(df5.shape)
#print(loc_val)

1287
(13200, 7)


In [131]:
df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5['location'].value_counts(ascending=False)
location_stats

Whitefield           533
Sarjapur  Road       392
Electronic City      304
Kanakpura Road       264
Thanisandra          235
                    ... 
Adarsh Nagar           1
Kannur                 1
Craig Park Layout      1
Tilak Nagar            1
Geetanjali Layout      1
Name: location, Length: 1287, dtype: int64

In [140]:
location_stats1 = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats1

location
Whitefield              533
Sarjapur  Road          392
Electronic City         304
Kanakpura Road          264
Thanisandra             235
                       ... 
Kumbhena Agrahara         1
Kudlu Village,            1
Konappana Agrahara        1
Kodanda Reddy Layout      1
1 Annasandrapalya         1
Name: location, Length: 1287, dtype: int64

In [141]:
location_stats.values.sum()

13200

In [142]:
len(location_stats)

1287

In [143]:
len(location_stats[location_stats>10])

240

In [144]:
len(location_stats[location_stats<=10])

1047

In [145]:
#Dimensionality Reduction
#Any location having less than 10 data points should be tagged as "other" location. This way number of categories 
#can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns

In [146]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

Sector 1 HSR Layout     10
Kalkere                 10
BTM 1st Stage           10
Thyagaraja Nagar        10
Nagappa Reddy Layout    10
                        ..
Adarsh Nagar             1
Kannur                   1
Craig Park Layout        1
Tilak Nagar              1
Geetanjali Layout        1
Name: location, Length: 1047, dtype: int64

In [147]:
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df5.location.unique())

241

In [148]:
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [149]:
df5[df5.location=='other']

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.00,6,36274.509804
18,other,3 BHK,2770.0,4.0,290.00,3,10469.314079
19,other,2 BHK,1100.0,2.0,48.00,2,4363.636364
25,other,3 BHK,1250.0,3.0,56.00,3,4480.000000
42,other,1 BHK,600.0,1.0,38.00,1,6333.333333
...,...,...,...,...,...,...,...
13291,other,1 Bedroom,812.0,1.0,26.00,1,3201.970443
13292,other,3 BHK,1440.0,2.0,63.93,3,4439.583333
13302,other,2 BHK,1075.0,2.0,48.00,2,4465.116279
13306,other,4 Bedroom,1200.0,5.0,325.00,4,27083.333333


In [150]:
#Outlier Removal Using Business Logic
##As a data scientist when you have a conversation with your business manager (who has expertise in real estate), 
##he will tell you that normally square ft per bedroom is 300 (i.e. 2 bhk apartment is minimum 600 sqft. 
##If you have for example 400 sqft apartment with 2 bhk than that seems suspicious and can be removed as an outlier. 
##We will remove such outliers by keeping our minimum thresold per bhk to be 300 sqft

In [152]:
df5[(df5.total_sqft/df5.bhk)<300].head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.98081
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,100.0,3,20000.0


In [153]:
#Check above data points. We have 6 bhk apartment with 1020 sqft. Another one is 8 bhk and total sqft is 600. 
#These are clear data errors that can be removed safely

In [156]:
wrong_bhk = df5[(df5.total_sqft/df5.bhk)<300]
wrong_bhk.shape

(744, 7)

In [157]:
wrong_bhk

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.980810
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,100.0,3,20000.000000
...,...,...,...,...,...,...,...
13277,other,7 Bedroom,1400.0,7.0,218.0,7,15571.428571
13279,other,6 Bedroom,1200.0,5.0,130.0,6,10833.333333
13281,Margondanahalli,5 Bedroom,1375.0,5.0,125.0,5,9090.909091
13303,Vidyaranyapura,5 Bedroom,774.0,5.0,70.0,5,9043.927649


In [166]:
df6 = df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape

(12456, 7)

In [167]:
df6.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [168]:
#Outlier Removal Using Standard Deviation and Mean

In [169]:
df6.price_per_sqft.describe()         ## Here no any home price would be 267 INR per sqft sso we should remove this one

count     12456.000000
mean       6308.502826
std        4168.127339
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [170]:
##Here we find that min price per sqft is 267 rs/sqft whereas max is 12000000, this shows a wide variation in property prices. 
##We should remove outliers per location using mean and one standard deviation

In [184]:
log_group=df6.groupby('location')
log_group.min()

Unnamed: 0_level_0,size,total_sqft,bath,price,bhk,price_per_sqft
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1st Block Jayanagar,2 BHK,1000.0,2.0,60.0,2,6000.000000
1st Phase JP Nagar,1 BHK,840.0,2.0,50.0,1,5274.725275
2nd Phase Judicial Layout,1 BHK,525.0,1.0,26.0,1,3500.000000
2nd Stage Nagarbhavi,4 Bedroom,1200.0,3.0,170.0,4,5666.666667
5th Block Hbr Layout,2 BHK,1100.0,2.0,48.0,2,3611.111111
...,...,...,...,...,...,...
Yelahanka,1 BHK,567.0,1.0,14.0,1,371.428571
Yelahanka New Town,1 BHK,350.0,1.0,13.5,1,1875.000000
Yelenahalli,2 BHK,1056.0,2.0,33.0,2,3125.000000
Yeshwanthpur,1 BHK,400.0,1.0,32.0,1,3200.000000


In [185]:
log_group.max()

Unnamed: 0_level_0,size,total_sqft,bath,price,bhk,price_per_sqft
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1st Block Jayanagar,4 Bedroom,2850.0,4.0,450.0,4,18750.000000
1st Phase JP Nagar,5 Bedroom,4550.0,7.0,400.0,5,25000.000000
2nd Phase Judicial Layout,3 BHK,1681.0,3.0,69.0,3,5000.000000
2nd Stage Nagarbhavi,6 Bedroom,4000.0,8.0,451.0,6,20416.666667
5th Block Hbr Layout,6 BHK,5100.0,5.0,300.0,6,17083.333333
...,...,...,...,...,...,...
Yelahanka,6 Bedroom,35000.0,6.0,800.0,6,19875.776398
Yelahanka New Town,4 Bedroom,4000.0,3.0,899.0,4,22475.000000
Yelenahalli,3 BHK,1650.0,3.0,68.0,3,5176.876618
Yeshwanthpur,6 Bedroom,3600.0,5.0,300.0,6,17647.058824


In [196]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out


In [197]:
df7 = remove_pps_outliers(df6)
df7.shape

(10242, 7)

In [198]:
df7.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
