In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

In [35]:
df= pd.read_csv("Bengaluru_House_Data.csv")
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


## Now let's do some data analysis

In [36]:
df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [37]:
df.nunique()

area_type          4
availability      81
location        1305
size              31
society         2688
total_sqft      2117
bath              19
balcony            4
price           1994
dtype: int64

In [38]:
df['availability'].unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

In [39]:
df['society'].unique()

array(['Coomee ', 'Theanmp', nan, ..., 'SJovest', 'ThhtsV ', 'RSntsAp'],
      dtype=object)

In [40]:
print(df['society'].value_counts())

GrrvaGr    80
PrarePa    76
Sryalan    59
Prtates    59
GMown E    56
           ..
Amionce     1
JaghtDe     1
Jauraht     1
Brity U     1
RSntsAp     1
Name: society, Length: 2688, dtype: int64


## By anaysing so far, I have decided to drop 'society' column 

In [41]:
df1= df.drop(['society'],axis='columns')
df1.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0


## Data cleaning : Handling Null Values

In [42]:
df1.isnull().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

## Let's deal the NA values in the following manner
###  1) Since location is having only 1 NA value , we can drop it directly
### 2) For size,bath and balcony, I will replace NA values with "Median"

In [43]:
df1

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00


In [44]:
df1.dropna(subset=['location'])## drop na value in location column

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00


In [45]:
df1.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [46]:
df1['bath'].fillna(int(df1['bath'].median()),inplace=True)

In [47]:
df1['balcony'].fillna(int(df1['balcony'].median()),inplace=True)

In [48]:
df1.dropna(subset=['location'],inplace=True)## drop na value in location column

In [49]:
df1.isnull().sum()

area_type        0
availability     0
location         0
size            16
total_sqft       0
bath             0
balcony          0
price            0
dtype: int64

## For size column will first check the unique values and then.... will do something

In [50]:
df1['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

## I will remove String part & retain only the number part from size column

In [51]:
df2= df1.dropna()

In [52]:
df2.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [53]:
df2['size1']= df2['size'].apply(lambda x: int(x.split(' ')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['size1']= df2['size'].apply(lambda x: int(x.split(' ')[0]))


In [54]:
df2

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,size1
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,5
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,3600,5.0,2.0,400.00,4
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,2
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,4


In [55]:
df2.describe()

Unnamed: 0,bath,balcony,price,size1
count,13303.0,13303.0,13303.0,13303.0
mean,2.689619,1.602872,112.584033,2.803728
std,1.339393,0.803455,148.99382,1.295022
min,1.0,0.0,8.0,1.0
25%,2.0,1.0,50.0,2.0
50%,2.0,2.0,72.0,3.0
75%,3.0,2.0,120.0,3.0
max,40.0,3.0,3600.0,43.0


In [57]:
df2.rename(columns = {'size1':'nbhk'}, inplace = True)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns = {'size1':'nbhk'}, inplace = True)


Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,nbhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [59]:
df2= df2.drop(['size'],axis='columns')
df2.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200,2.0,1.0,51.0,2


In [69]:
df2

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,5.0,3.0,120.00,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,2.0,3.0,62.00,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3.0,1.0,95.00,3
4,Super built-up Area,Ready To Move,Kothanur,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,3453,4.0,0.0,231.00,5
13316,Super built-up Area,Ready To Move,Richards Town,3600,5.0,2.0,400.00,4
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,1141,2.0,1.0,60.00,2
13318,Super built-up Area,18-Jun,Padmanabhanagar,4689,4.0,1.0,488.00,4


In [70]:
def f1(z):
    try:
        float(z)
    except:
        return False
    return True

In [73]:
df2[~df2['total_sqft'].apply(f1)]

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk
30,Super built-up Area,19-Dec,Yelahanka,2100 - 2850,4.0,0.0,186.000,4
56,Built-up Area,20-Feb,Devanahalli,3010 - 3410,2.0,2.0,192.000,4
81,Built-up Area,18-Oct,Hennur Road,2957 - 3450,2.0,2.0,224.500,4
122,Super built-up Area,18-Mar,Hebbal,3067 - 8156,4.0,0.0,477.000,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
...,...,...,...,...,...,...,...,...
12990,Super built-up Area,18-May,Talaghattapura,1804 - 2273,3.0,0.0,122.000,3
13059,Super built-up Area,Ready To Move,Harlur,1200 - 1470,2.0,0.0,72.760,2
13240,Super built-up Area,Ready To Move,Devanahalli,1020 - 1130,2.0,2.0,52.570,1
13265,Super built-up Area,20-Sep,Hoodi,1133 - 1384,2.0,0.0,59.135,2


## Let's work on total_sqft column

In [67]:
un= df2['total_sqft'].unique()

In [68]:
for i in un:
    print(i)

1056
2600
1440
1521
1200
1170
2732
3300
1310
1020
1800
2785
1000
1100
2250
1175
1180
1540
2770
600
1755
2800
1767
510
1250
660
1610
1151
1025
2100 - 2850
1075
1760
1693
1925
700
1070
1724
1290
1143
1296
1254
1330.74
970
1459
800
869
1270
1670
2010
1185
1600
3010 - 3410
1500
1407
840
4395
845
5700
1160
3000
1140
1220
1350
1005
500
1358
1569
1240
2089
1206
1150
2511
460
4400
1660
2957 - 3450
1326
1325
1499
1665
708
1060
710
1450
2894
1330
2502
650
2400
1007
966
1630
1640
782
1260
1413
1116
1530
3700
2497
1436
276
1427
2061
3067 - 8156
2650
1282
1050
945
950
1870
880
1535
1360
1042 - 1105
1280
5000
3050
1563.05
1167
4000
1828
890
1612
1034
1710
957
2795
1125
1735
2050
3750
1063
1904
4200
2000
1145 - 1340
1425
1470
1300
450
1152
1550
400
705
770
1242
1700
2144
1704
1846
1340
1015 - 1540
1327
1186
1783
1400
980
1285
912
1225
1909
1359
1207
1736
2850
1595
1798
1475
1580
1295
3600
589
1415
1787
984
1520 - 1740
2405
1080
1900
805
1153
1148
1110
1933
3500
645
1644
910
1577
4050
2420
900
1108
30

In [75]:
def handletotal_sqft(a):
    ans= a.split('-')
    if len(ans)==2:
        avg = (float(ans[0])+float(ans[1]))/2
        return avg
    try:
        return float(a)
    except:
        pass
    
    
        
        

In [76]:
df3= df2.copy()

In [77]:
df3.total_sqft= df3.total_sqft.apply(handletotal_sqft)

In [78]:
df3

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.00,4
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.00,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.00,3
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,3453.0,4.0,0.0,231.00,5
13316,Super built-up Area,Ready To Move,Richards Town,3600.0,5.0,2.0,400.00,4
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,1141.0,2.0,1.0,60.00,2
13318,Super built-up Area,18-Jun,Padmanabhanagar,4689.0,4.0,1.0,488.00,4


In [79]:
uni= df3['total_sqft'].unique()

In [80]:
for i in uni:
    print(i)

1056.0
2600.0
1440.0
1521.0
1200.0
1170.0
2732.0
3300.0
1310.0
1020.0
1800.0
2785.0
1000.0
1100.0
2250.0
1175.0
1180.0
1540.0
2770.0
600.0
1755.0
2800.0
1767.0
510.0
1250.0
660.0
1610.0
1151.0
1025.0
2475.0
1075.0
1760.0
1693.0
1925.0
700.0
1070.0
1724.0
1290.0
1143.0
1296.0
1254.0
1330.74
970.0
1459.0
800.0
869.0
1270.0
1670.0
2010.0
1185.0
1600.0
3210.0
1500.0
1407.0
840.0
4395.0
845.0
5700.0
1160.0
3000.0
1140.0
1220.0
1350.0
1005.0
500.0
1358.0
1569.0
1240.0
2089.0
1206.0
1150.0
2511.0
460.0
4400.0
1660.0
3203.5
1326.0
1325.0
1499.0
1665.0
708.0
1060.0
710.0
1450.0
2894.0
1330.0
2502.0
650.0
2400.0
1007.0
966.0
1630.0
1640.0
782.0
1260.0
1413.0
1116.0
1530.0
3700.0
2497.0
1436.0
276.0
1427.0
2061.0
5611.5
2650.0
1282.0
1050.0
945.0
950.0
1870.0
880.0
1535.0
1360.0
1073.5
1280.0
5000.0
3050.0
1563.05
1167.0
4000.0
1828.0
890.0
1612.0
1034.0
1710.0
957.0
2795.0
1125.0
1735.0
2050.0
3750.0
1063.0
1904.0
4200.0
2000.0
1242.5
1425.0
1470.0
1300.0
450.0
1152.0
1550.0
400.0
705.0
770.0
12

## area_type,availabilty & location column

### Let's add price/sqft

In [81]:
df4 = df3.copy()

In [83]:
df4['price_per_sqft']= df3['price']*100000/df3['total_sqft']
df4.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


## Let's explore location column

In [88]:
len(df4['location'].unique())

1304

In [87]:
stats= df4['price_per_sqft'].describe()
stats

count    1.325700e+04
mean     7.912825e+03
std      1.064976e+05
min      2.678298e+02
25%      4.271186e+03
50%      5.438596e+03
75%      7.313318e+03
max      1.200000e+07
Name: price_per_sqft, dtype: float64

## location column is categorical and ML odel won't understand categorical columns.

In [89]:
df4.location= df4.location.apply(lambda x: x.strip())
location_st = df4['location'].value_counts(ascending= False)
location_st

Whitefield                        540
Sarjapur  Road                    397
Electronic City                   304
Kanakpura Road                    273
Thanisandra                       237
                                 ... 
Vasantapura main road               1
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
Abshot Layout                       1
Name: location, Length: 1293, dtype: int64

In [91]:
location_st.values.sum()

13303

In [93]:
len(location_st[location_st>10])

241

In [94]:
len(c

1039

## Dimensionality Reduction

### any location having less than 10 data points, will tagg it as 'other' location & then will do one hot encoding

In [96]:
location_st_less_than_10 = location_st[location_st<=10]
location_st_less_than_10

Dairy Circle                      10
Nagappa Reddy Layout              10
Basapura                          10
1st Block Koramangala             10
Sector 1 HSR Layout               10
                                  ..
Vasantapura main road              1
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
Abshot Layout                      1
Name: location, Length: 1052, dtype: int64

In [97]:
df4.location = df4.location.apply(lambda x: 'other' if x in location_st_less_than_10 else x)
len(df4.location.unique())

242

In [98]:
df4.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


## Let's deal with the outliers

In [100]:
df4[df4.total_sqft/df4.nbhk<300].head() # This are the outliers

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk,price_per_sqft
9,Plot Area,Ready To Move,other,1020.0,6.0,2.0,370.0,6,36274.509804
45,Plot Area,Ready To Move,HSR Layout,600.0,9.0,2.0,200.0,8,33333.333333
58,Plot Area,Ready To Move,Murugeshpalya,1407.0,4.0,1.0,150.0,6,10660.98081
68,Plot Area,Ready To Move,Devarachikkanahalli,1350.0,7.0,0.0,85.0,8,6296.296296
70,Plot Area,Ready To Move,other,500.0,3.0,2.0,100.0,3,20000.0


## check above datapoints, We have 6 bhk apartment with 1020 sqft, also 8bhk in 600 sqft. So let's remve these datapoints

In [102]:
df5= df4[~(df4.total_sqft/df4.nbhk<300)]
df5

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.00,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.00,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.00,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,3453.0,4.0,0.0,231.00,5,6689.834926
13316,Super built-up Area,Ready To Move,other,3600.0,5.0,2.0,400.00,4,11111.111111
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,1141.0,2.0,1.0,60.00,2,5258.545136
13318,Super built-up Area,18-Jun,Padmanabhanagar,4689.0,4.0,1.0,488.00,4,10407.336319


## Now we have removed some outliers, but our data may contain more outliers, so let's deal with them using Standarad deviation & mean

In [103]:
df5.price_per_sqft.describe()

count     12513.000000
mean       6307.441644
std        4160.931610
min         267.829813
25%        4210.526316
50%        5295.007564
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [109]:
def remove_outliers(df):
    df_out= pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m= np.mean(subdf.price_per_sqft)
        st= np.std(subdf.price_per_sqft)
        reduced_df= subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out= pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

In [110]:
df6= remove_outliers(df5)
df6

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,nbhk,price_per_sqft
0,Super built-up Area,20-May,1st Block Jayanagar,2850.0,4.0,1.0,428.00,4,15017.543860
1,Super built-up Area,18-Jun,1st Block Jayanagar,1630.0,3.0,2.0,194.00,3,11901.840491
2,Super built-up Area,Ready To Move,1st Block Jayanagar,1875.0,2.0,3.0,235.00,3,12533.333333
3,Built-up Area,15-Dec,1st Block Jayanagar,1200.0,2.0,0.0,130.00,3,10833.333333
4,Super built-up Area,18-Jun,1st Block Jayanagar,1235.0,2.0,2.0,148.00,2,11983.805668
...,...,...,...,...,...,...,...,...,...
10281,Super built-up Area,Ready To Move,other,1353.0,2.0,2.0,110.00,2,8130.081301
10282,Plot Area,18-Jan,other,812.0,1.0,0.0,26.00,1,3201.970443
10283,Super built-up Area,18-Jul,other,1440.0,2.0,2.0,63.93,3,4439.583333
10284,Super built-up Area,Ready To Move,other,1075.0,2.0,2.0,48.00,2,4465.116279
