In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [79]:
# Read the house-price CSV into a DataFrame and preview the first ten rows.
df = pd.read_csv("bengaluru_house_prices.csv")
df.head(n=10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
6,Super built-up Area,18-May,Old Airport Road,4 BHK,Jaades,2732,4.0,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4.0,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6.0,,370.0


In [80]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(13320, 9)

### Data Preprocessing and Cleaning

In [81]:
# Per-column dtype and non-null count, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [82]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [83]:
# Distinct values present in the society column.
df["society"].unique()

array(['Coomee ', 'Theanmp', nan, ..., 'SJovest', 'ThhtsV ', 'RSntsAp'],
      shape=(2689,), dtype=object)

In [84]:
# Distinct values present in the area_type column.
df["area_type"].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [85]:
# Distinct values present in the location column.
df["location"].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], shape=(1306,), dtype=object)

In [86]:
# Drop the columns that are not used in the rest of the analysis.
unused_columns = ['availability', 'society', 'area_type']
df = df.drop(columns=unused_columns)
df.head(n=10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0
5,Whitefield,2 BHK,1170,2.0,1.0,38.0
6,Old Airport Road,4 BHK,2732,4.0,,204.0
7,Rajaji Nagar,4 BHK,3300,4.0,,600.0
8,Marathahalli,3 BHK,1310,3.0,1.0,63.25
9,Gandhi Bazar,6 Bedroom,1020,6.0,,370.0


### Handling NA values

In [87]:
# Discard every row that has a missing value in any remaining column,
# then confirm that all columns report the same non-null count.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12710 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    12710 non-null  object 
 1   size        12710 non-null  object 
 2   total_sqft  12710 non-null  object 
 3   bath        12710 non-null  float64
 4   balcony     12710 non-null  float64
 5   price       12710 non-null  float64
dtypes: float64(3), object(3)
memory usage: 695.1+ KB


## Feature Engineering

In [88]:
# Size column: keep only the leading number of entries such as "2 BHK" or
# "4 Bedroom" and store it as an integer.
df['size'] = (
    df['size']
    .astype(str)
    .str.split(' ')
    .str[0]
    .astype('int64')
)
df.head(n=10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4,2600,5.0,3.0,120.0
2,Uttarahalli,3,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3,1521,3.0,1.0,95.0
4,Kothanur,2,1200,2.0,1.0,51.0
5,Whitefield,2,1170,2.0,1.0,38.0
8,Marathahalli,3,1310,3.0,1.0,63.25
10,Whitefield,3,1800,2.0,2.0,70.0
11,Whitefield,4,2785,5.0,3.0,295.0
12,7th Phase JP Nagar,2,1000,2.0,1.0,38.0


In [89]:
# Distinct numeric sizes after the conversion, to spot implausible values.
df.loc[:, 'size'].unique()

array([ 2,  4,  3,  1,  6,  8,  7,  5, 11,  9, 27, 43, 14, 12, 10, 13])

In [90]:
def is_float(x):
    """Report whether ``x`` can be converted to a float.

    Parameters
    ----------
    x : object
        Value to probe, typically a raw string from a CSV column.

    Returns
    -------
    bool
        True if ``float(x)`` succeeds, False if it raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.

    Notes
    -----
    Only conversion failures are treated as "not a float". Other exceptions
    (for example ``KeyboardInterrupt``) propagate instead of being swallowed.
    Strings such as ``"nan"`` and ``"inf"`` are accepted by ``float`` and
    therefore count as floats here.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [91]:
# Per-column count of rows whose total_sqft is not a plain number.
df.loc[~df['total_sqft'].map(is_float)].count()

location      186
size          186
total_sqft    186
bath          186
balcony       186
price         186
dtype: int64

In [92]:
# First ten rows whose total_sqft is not a plain number (ranges, unit suffixes).
df.loc[~df['total_sqft'].map(is_float)].head(n=10)


Unnamed: 0,location,size,total_sqft,bath,balcony,price
30,Yelahanka,4,2100 - 2850,4.0,0.0,186.0
122,Hebbal,4,3067 - 8156,4.0,0.0,477.0
137,8th Phase JP Nagar,2,1042 - 1105,2.0,0.0,54.005
165,Sarjapur,2,1145 - 1340,2.0,0.0,43.49
188,KR Puram,2,1015 - 1540,2.0,0.0,56.8
410,Kengeri,1,34.46Sq. Meter,1.0,0.0,18.5
549,Hennur Road,2,1195 - 1440,2.0,0.0,63.77
661,Yelahanka,2,1120 - 1145,2.0,0.0,48.13
672,Bettahalsoor,4,3090 - 5002,4.0,0.0,445.0
772,Banashankari Stage VI,2,1160 - 1195,2.0,0.0,59.935


In [93]:
# Random peek at ten rows whose total_sqft is not a plain number.
# A fixed random_state keeps the displayed sample identical across re-runs.
df[~df['total_sqft'].apply(is_float)].sample(10, random_state=42)


Unnamed: 0,location,size,total_sqft,bath,balcony,price
8873,Carmelaram,3,1660 - 1805,3.0,0.0,91.855
2088,Yeshwanthpur,2,1250 - 1305,2.0,0.0,95.815
1484,Hebbal,2,547.34 - 827.31,2.0,0.0,42.72
9299,Balagere,1,645 - 936,1.0,0.0,42.295
9164,Jakkur,2,1230 - 1290,2.0,0.0,71.195
30,Yelahanka,4,2100 - 2850,4.0,0.0,186.0
10620,Old Madras Road,4,3630 - 3800,6.0,0.0,212.5
9149,Hosur Road,4,2800 - 2870,4.0,0.0,241.0
6552,5 Bedroom Farm House in Lakshmipura,5,24Guntha,6.0,2.0,550.0
12955,Thanisandra,3,1437 - 1629,3.0,0.0,75.885


In [94]:
# Square feet contained in one unit of each area suffix handled below.
_SQFT_PER_UNIT = {
    'Sq. Yards': 9.0,
    'Sq. Meter': 10.7639,
    'Acres': 43560.0,
    'Grounds': 2400.0,
    'Cents': 435.6,  # one hundredth of an acre (43560 / 100)
    'Guntha': 1089.0,
}


def transform1(x):
    """Convert a raw ``total_sqft`` entry to a number of square feet.

    Three input shapes are recognised, tried in this order:

    1. A plain number (``"1056"``, ``1200``) -> that number as a float.
    2. A number followed by a known unit (``"34.46Sq. Meter"``,
       ``"24Guntha"``) -> the number multiplied by the unit's size in
       square feet.
    3. A range (``"2100 - 2850"``) -> the midpoint of the two bounds.

    Parameters
    ----------
    x : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float or object
        The area in square feet, or ``x`` itself unchanged when none of the
        shapes above matches, so that a later ``is_float`` check can still
        surface unparsed entries.
    """
    text = str(x).strip()

    # Plain numbers are tried first so that negative values and scientific
    # notation (which contain '-') are never mistaken for a range.
    try:
        return float(text)
    except ValueError:
        pass

    # "<number><unit>": strip the unit label and scale to square feet.
    for unit, sqft_per_unit in _SQFT_PER_UNIT.items():
        if unit in text:
            try:
                return float(text.replace(unit, '')) * sqft_per_unit
            except ValueError:
                return x

    # "<low> - <high>": represent the range by its midpoint, not its sum.
    if '-' in text:
        bounds = text.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                return x

    return x

In [95]:
# Normalise every total_sqft entry to a numeric square-foot value.
df['total_sqft'] = df['total_sqft'].map(transform1)
df.head(n=10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0
2,Uttarahalli,3,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,95.0
4,Kothanur,2,1200.0,2.0,1.0,51.0
5,Whitefield,2,1170.0,2.0,1.0,38.0
8,Marathahalli,3,1310.0,3.0,1.0,63.25
10,Whitefield,3,1800.0,2.0,2.0,70.0
11,Whitefield,4,2785.0,5.0,3.0,295.0
12,7th Phase JP Nagar,2,1000.0,2.0,1.0,38.0


In [97]:
# Sanity check: after the conversion no total_sqft entry should remain non-numeric.
df.loc[~df['total_sqft'].map(is_float)].count()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [98]:
# Derive the price per square foot. The price is scaled by 100000 first
# (presumably because price is quoted in lakhs — TODO confirm against the data source).
df['Price_per_sqft'] = df['price'] * 100000 / df['total_sqft']
df.head(n=10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,Price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,1.0,39.07,3699.810606
1,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0,4615.384615
2,Uttarahalli,3,1440.0,2.0,3.0,62.0,4305.555556
3,Lingadheeranahalli,3,1521.0,3.0,1.0,95.0,6245.890861
4,Kothanur,2,1200.0,2.0,1.0,51.0,4250.0
5,Whitefield,2,1170.0,2.0,1.0,38.0,3247.863248
8,Marathahalli,3,1310.0,3.0,1.0,63.25,4828.244275
10,Whitefield,3,1800.0,2.0,2.0,70.0,3888.888889
11,Whitefield,4,2785.0,5.0,3.0,295.0,10592.459605
12,7th Phase JP Nagar,2,1000.0,2.0,1.0,38.0,3800.0


In [100]:
# Number of distinct location labels before any whitespace clean-up.
len(df['location'].unique())

1265

In [None]:
# Trim leading/trailing whitespace from every location name so that labels
# differing only by stray spaces are treated as the same location.
# The vectorised .str accessor replaces the per-row lambda and does not raise
# on non-string entries; bracket assignment is the explicit way to set a column.
df['location'] = df['location'].str.strip()
