In [30]:
#import all the required libraries

import numpy as np
import pandas as pd


In [31]:
# Load the Bengaluru housing dataset from a CSV file given by a relative path
# and report its dimensions as (rows, columns).
data = pd.read_csv(filepath_or_buffer="Bengaluru_House_Data.csv")
print(data.shape)

(13320, 9)


In [32]:
# Data preprocessing: number of listings in each area_type category.
area_type_counts = data.groupby('area_type')['area_type'].count()
print(area_type_counts)

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64


In [33]:
# Drop the columns that are not used in the rest of this analysis:
# area_type, society, availability and balcony.
#
# The result is reassigned instead of dropping in place, and errors='ignore'
# is passed, so the cell can be re-run safely: a second run no longer raises
# KeyError because the columns have already been removed.
data = data.drop(
    columns=['area_type', 'society', 'availability', 'balcony'],
    errors='ignore',
)
print(data.shape)

(13320, 5)


In [34]:
# Data cleaning: report the number of missing values per column,
# then remove every row that contains at least one missing value.
missing_per_column = data.isna().sum()
print(missing_per_column)
data.dropna(axis=0, how='any', inplace=True)
print(data.shape)

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64
(13246, 5)


In [37]:
# Feature engineering
# The 'size' strings ('2 BHK', '4 Bedroom', '1 RK', ...) all start with a number
# followed by a space, so the text before the first space is converted to a
# float and stored in a new 'bhk' column.
print(data['size'].unique())
data['bhk'] = data['size'].map(lambda size_label: float(size_label.partition(' ')[0]))

['2 BHK' '4 Bedroom' '3 BHK' '4 BHK' '6 Bedroom' '3 Bedroom' '1 BHK'
 '1 RK' '1 Bedroom' '8 Bedroom' '2 Bedroom' '7 Bedroom' '5 BHK' '7 BHK'
 '6 BHK' '5 Bedroom' '11 BHK' '9 BHK' '9 Bedroom' '27 BHK' '10 Bedroom'
 '11 Bedroom' '10 BHK' '19 BHK' '16 BHK' '43 Bedroom' '14 BHK' '8 BHK'
 '12 Bedroom' '13 BHK' '18 Bedroom']


In [40]:
# Inspect the distinct raw values of the total_sqft column.
sqft_values = data['total_sqft'].unique()
print(sqft_values)

['1056' '2600' '1440' ... '1133 - 1384' '774' '4689']


In [41]:
#defining a function to check whether the value is float or not
def is_float(x) -> bool:
    """Return True when ``float(x)`` succeeds, otherwise False.

    Parameters
    ----------
    x : any
        Value to probe, e.g. a raw ``total_sqft`` string such as ``'1056'``
        or ``'1133 - 1384'``.

    Returns
    -------
    bool
        True if ``x`` can be converted with ``float()``; False if the
        conversion fails with TypeError, ValueError or OverflowError.
    """
    try:
        float(x)
    # Catch only the errors a failed conversion produces. A bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit and hide real problems.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [42]:
# Show the first ten rows whose total_sqft entry is not a plain number.
non_numeric_sqft = ~data['total_sqft'].map(is_float)
print(data.loc[non_numeric_sqft].head(10))

               location       size      total_sqft  bath    price  bhk
30            Yelahanka      4 BHK     2100 - 2850   4.0  186.000  4.0
122              Hebbal      4 BHK     3067 - 8156   4.0  477.000  4.0
137  8th Phase JP Nagar      2 BHK     1042 - 1105   2.0   54.005  2.0
165            Sarjapur      2 BHK     1145 - 1340   2.0   43.490  2.0
188            KR Puram      2 BHK     1015 - 1540   2.0   56.800  2.0
410             Kengeri      1 BHK  34.46Sq. Meter   1.0   18.500  1.0
549         Hennur Road      2 BHK     1195 - 1440   2.0   63.770  2.0
648             Arekere  9 Bedroom       4125Perch   9.0  265.000  9.0
661           Yelahanka      2 BHK     1120 - 1145   2.0   48.130  2.0
672        Bettahalsoor  4 Bedroom     3090 - 5002   4.0  445.000  4.0


In [45]:
#defining a function to convert the range column values to a single value
def convert_sqft_to_num(x):
    """Convert one ``total_sqft`` entry to a single float.

    Handling:
      * a value ``float()`` accepts (``'1056'``, ``1056.0``) is returned as a float;
      * a range written as ``'low - high'`` is replaced by the midpoint
        ``(low + high) / 2``;
      * anything else (e.g. ``'34.46Sq. Meter'``, ``'4125Perch'``, ``None``)
        yields ``None``.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float or None
        The numeric area, or ``None`` when the value cannot be interpreted.
    """
    # Try the plain conversion first. This also accepts values that are
    # already numeric, so applying the function twice to a column does not fail.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can describe a range.
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Contains exactly one '-' but is not a numeric range (e.g. '12Sq-Yards').
            return None
    return None
    
# Quick check of convert_sqft_to_num on three kinds of input:
# a plain number, a range, and a value carrying a unit.
for sample in ('290', '2100-2850', '4.46Sq. Meter'):
    print(convert_sqft_to_num(sample))

290.0
2475.0
None


In [46]:
# Replace the raw total_sqft strings with the numeric values produced by
# convert_sqft_to_num, then spot-check the first ten values and row 30
# (which held the range '2100 - 2850' in the raw data).
converted_sqft = data['total_sqft'].apply(convert_sqft_to_num)
data['total_sqft'] = converted_sqft
print(data['total_sqft'].head(10))
print(data.loc[30])

0    1056.0
1    2600.0
2    1440.0
3    1521.0
4    1200.0
5    1170.0
6    2732.0
7    3300.0
8    1310.0
9    1020.0
Name: total_sqft, dtype: float64
location      Yelahanka
size              4 BHK
total_sqft       2475.0
bath                4.0
price             186.0
bhk                 4.0
Name: 30, dtype: object


In [47]:
# Preview the first ten rows after the total_sqft conversion.
print(data.iloc[:10])

                   location       size  total_sqft  bath   price  bhk
0  Electronic City Phase II      2 BHK      1056.0   2.0   39.07  2.0
1          Chikka Tirupathi  4 Bedroom      2600.0   5.0  120.00  4.0
2               Uttarahalli      3 BHK      1440.0   2.0   62.00  3.0
3        Lingadheeranahalli      3 BHK      1521.0   3.0   95.00  3.0
4                  Kothanur      2 BHK      1200.0   2.0   51.00  2.0
5                Whitefield      2 BHK      1170.0   2.0   38.00  2.0
6          Old Airport Road      4 BHK      2732.0   4.0  204.00  4.0
7              Rajaji Nagar      4 BHK      3300.0   4.0  600.00  4.0
8              Marathahalli      3 BHK      1310.0   3.0   63.25  3.0
9              Gandhi Bazar  6 Bedroom      1020.0   6.0  370.00  6.0


In [50]:
# Add a 'price_per_sqft' column: price multiplied by 100000, divided by total_sqft.
# Price per square foot is a key figure when comparing real-estate listings.
# NOTE(review): the factor 100000 presumably converts price from lakhs to rupees - confirm the unit.
data['price_per_sqft'] = data['price'].mul(100000).div(data['total_sqft'])
print(data['price_per_sqft'])

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13246, dtype: float64


In [51]:
# Explore the location column: how many distinct values does it contain?
print(data['location'].nunique(dropna=False))

1304


In [52]:
# Remove leading/trailing whitespace so that differently padded spellings
# of the same location collapse into one value.
data['location'] = data['location'].map(str.strip)

In [54]:
# Number of listings per location, largest first; print the first ten entries.
location_stats = data.groupby('location')['location'].count().sort_values(ascending=False)
print(location_stats.iloc[:10])

location
Whitefield               535
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           266
Thanisandra              236
Yelahanka                210
Uttarahalli              186
Hebbal                   176
Marathahalli             175
Raja Rajeshwari Nagar    171
Name: location, dtype: int64
