# Exploratory Data Analysis

In [79]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

In [80]:
# Data frame
data = pd.read_csv('Data/bengaluru_house_prices.csv')

In [81]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [82]:
data.shape

(13320, 9)

In [83]:
# Data types of columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [84]:
# Statistical overview of numeric data
data.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


## Data Cleaning

***Duplicates***

In [85]:
# Check Duplicates
data.duplicated().sum()

np.int64(529)

In [86]:
# Drop duplicate entries
data.drop_duplicates(inplace=True)

***Missing values***

In [136]:
# Missing values
data.isna().sum()

location                0
size                    0
total_sqft              0
bath                    0
balcony                 0
price                   0
Built-up  Area          0
Plot  Area              0
Super built-up  Area    0
date                    0
month                   0
bhk                     0
dtype: int64

In [88]:
# location column
data.dropna(subset=['location'], inplace=True)

In [89]:
# bath column: fill missing bathroom counts with the column mean rounded down
bath_fill = np.floor(data['bath'].mean())
data['bath'] = data['bath'].fillna(bath_fill)

In [90]:
# balcony column: fill missing balcony counts with the column mean rounded down
balcony_fill = np.floor(data['balcony'].mean())
data['balcony'] = data['balcony'].fillna(balcony_fill)

In [91]:
# society column: too sparse to impute (only 7,818 of the 13,320 raw rows
# are populated, i.e. roughly 41% missing), so the column is dropped
data = data.drop(columns=['society'])

In [135]:
# Drop NA values
data = data.dropna(subset=['size'])

***Categorical Features***

In [92]:
data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0


*Area_Type Column*

In [93]:
# Unique categories
data.area_type.unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [94]:
# Category wise count
data.area_type.value_counts()

area_type
Super built-up  Area    8316
Built-up  Area          2398
Plot  Area              1989
Carpet  Area              87
Name: count, dtype: int64

In [95]:
# One-hot encode area_type as 0/1 integer indicator columns.
# 'Carpet  Area' is dropped so it becomes the reference category
# (a row with all three remaining indicators at 0).
area_type_dummies = pd.get_dummies(data['area_type'], dtype=int)
area_types = area_type_dummies.drop(columns=['Carpet  Area'])
area_types

Unnamed: 0,Built-up Area,Plot Area,Super built-up Area
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,0,0,1
...,...,...,...
13314,0,0,1
13315,1,0,0
13316,0,0,1
13317,1,0,0


In [96]:
# Concating dummies and main data frame
data = pd.concat([data,area_types], axis='columns')
data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,Built-up Area,Plot Area,Super built-up Area
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,0,0,1
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,0,1,0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,1,0,0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,0,0,1
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,0,0,1


In [None]:
# Dropping area_type column
data = data.drop('area_type', axis='columns')
data.head()

*Availability Column*

In [None]:
# Unique values
data.availability.unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

In [None]:
# Create date column: the token before the '-' in availability
# (e.g. '19' from '19-Dec'); labels without a '-' are carried over whole
availability_tokens = data['availability'].str.split('-')
data['date'] = availability_tokens.str[0]

In [110]:
data.head()

Unnamed: 0,availability,location,size,total_sqft,bath,balcony,price,Built-up Area,Plot Area,Super built-up Area,date,month
0,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,0,0,1,19,Dec
1,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,0,1,0,1,Jan
2,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,1,0,0,1,Jan
3,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,0,0,1,1,Jan
4,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,0,0,1,1,Jan


In [None]:
# Create Month column

In [105]:
# Create Month column: the token after the '-' in availability
# (e.g. 'Dec' from '19-Dec'); labels without a '-' have no second token
# and come out as NaN
availability_tokens = data['availability'].str.split('-')
data['month'] = availability_tokens.str[1]

In [107]:
# Encoding 'Ready To Move' & 'Immediate Possession'
# Neither label contains a day number, so both are replaced by the
# placeholder '01' (kept as a string, like the other date values)
available_now = {label: '01' for label in ("Ready To Move", "Immediate Possession")}
data['date'] = data['date'].replace(available_now)

In [109]:
# NaN values in Month column 
data.month = data.month.fillna('Jan')

In [111]:
# Unique values
data.date.unique()

array(['19', '01', '18', '20', '17', '21', '22', '16', '14', '15'],
      dtype=object)

In [114]:
# Unique Values
data.month.unique()

array([12,  1,  5,  2, 11, 10,  9,  3,  4,  8,  6,  7])

In [113]:
# Mapping month abbreviations to their calendar numbers (1-12).
# Any value that is not a key of the lookup becomes NaN.
month_numbers = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}
data['month'] = data['month'].map(month_numbers)

In [None]:
# data type
data.month.dtype

dtype('int64')

In [117]:
# Change data type of Date values to integers
data.date = data.date.astype(int)

# data type
data.date.dtype

dtype('int64')

In [119]:
# Drop availability column
data = data.drop('availability', axis='columns')

In [141]:
data.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,Built-up Area,Plot Area,Super built-up Area,date,month,bhk
0,Electronic City Phase II,1056,2.0,1.0,39.07,0,0,1,19,12,2
1,Chikka Tirupathi,2600,5.0,3.0,120.0,0,1,0,1,1,4
2,Uttarahalli,1440,2.0,3.0,62.0,1,0,0,1,1,3
3,Lingadheeranahalli,1521,3.0,1.0,95.0,0,0,1,1,1,3
4,Kothanur,1200,2.0,1.0,51.0,0,0,1,1,1,2


*Size Column*

In [133]:
# Unique values
data['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [130]:
# BHK count
# Keep the whole leading number of the size label ('2 BHK' -> '2',
# '11 BHK' -> '11'). Taking only the first character would truncate
# multi-digit sizes ('11 BHK' -> '1', '43 Bedroom' -> '4').
# The result is still a string column; missing sizes stay NaN.
data['bhk'] = data['size'].str.split().str[0]

In [138]:
# Data type
data.bhk.dtype

dtype('int64')

In [137]:
# Change Data type
data.bhk = data.bhk.astype(int)

In [None]:
# Drop Size column
data = data.drop('size', axis='columns')

*Total Sqft Column*

In [142]:
# Unique values
data.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      shape=(2110,), dtype=object)

- The values in the total_sqft column are not uniform
- Some entries are single numbers, some are ranges (e.g. '1133 - 1384'), and some are in different units

In [148]:
# Function to filter non absolute values

def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Used as a predicate to find total_sqft entries that are not a single
    plain number (ranges such as '1133 - 1384', values with unit text, ...).

    Only the exceptions ``float()`` raises for unconvertible input are
    treated as "not a float"; anything else (e.g. KeyboardInterrupt)
    propagates instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Filtering data frame

data[~data['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,total_sqft,bath,balcony,price,Built-up Area,Plot Area,Super built-up Area,date,month,bhk


In [None]:
# Function to correct erroneous entries

def convert(x):
    """Convert a total_sqft entry to a single float.

    Parameters
    ----------
    x : str or number
        A plain number ('1056'), a range written as 'low - high'
        ('1133 - 1384'), or an already-numeric value.

    Returns
    -------
    float or None
        The number itself, the midpoint of a range, or None when the entry
        cannot be interpreted (unit text, missing value, ...).
    """
    # Non-string input: already-converted numbers pass through, so the
    # conversion cell can be re-run safely; None/NaN count as unparseable.
    if not isinstance(x, str):
        try:
            value = float(x)
        except (TypeError, ValueError):
            return None
        return None if value != value else value  # NaN != NaN

    text = x.strip()
    tokens = text.split('-')
    if len(tokens) == 2:
        try:
            low, high = float(tokens[0]), float(tokens[1])
        except ValueError:
            # Not a numeric range (e.g. '-5' or '1e-3'); try it as a plain
            # number below instead of raising.
            pass
        else:
            return (low + high) / 2
    try:
        return float(text)
    except ValueError:
        return None

In [147]:
#  Correcting erroneous entries
data['total_sqft'] = data['total_sqft'].apply(convert)


In [149]:
data.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,Built-up Area,Plot Area,Super built-up Area,date,month,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,0,0,1,19,12,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,0,1,0,1,1,4
2,Uttarahalli,1440.0,2.0,3.0,62.0,1,0,0,1,1,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,0,0,1,1,1,3
4,Kothanur,1200.0,2.0,1.0,51.0,0,0,1,1,1,2


*Location Column*

In [124]:
# Unique Values
data.location.value_counts()

location
Whitefield          523
Sarjapur  Road      379
Electronic City     287
Kanakpura Road      249
Thanisandra         229
                   ... 
Udayagiri             1
pavitra paradise      1
kadubisnahalli        1
Rahmath Nagar         1
Chikkanahalli         1
Name: count, Length: 1305, dtype: int64

## Feature Engineering