In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Path to the cleaned dataset CSV (note: this is a file path, despite the `_dir` suffix).
data_dir = './data/clean_data.csv'

### Load the cleaned data from the last step

In [3]:
# Read the cleaned CSV into a DataFrame and preview the first rows.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in df.info() below) --
# presumably an index written by the previous step; consider index_col=0 here
# once downstream consumers are checked -- TODO confirm.
df = pd.read_csv(data_dir)

df.head()

Unnamed: 0.1,Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [4]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13218 entries, 0 to 13217
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  13218 non-null  int64  
 1   location    13218 non-null  object 
 2   size        13218 non-null  object 
 3   total_sqft  13218 non-null  float64
 4   bath        13218 non-null  float64
 5   balcony     13218 non-null  float64
 6   price       13218 non-null  float64
 7   bhk         13218 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 826.2+ KB


In [5]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0    0
location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
bhk           0
dtype: int64

#### So, the data is clean and has no NA values and can be used for Feature Engineering.

## Feature Engineering

#### 1. A new column, price_per_sqft, can be created to capture an important attribute of a property: its price per square foot.

    Note: The price column is in lacs, i.e. the actual price is price * 1,00,000

In [6]:
# Price per square foot: scale `price` (quoted in lacs) by 100000 to get the
# actual price, then divide by the area.
df['price_per_sqft'] = (
    df['price']
    .mul(100000)
    .div(df['total_sqft'])
)

df.head()

Unnamed: 0.1,Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


#### 2. The location column is text type but it is an important feature for pricing. It can thus be processed.

* Check all the unique values in location
* Group by location and see the count for every location
* It is a categorical feature rather than a numeric/continuous feature, so it can be encoded to categorical vectors
* It can be seen that there are many locations which don't have a significant number of houses, so they can all be grouped under an 'other' category.
* The above step is required because converting every location to an encoded vector would otherwise produce a huge number of features.

In [7]:
# Strip leading/trailing whitespace from every value in the location column,
# so names that differ only by stray spaces fall into the same group.
# Uses the vectorized .str accessor instead of a per-row Python lambda.
df['location'] = df['location'].str.strip()

In [12]:
# Number of rows per location, ordered from most to least frequent.
location_series = (
    df.groupby('location')['location']
    .count()
    .sort_values(ascending=False)
)
location_series

location
Whitefield              533
Sarjapur  Road          392
Electronic City         304
Kanakpura Road          264
Thanisandra             236
                       ... 
Kumbhena Agrahara         1
Kudlu Village,            1
Konappana Agrahara        1
Kodanda Reddy Layout      1
1 Annasandrapalya         1
Name: location, Length: 1287, dtype: int64

In [17]:
# How many locations have 10 or fewer rows (the bound is inclusive).
int((location_series <= 10).sum())

1047

#### Out of 1287 locations, 1047 have 10 or fewer houses

In [18]:
# Total number of distinct location values (missing values, if any, counted as one).
df['location'].nunique(dropna=False)

1287

In [19]:
# Keep only the locations whose row count is 10 or fewer.
# Note: the bound is inclusive (<= 10), despite the "less_than_10" in the name.
location_series_less_than_10 = location_series.loc[lambda counts: counts <= 10]
location_series_less_than_10

location
Thyagaraja Nagar        10
Sector 1 HSR Layout     10
Ganga Nagar             10
Dairy Circle            10
Nagappa Reddy Layout    10
                        ..
Kumbhena Agrahara        1
Kudlu Village,           1
Konappana Agrahara       1
Kodanda Reddy Layout     1
1 Annasandrapalya        1
Name: location, Length: 1047, dtype: int64

#### Now these locations can all be combined in a single category called 'other'

In [24]:
# Collapse every rare location into a single 'other' bucket.
# The rare location names are the *index* of the counts Series; test membership
# against that index explicitly (a bare `x in series` also checks the index, but
# only implicitly) and do it vectorized rather than with a per-row lambda.
rare_location_names = location_series_less_than_10.index
df['location'] = df['location'].mask(df['location'].isin(rare_location_names), 'other')

# Number of distinct locations left after the regrouping.
len(df['location'].unique())

241

#### The number of categories/locations has significantly reduced

In [25]:
# Preview the first 10 rows; rare locations now appear as 'other'.
df.head(10)

Unnamed: 0.1,Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0
5,5,Whitefield,2 BHK,1170.0,2.0,1.0,38.0,2,3247.863248
6,6,Old Airport Road,4 BHK,2732.0,4.0,2.0,204.0,4,7467.057101
7,7,Rajaji Nagar,4 BHK,3300.0,4.0,2.0,600.0,4,18181.818182
8,8,Marathahalli,3 BHK,1310.0,3.0,1.0,63.25,3,4828.244275
9,9,other,6 Bedroom,1020.0,6.0,2.0,370.0,6,36274.509804


### These locations can now be encoded using one-hot encoding for further processing

#### Save the dataset as csv

In [26]:
# Persist the engineered dataset as CSV.
# NOTE(review): to_csv writes the index by default, so the file gains another
# unnamed leading column on top of the existing 'Unnamed: 0' one; consider
# index=False once downstream readers are checked -- TODO confirm.
df.to_csv('./data/engineered_data.csv')