In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)

In [3]:
df1 = pd.read_csv("BangaloreHousePrice.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


Dataset downloadded from:  https://www.kaggle.com/amitabhajoy/bengaluru-house-price-data

# Data Cleaning

In [4]:
df1.shape

(12216, 9)

In [5]:
df1.groupby('area_type')['area_type'].agg('count') # Examining area type feature

area_type
Built-up  Area          2234
Carpet  Area              78
Plot  Area              1867
Super built-up  Area    8037
Name: area_type, dtype: int64

In [6]:
df1['area_type'].value_counts() # same as above

Super built-up  Area    8037
Built-up  Area          2234
Plot  Area              1867
Carpet  Area              78
Name: area_type, dtype: int64

In [7]:
 df1['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [8]:
df1.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [9]:
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')
df2.shape    # Dropping features that are not required

(12216, 5)

In [10]:
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


### Handling Na Values

In [11]:
df2.isnull().sum()

location       2
size          17
total_sqft     1
bath          71
price          1
dtype: int64

In [12]:
df3 = df2.dropna()   # since na values are minimal compare to dataset, we can drop
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [13]:
df3.shape

(12144, 5)

### Feature Engineering

In [14]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [15]:
## Adding new feature(integer) for bhk (Bedrooms Hall Kitchen)

df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))  # splitting based on space and taking first token which is number
df3.bhk.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [16]:
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [17]:
df3=df3.drop(['size'], axis='columns')

In [18]:
df3['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

We got some outlier like 43 bedroom. Let's explore it

In [19]:
df3[df3['bhk']>20]  # How many appartment has more than 20 bedroom

Unnamed: 0,location,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,8000,27.0,230.0,27
4684,Munnekollal,2400,40.0,660.0,43


2400 sqft and 43 bedroom! I am gonna fix this little later. We will fix this using total_sqft variable

In [20]:
df3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1783 - 1878', '120Sq. Yards', '3729'],
      dtype=object)

Opps! we have some square fit values as range(1783 - 1878), some have strings in values. Let me fix this first

In [21]:
def is_float(x):   # custom function for converting as float, whether not possible it will return false
    try:
        float(x)
    except:
        return False
    return True

In [22]:
df3[~df3['total_sqft'].apply(is_float)].head(10) # applying is_float() function
                                                 # ~ negate operation to show whether apply function return False
                                                 # showing sample of 10 values

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.0,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2
410,Kengeri,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,1195 - 1440,2.0,63.77,2
648,Arekere,4125Perch,9.0,265.0,9
661,Yelahanka,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,3090 - 5002,4.0,445.0,4


We have not uniformed data here. Some ranges, some contain strings.
**Typically, dataset can contain unstructured, outliears, not uniformed, data error problem**
I am gonna average of range and skip all values which contain strings

In [23]:
def convert_sqft_to_num(x):
    try:
        tokens = x.split('-')
        if len(tokens) == 2:
            return (float(tokens[0])+float(tokens[1]))/2   # mean value of range
        if float(x):
            return float(x)       # Trying to convert as float if possible
    except:
        return None

In [24]:
convert_sqft_to_num('1400-700')

1050.0

In [25]:
convert_sqft_to_num('1400rs')

In [26]:
convert_sqft_to_num('1400')

1400.0

In [27]:
df3['total_sqft'] = df3['total_sqft'].apply(convert_sqft_to_num)  # Apply function take lambda function & native python function
df3.head(2)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4


In [28]:
df3.loc[30]    # Checking how does perform apply function

location      Yelahanka
total_sqft         2475
bath                  4
price               186
bhk                   4
Name: 30, dtype: object

In [29]:
df3.loc[410]

location      Kengeri
total_sqft        NaN
bath                1
price            18.5
bhk                 1
Name: 410, dtype: object

In [30]:
df3['total_sqft'].isnull().sum() #  We got some na values for string inputs

43

In [31]:
df3=df3.dropna()

In [32]:
df3['total_sqft'].isnull().sum()

0

In [33]:
df3.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


# Feature Engineering ( Exploring new features)

In [34]:
df4=df3.copy()

df4['price_per_sqft'] = df4['price']*100000/df4['total_sqft']
df4.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [36]:
df4_stats = df4['price_per_sqft'].describe()
df4_stats

count    1.210100e+04
mean     8.018258e+03
std      1.114583e+05
min      2.678298e+02
25%      4.266667e+03
50%      5.444646e+03
75%      7.317073e+03
max      1.200000e+07
Name: price_per_sqft, dtype: float64

In [38]:
df3['location'].unique() # observing location column

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       'basaveshwarnagar', 'Banashankari 3rd stage, Vivekanandanagar',
       'Annaiah Reddy Layout'], dtype=object)

In [39]:
len(df3['location'].unique())  

1262

So, there are 1262 unique location. We can not convert this into dummy columns or one hot encoding. Because it will make almost  1262 new columns. THis is called high dimensionality problem. 
1. We have to reduce dimension 
2. We can remove other fetures which has only 1-2 minimal data point

# Dimensionality Reduction

In [41]:
df4.location = df4['location'].apply(lambda x: x.strip())  # removing extra space from location column

In [43]:
location_stats = df4['location'].value_counts(ascending=False)
location_stats

Whitefield                    482
Sarjapur  Road                347
Electronic City               280
Kanakpura Road                246
Thanisandra                   225
                             ... 
BTM Layout 1stage 9th Main      1
Navodaya Nagar                  1
Byrasandra Extension            1
Kudlu Village,                  1
Kanakapura  Rod                 1
Name: location, Length: 1252, dtype: int64

In [44]:
len(location_stats[location_stats>10])     # Exploring how many have location have greater than 10 times

228

In [45]:
len(location_stats[location_stats<10])    # location that have less than 10 times

1010

In [47]:
df4.shape                  # So, we have 1010 rows with other or minimal times location, out of 12101
                           # We, can put them into new row named 'others'

(12101, 6)

In [48]:
# Any location having less than 10 data points should be tagged as "other" location.

location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

Sadashiva Nagar               10
Bharathi Nagar                10
Gunjur Palya                  10
Shivaji Nagar                 10
Nagasandra                    10
                              ..
BTM Layout 1stage 9th Main     1
Navodaya Nagar                 1
Byrasandra Extension           1
Kudlu Village,                 1
Kanakapura  Rod                1
Name: location, Length: 1024, dtype: int64

In [50]:
len(df4.location.unique())

1252

In [51]:
df4['location'] = df4['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
# Creating other rows with all those minimal values

In [52]:
len(df4.location.unique())  # let's see how many dimension now we have in the location column

229

In [54]:
df4.head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0
5,Whitefield,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,1310.0,3.0,63.25,3,4828.244275
9,other,1020.0,6.0,370.0,6,36274.509804
