In [128]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib 

In [6]:
# Load the raw listings; later cells derive their working frames from this one.
originaldf = pd.read_csv("homeprices.csv")
row_count, column_count = originaldf.shape
print('Shape of data frame : ', row_count, column_count)
print('Columns in data frame : ', *originaldf.columns.tolist())
# Trailing expression so the notebook renders the first rows as a table.
originaldf.head()

Shape of data frame :  13320 9
Columns in data frame :  area_type availability location size society total_sqft bath balcony price


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


#### Dropping features that are not required

In [22]:
# drop() returns a new frame, so originaldf itself is left unchanged.
df = originaldf.drop(columns=['area_type', 'society', 'balcony', 'availability'])

In [23]:
# Print the shape and the remaining column names, then show the first rows.
for caption, items in (('Shape of data frame : ', df.shape),
                       ('Columns in data frame : ', df.columns)):
    print(caption, *items)
df.head()

Shape of data frame :  13320 5
Columns in data frame :  location size total_sqft bath price


Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


#### Dropping rows with null values

In [45]:
# Report per-column null counts and the frame shape before and after dropping
# every row that contains at least one null value.
print('Null values in data frame before dropping : \n', df.isnull().sum())
print('\nShape before dropping : ', df.shape)
# dropna() returns a new frame; df keeps the rows with nulls for comparison.
df2 = df.dropna()
print('Null values in data frame after dropping : \n', df2.isnull().sum())
# This line reports df2, i.e. the shape *after* the drop (label previously said "before").
print('\nShape after dropping : ', df2.shape)
print('So number of rows dropped is equal to : ', df.shape[0] - df2.shape[0] )

Null values in data frame before dropping : 
 location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

Shape before dropping :  (13320, 5)
Null values in data frame after dropping : 
 location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

Shape before dropping :  (13246, 5)
So number of rows dropped is equal to :  74


#### Correcting data so that every column holds values in the same unit

#### #Add a new integer feature for bhk (Bedrooms, Hall, Kitchen)

In [53]:
# Derive the integer 'bhk' column from the leading token of 'size'
# (e.g. '2 BHK' -> 2, '4 Bedroom' -> 4), then discard the original text column.
df3 = (
    df2
    .assign(bhk=df2['size'].map(lambda label: int(label.split(' ')[0])))
    .drop(columns=['size'])
)
print(df3['bhk'].unique())
df3.head()

[ 2  4  3  6  1  8  7  5 11  9 27 10 19 16 43 14 12 13 18]


Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2


#### #Converting total_sqft values given as a range into a single number

In [83]:
# Find the values in total_sqft that are written in the form of a range.
# Work on a copy so that df3 is left unchanged.
df4 = df3.copy()
def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Parameters
    ----------
    x : object
        Candidate value, typically a string read from the ``total_sqft`` column.

    Returns
    -------
    bool
        True when ``x`` can be converted with ``float()``; False when the
        conversion raises ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        float(x)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# Count the rows whose total_sqft cannot be parsed by float().
# Summing the boolean mask counts rows directly; the earlier `.count()[0]`
# relied on positional integer indexing of a label-indexed Series, which is
# deprecated in recent pandas, and on the first column having no nulls.
print('No of columns having total_sqft values in the form of range is : ', \
      (~df4['total_sqft'].apply(is_float).astype(bool)).sum())

No of columns having total_sqft values in the form of range is :  190


In [84]:
def convert_sqft_to_num(total_sqft):
    """Convert a ``total_sqft`` entry to a single float.

    Parameters
    ----------
    total_sqft : str or number
        Either a plain number (``"1056"``) or a range written as
        ``"<low> - <high>"``.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted (e.g. it carries a unit suffix or is
        missing).
    """
    # Try the plain-number case first so that a leading minus sign or an
    # exponent such as "1e-5" is not mistaken for a range separator.
    try:
        return float(total_sqft)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can describe a range; anything else is unparseable here.
    if not isinstance(total_sqft, str):
        return None

    bounds = total_sqft.split('-')
    if len(bounds) != 2:
        return None
    try:
        return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
        # Two parts separated by '-' but at least one is not a number.
        return None
# Rebuild df4 from df3 before converting, so this cell gives the same result
# every time it is run. Overwriting df4['total_sqft'] in place meant a second
# run passed already-converted floats to convert_sqft_to_num, whose
# `.split('-')` call only works on strings.
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
# Entries that could not be converted come back as null; drop those rows.
df4 = df4[df4['total_sqft'].notnull()]
df4.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [85]:
# Re-check after conversion: count the rows whose total_sqft still cannot be
# parsed by float(). Summing the boolean mask avoids `.count()[0]`, which used
# positional integer indexing on a label-indexed Series (deprecated in recent
# pandas).
print('No of columns having total_sqft values in the form of range after convertion : ', \
      (~df4['total_sqft'].apply(is_float).astype(bool)).sum())

No of columns having total_sqft values in the form of range after convertion :  0


#### #Add a new feature called price per square foot

In [86]:
# price_per_sqft = price * 100000 / total_sqft
# NOTE(review): the 100000 factor presumably converts 'price' from lakhs to
# rupees -- confirm the unit of the 'price' column.
df5 = df4.assign(
    price_per_sqft=lambda frame: frame['price'] * 100000 / frame['total_sqft']
)
df5.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


#### Handling Categorical values

In [117]:
# Normalise location names (strip surrounding whitespace) and count how many
# rows each location has, most frequent first.
df6 = df5.assign(location=df5['location'].map(lambda name: name.strip()))
location_stats = df6['location'].value_counts()

frequent_count = int((location_stats > 10).sum())
rare_count = int((location_stats <= 10).sum())

print('Count of unique categorical values in datafarme : \n', location_stats.head() )
print('No of location name which used more than 10 : ', frequent_count)
print('No of location name which used less than 10 : ', rare_count)
print('No of location different name : ', location_stats.size)

Count of unique categorical values in datafarme : 
 Whitefield         533
Sarjapur  Road     392
Electronic City    304
Kanakpura Road     264
Thanisandra        235
Name: location, dtype: int64
No of location name which used more than 10 :  240
No of location name which used less than 10 :  1047
No of location different name :  1287


In [122]:
# Locations with 10 or fewer data points are merged into a single "other" label.
location_stats_less_than_10 = location_stats.loc[location_stats.le(10)]
# The location names are the index of the counts Series, so test membership
# against that index and replace the matching rows.
is_rare_location = df6['location'].isin(location_stats_less_than_10.index)
df6['location'] = df6['location'].where(~is_rare_location, 'other')
df6.head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0
5,Whitefield,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,1310.0,3.0,63.25,3,4828.244275
9,other,1020.0,6.0,370.0,6,36274.509804
