In [27]:
# import pandas
import pandas as pd
from sklearn.model_selection import train_test_split

## Viewing data


In [28]:
data = pd.read_csv('./main_data.csv')
data

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Plot Area,18-Nov,Channasandra,3 Bedroom,,1200,3.0,1.0,67.77
1,Super built-up Area,Ready To Move,Kothanur,3 BHK,Gilleon,1820,3.0,3.0,77.00
2,Super built-up Area,Ready To Move,Banashankari,2 BHK,,1040,2.0,3.0,45.00
3,Super built-up Area,18-Apr,Electronic City,3 BHK,Prarkun,1599,3.0,2.0,125.00
4,Built-up Area,Ready To Move,Kadugodi,3 BHK,MSingco,1260,2.0,2.0,54.00
...,...,...,...,...,...,...,...,...,...
9985,Built-up Area,Ready To Move,Yelahanka,2 BHK,AdniaSk,1250,2.0,2.0,54.00
9986,Super built-up Area,Ready To Move,Sarjapur Road,3 BHK,SoniaEt,2070,4.0,2.0,160.00
9987,Super built-up Area,18-Apr,Virat Nagar,3 BHK,,1215,2.0,2.0,49.75
9988,Built-up Area,Ready To Move,3rd Block Hrbr Layout,2 BHK,Icncy R,1320,2.0,3.0,90.00


In [29]:
# null values
data.isnull().sum()

area_type          0
availability       0
location           0
size              13
society         4107
total_sqft         0
bath              51
balcony          460
price              0
dtype: int64

## Feature selection


In [30]:
# dropping features
data.drop(columns=['area_type','society','balcony','availability'],inplace=True)


In [31]:
# describe
data

Unnamed: 0,location,size,total_sqft,bath,price
0,Channasandra,3 Bedroom,1200,3.0,67.77
1,Kothanur,3 BHK,1820,3.0,77.00
2,Banashankari,2 BHK,1040,2.0,45.00
3,Electronic City,3 BHK,1599,3.0,125.00
4,Kadugodi,3 BHK,1260,2.0,54.00
...,...,...,...,...,...
9985,Yelahanka,2 BHK,1250,2.0,54.00
9986,Sarjapur Road,3 BHK,2070,4.0,160.00
9987,Virat Nagar,3 BHK,1215,2.0,49.75
9988,3rd Block Hrbr Layout,2 BHK,1320,2.0,90.00


## bhk data mining

In [32]:
data['size'] = data['size'].fillna('2 BHK')
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

## location data mining

In [33]:
data['location'] = data['location'].apply(lambda x: x.strip())
location_count = data['location'].value_counts()
location_less_then_10 = location_count[location_count<=10]

data['location'] = data['location'].apply(lambda x:'other' if x in location_less_then_10 else x)
data['location'].value_counts()

location
other                     2584
Whitefield                 429
Sarjapur  Road             288
Electronic City            229
Kanakpura Road             205
                          ... 
Gollarapalya Hosahalli      11
Kereguddadahalli            11
Cox Town                    11
Banashankari Stage V        11
Kodigehaali                 11
Name: count, Length: 193, dtype: int64

## total sqft data mining


In [34]:
# resolve range problem
def range_sqft(x):
    temp = x.split('-')
    try:
        if len(temp) == 2:
            return (float(temp[0])+float(temp[1]))/2
        else:
            return float(x)
    except:
        return 0.0
    
data['total_sqft'] = data['total_sqft'].apply(range_sqft)


In [35]:
data['total_sqft'].unique()

array([1200. , 1820. , 1040. , ..., 3170. , 1379. , 3416.5])

## bathroom missing value

In [36]:
data['bath'].isnull().sum()
data['bath'] = data['bath'].fillna(2.0)
data['bath'] = data['bath'].astype(float)
data['bhk'] = data['bhk'].astype(float)

data.drop(columns=['size'], inplace=True)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9990 entries, 0 to 9989
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    9990 non-null   object 
 1   total_sqft  9990 non-null   float64
 2   bath        9990 non-null   float64
 3   price       9990 non-null   float64
 4   bhk         9990 non-null   float64
dtypes: float64(4), object(1)
memory usage: 390.4+ KB


## Outlier


In [37]:
# remove sqft outlier
import numpy as np
def remove_sqft(df):
    df_out = pd.DataFrame()
    for key,subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.mean(subdf.price_per_sqft)
        gendf = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_out = pd.concat([df_out,gendf],ignore_index=True)
    return df_out


In [38]:
# outlier for total_sqft
import matplotlib.pyplot as plt
import seaborn as sns
data['total_sqft'] = data[data['total_sqft'] > 0.0 ]['total_sqft']
data['price_per_sqft'] = (data['price']*100000) / data['total_sqft']

data = data[((data['total_sqft']/data['bhk']) >= 300)]
data = remove_sqft(data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,9058.0,9058.0,9058.0,9058.0,9058.0
mean,1562.315895,2.514131,98.215645,2.610068,5811.248975
std,1169.2245,1.035246,106.779861,0.937648,2562.822994
min,300.0,1.0,8.44,1.0,267.829813
25%,1115.0,2.0,48.0,2.0,4184.55354
50%,1299.0,2.0,68.0,2.0,5230.706854
75%,1682.75,3.0,108.0,3.0,6666.666667
max,52272.0,16.0,2200.0,16.0,32000.0


## convert data frame to csv

In [39]:
data.drop(columns=['price_per_sqft'],inplace=True)
data.to_csv('../model/pure_data.csv',index=False)
response = data.drop(columns=['price'])
response.to_csv('../deploy/response.csv')