In [63]:
import pandas as pd
import numpy as np 

In [64]:
data = pd.read_csv("./Bengaluru_House_Data.csv")


In [65]:
data.shape

(13320, 9)

In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


There are NaN values in the dataset

In [67]:
data.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


There is need of modification in the data to make it understandable to machine 

In [68]:
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [69]:
#drop the 'socity' and 'balcony' from the dataset because of many missing data
#drop the 'area_type' and 'availability' because those are not required

data.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace=True)

In [70]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [71]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [73]:
# removing the single NaN in 'location' by replacing it with max occured element "Whitefield"
data['location'].fillna('Whitefield', inplace=True) 

In [79]:
# replacing the Nan in 'size' with most occurred element '2 BHK'
data['size'].fillna('2 BHK', inplace=True)

In [82]:
# replacing the NaN in 'bath' with median of bath 
data['bath'].fillna(data['bath'].median(), inplace=True)

In [83]:
# checking for NaN values in any feature or column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [84]:
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [86]:
data['bhk'].value_counts()

2     5544
3     4857
4     1417
1      656
5      356
6      221
7      100
8       89
9       54
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: bhk, dtype: int64

In [88]:
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [107]:
# there are some inputs which are given as range in 'total_sqft' 
# replacing them with mean of that range

def rmv_range(x) : 
    temp = x.split(' - ') 
    try :
        return ( ( float(temp[0]) + float(temp[1]) ) / 2 ) if len(temp) == 2 else float(x)
    except :
        return None 

In [108]:
data['total_sqft'] = data['total_sqft'].apply(rmv_range)

In [109]:
# adding a new feature price per square feet
data['price_per_sqft'] = data['price']*1000000 / data['total_sqft']


In [110]:
data['location'].value_counts()

Whitefield                        541
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [111]:
data['location'] = data['location'].apply(lambda x: x.strip())
data['location'].value_counts()

Whitefield                        542
Sarjapur  Road                    399
Electronic City                   304
Kanakpura Road                    273
Thanisandra                       237
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1294, dtype: int64

In [113]:
#replace the places which come less than or equal to 10 by 'other'
loc_count = data['location'].value_counts()
loc_count_lh_10 = loc_count[loc_count <= 10]
data['location'] = data['location'].apply(lambda x : 'Other' if x in loc_count_lh_10 else x)

In [114]:
# checking for outlier in the 'total_sqft' 
# by checking the area/bhk 

(data['total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [115]:
# min is 0.25 
# bhk of 0.25 sq feet is not feesible 
# romoving the elements whose area/bhk < 300 

data = data[ (data['total_sqft']/data['bhk']) >= 300 ]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,63039.79
std,1261.271296,1.077938,152.077329,0.976678,41622.38
min,300.0,1.0,8.44,1.0,2678.298
25%,1116.0,2.0,49.0,2.0,42105.26
50%,1300.0,2.0,70.0,3.0,52941.18
75%,1700.0,3.0,115.0,3.0,69166.67
max,52272.0,16.0,3600.0,16.0,1764706.0


In [117]:
# removing outliers by location 
# if the price deviate from its mean more than its standard deviation then remove it 

def remove_outlier_sqft(df) : 
    df_output = pd.DataFrame() 
    for key, subdf in df.groupby('location') : 
        m = subdf['price_per_sqft'].mean() 
        st = subdf['price_per_sqft'].std() 
        
        gen_df = subdf[ ((m-st) < subdf['price_per_sqft']) & (subdf['price_per_sqft']<= (m+st)) ]
        df_output = pd.concat([df_output, gen_df], ignore_index=True) 
    return df_output 
data = remove_outlier_sqft(data) 
data.describe()
        

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10350.0,10350.0,10350.0,10350.0,10350.0
mean,1508.646058,2.47256,91.347418,2.575556,56607.51992
std,879.853598,0.980519,86.425645,0.898481,22711.119711
min,300.0,1.0,10.0,1.0,12500.0
25%,1110.0,2.0,49.0,2.0,42397.941737
50%,1286.0,2.0,67.0,2.0,51755.196679
75%,1650.0,3.0,100.0,3.0,64318.086274
max,30400.0,16.0,2200.0,16.0,245098.039216


In [119]:
def bhk_outlier_remover(df) : 
    exclude_idx = np.array([]) 
    for _, loc_df in df.groupby('location') :
        bhk_stats = {} 
        for bhk, bhk_df in loc_df.groupby('bhk') : 
            bhk_stats = {
                'mean' : bhk_df['price_per_sqft'].mean(),
                'std' : bhk_df['price_per_sqft'].std(),
                'count' : bhk_df['price_per_sqft'].shape[0]
            }
        for bhk, bhk_df in loc_df.groupby('bhk') : 
            stats = bhk_stats.get(bhk-1) 
            if stats and stats.count() > 5 : 
                exclude_idx = np.append(exclude_idx, bhk_df[bhk_df['price_per_sqft'] < stats['mean']].index.values)
    return df.drop(exclude_idx, axis='index')
data = bhk_outlier_remover(data)

In [120]:
data.size

72450

In [126]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,150175.438596
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,119018.404908
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,125333.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,108333.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,119838.05668


In [127]:
# removing the extra features 'size' and 'price_per_sqft' 
data.drop(columns=['size', 'price_per_sqft'], inplace=True)

In [128]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [129]:
# saving the cleaned data 
data.to_csv('cleaned_data.csv')

In [140]:
# X : input data 
# y : result data 

X, y = data.drop(columns=['price']), data['price'] 

In [141]:
# importing required libraries 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, Lasso, Ridge 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import r2_score 

In [142]:
# spliting of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [150]:
# OneHotEncoder will transform the location element into integer
column_trans = make_column_transformer( (OneHotEncoder(sparse=False), ['location']), remainder='passthrough')

In [149]:
# ------------------- Linear Regression ---------------------------------
 
pipe = make_pipeline(column_trans, StandardScaler(), LinearRegression(normalize=True))
pipe.fit(X_train, y_train)
y_pred_lr = pipe.predict(X_test)
r2_score(y_test, y_pred_lr)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




0.782462671966845

In [152]:
# ------------------------------------- lasso --------------------------------------

pipe = make_pipeline(column_trans, StandardScaler(), Lasso()) 
pipe.fit(X_train, y_train) 
y_pred_lasso = pipe.predict(X_test) 
r2_score(y_test, y_pred_lasso)  


0.7734594195952711

In [154]:
# ------------------------------- ridge --------------------------- 

pipe = make_pipeline(column_trans, StandardScaler(), Ridge()) 
pipe.fit(X_train, y_train) 
y_pred_ridge = pipe.predict(X_test) 
r2_score(y_test, y_pred_ridge)

0.7825333561248698

In [155]:
print('no regularization', r2_score(y_test, y_pred_lr))
print('lasso', r2_score(y_test, y_pred_lasso))
print('ridge', r2_score(y_test, y_pred_ridge)) 

no regularization 0.782462671966845
lasso 0.7734594195952711
ridge 0.7825333561248698


In [None]:
# ridge has highest r2_score 
import pickle 
pickle.dump(pipe, open('RidgeModel.pkl', 'wb'))