In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("Bengaluru_House_Data.csv")

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 624.4+ KB


In [6]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         5199
3 BHK        

In [8]:
# dropping data which is of no use
data.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [9]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 364.3+ KB


## Filling the null values one by one

In [11]:
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [12]:
data['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: size, dtype: int64

In [13]:
data['size'] = data['size'].fillna('2 BHK')

In [14]:
data['bath'] = data['bath'].fillna(data['bath'].median())

In [15]:
data['location'] = data['location'].fillna('Sarjapur Road')

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 364.3+ KB


In [17]:
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [18]:
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [19]:
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [20]:
def ConvertRange(x):
    
    temp = x.split('-')
    if len(temp) ==2:
        return (float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [21]:
data['total_sqft'] = data['total_sqft'].apply(ConvertRange)

In [22]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


### Price per square feet

In [23]:
data['price_per_sqft'] = data['price']*10000 / data['total_sqft']

In [24]:
data['price_per_sqft']

0         369.981061
1         461.538462
2         430.555556
3         624.589086
4         425.000000
            ...     
13315     668.983493
13316    1111.111111
13317     525.854514
13318    1040.733632
13319     309.090909
Name: price_per_sqft, Length: 13320, dtype: float64

In [25]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,790.7501
std,1238.405258,1.338754,148.971674,1.294496,10642.96
min,1.0,1.0,8.0,1.0,26.78298
25%,1100.0,2.0,50.0,2.0,426.6865
50%,1276.0,2.0,72.0,3.0,543.4306
75%,1680.0,3.0,120.0,3.0,731.1746
max,52272.0,40.0,3600.0,43.0,1200000.0


In [26]:
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: location, Length: 1306, dtype: int64

In [27]:
data['location'] = data['location'].apply(lambda x: x.strip())
location_count = data['location'].value_counts()

In [28]:
location_count

Whitefield                            541
Sarjapur  Road                        399
Electronic City                       304
Kanakpura Road                        273
Thanisandra                           237
                                     ... 
1Channasandra                           1
Hosahalli                               1
Vijayabank bank layout                  1
near Ramanashree California resort      1
Abshot Layout                           1
Name: location, Length: 1295, dtype: int64

In [29]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: location, Length: 1054, dtype: int64

In [30]:
data['location']=data['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [31]:
data['location'].value_counts()

other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: location, Length: 242, dtype: int64

## Outlier detection and removal

In [32]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,790.7501
std,1238.405258,1.338754,148.971674,1.294496,10642.96
min,1.0,1.0,8.0,1.0,26.78298
25%,1100.0,2.0,50.0,2.0,426.6865
50%,1276.0,2.0,72.0,3.0,543.4306
75%,1680.0,3.0,120.0,3.0,731.1746
max,52272.0,40.0,3600.0,43.0,1200000.0


In [33]:
data.shape

(13320, 7)

In [34]:
data.price_per_sqft.describe()

count    1.327400e+04
mean     7.907501e+02
std      1.064296e+04
min      2.678298e+01
25%      4.266865e+02
50%      5.434306e+02
75%      7.311746e+02
max      1.200000e+06
Name: price_per_sqft, dtype: float64

In [35]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key,subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        
        st = np.std(subdf.price_per_sqft)
        
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output,gen_df],ignore_index =True)
    return df_output
data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,11488.0,11488.0,11488.0,11488.0,11488.0
mean,1537.97681,2.605066,105.238232,2.719185,630.308666
std,1145.700742,1.208022,138.520527,1.152969,363.65023
min,250.0,1.0,8.0,1.0,26.782981
25%,1100.0,2.0,50.0,2.0,428.571429
50%,1280.0,2.0,70.0,3.0,531.117047
75%,1660.0,3.0,110.0,3.0,684.960117
max,52272.0,27.0,3600.0,27.0,5315.0


In [36]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean' : np.mean(bhk_df.price_per_sqft),
                'std' : np.std(bhk_df.price_per_sqft),
                'count' : bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

In [37]:
data=bhk_outlier_remover(data)

In [38]:
data.describe

<bound method NDFrame.describe of                   location       size  total_sqft  bath  price  bhk  \
0      1st Block Jayanagar      4 BHK      2850.0   4.0  428.0    4   
1      1st Block Jayanagar      3 BHK      1630.0   3.0  194.0    3   
2      1st Block Jayanagar      6 BHK      1200.0   6.0  125.0    6   
3      1st Block Jayanagar      3 BHK      1875.0   2.0  235.0    3   
4      1st Block Jayanagar  7 Bedroom       930.0   4.0   85.0    7   
...                    ...        ...         ...   ...    ...  ...   
11479                other  7 Bedroom      1400.0   7.0  218.0    7   
11482                other      2 BHK      1353.0   2.0  110.0    2   
11483                other  1 Bedroom       812.0   1.0   26.0    1   
11486                other  4 Bedroom      1200.0   5.0  325.0    4   
11487                other      4 BHK      3600.0   5.0  400.0    4   

       price_per_sqft  
0         1501.754386  
1         1190.184049  
2         1041.666667  
3         1253.33

In [39]:
data

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,1501.754386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,1190.184049
2,1st Block Jayanagar,6 BHK,1200.0,6.0,125.0,6,1041.666667
3,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,1253.333333
4,1st Block Jayanagar,7 Bedroom,930.0,4.0,85.0,7,913.978495
...,...,...,...,...,...,...,...
11479,other,7 Bedroom,1400.0,7.0,218.0,7,1557.142857
11482,other,2 BHK,1353.0,2.0,110.0,2,813.008130
11483,other,1 Bedroom,812.0,1.0,26.0,1,320.197044
11486,other,4 Bedroom,1200.0,5.0,325.0,4,2708.333333


In [40]:
data.drop(columns=['size','price_per_sqft'],inplace=True)

## Cleaned Data

In [41]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1200.0,6.0,125.0,6
3,1st Block Jayanagar,1875.0,2.0,235.0,3
4,1st Block Jayanagar,930.0,4.0,85.0,7


In [42]:
data.to_csv("Cleaned_data.csv")

In [43]:
X = data.drop(columns=['price'])
y = data['price']

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [46]:
print(X_train.shape)
print(X_test.shape)

(6431, 4)
(1608, 4)


  ## Applying Linear Regression

In [47]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['location']),
                                      remainder='passthrough')

In [48]:
scaler = StandardScaler()

In [49]:
lr = LinearRegression(normalize=True)

In [50]:
pipe = make_pipeline(column_trans, scaler, lr)

In [51]:
pipe.fit(X_train,y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [54]:
y_pred_lr = pipe.predict(X_test)

In [55]:
r2_score(y_test, y_pred_lr)

0.7186946076876721

## Applying Lasso

In [56]:
lasso = Lasso()

In [57]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [58]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [59]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.714552117537556

## Applying Ridge

In [60]:
ridge = Ridge()

In [61]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [62]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [63]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.718657506349111

In [64]:
print("No Regularzation : ",r2_score(y_test, y_pred_lr))
print("Lasso : ",r2_score(y_test, y_pred_lasso))
print("Ridge : ", r2_score(y_test, y_pred_ridge))

No Regularzation :  0.7186946076876721
Lasso :  0.714552117537556
Ridge :  0.718657506349111
