Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

Importing the Boston House Price Dataset

In [2]:
# Loading the dataset to a Pandas DataFrame
house_price_dataframe = pd.read_csv("BHP.csv")

In [3]:
# Print First 5 rows of our DataFrame
house_price_dataframe.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
house_price_dataframe.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
house_price_dataframe.tail()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.0
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.0
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.0
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.0
13319,Super built-up Area,Ready To Move,Doddathoguru,1 BHK,,550,1.0,1.0,17.0


In [6]:
# checking the number of rows and Columns in the data frame
house_price_dataframe.shape

(13320, 9)

In [7]:
for column in house_price_dataframe.columns:
    print(house_price_dataframe[column].value_counts())
    print("*"*30)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
******************************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
******************************
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
******************************
2 

In [8]:
# check for missing values
house_price_dataframe.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [9]:
df=house_price_dataframe.drop(columns=["area_type","society","availability","balcony"])

In [10]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [11]:
df.shape

(13320, 5)

In [12]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [13]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [14]:
value1=df["location"].mode()[0]
print(value1)

Whitefield


In [15]:
df["location"]=df["location"].fillna(df["location"].mode()[0])
df["size"]=df["size"].fillna(df["size"].mode()[0])
df["bath"]=df["bath"].fillna(df["bath"].mode()[0])

In [16]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [18]:
df["bhk"]=df["size"].str.split().str.get(0).astype(int)

In [19]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [20]:
df["bhk"].value_counts()

2     5544
3     4857
4     1417
1      656
5      356
6      221
7      100
8       89
9       54
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: bhk, dtype: int64

In [21]:
#handling of the total_sqft column

In [22]:
df["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [23]:
def convert_range(x):
    try:
        temp = x.split("_")
        if len(temp) == 2:
            return (float(temp[0]) + float(temp[1])) / 2
        elif len(temp) == 1:
            return float(temp[0])
        else:
            return None
    except (ValueError, TypeError):
        return None


In [24]:
df["total_sqft"]=df["total_sqft"].apply(convert_range)

In [25]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


## creating new column called as price_per_sqft

In [26]:
df["price_per_sqft"]=(df["price"]*100000)/df["total_sqft"]

In [27]:
df.head(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [28]:
df=df.drop(columns=["size"])

In [29]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13073.0,13320.0,13320.0,13320.0,13073.0
mean,1554.942029,2.688814,112.565627,2.802778,7949.6
std,1238.458773,1.338754,148.971674,1.294496,107244.0
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4265.734
50%,1275.0,2.0,72.0,3.0,5454.545
75%,1670.0,3.0,120.0,3.0,7338.057
max,52272.0,40.0,3600.0,43.0,12000000.0


In [30]:
df["location"].value_counts()

Whitefield                        541
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

## removing the space before and after the word

In [31]:
df["location"]=df["location"].apply(lambda x:x.strip())

In [32]:
df["location"].value_counts()

Whitefield                        542
Sarjapur  Road                    399
Electronic City                   304
Kanakpura Road                    273
Thanisandra                       237
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1294, dtype: int64

In [33]:
location_count=df["location"].value_counts()

In [34]:
location_count_less_10=location_count[location_count<10]

In [35]:
location_count_less_10

Vishwanatha Nagenahalli           9
Chennammana Kere                  9
2nd Phase JP Nagar                9
Jakkur Plantation                 9
B Narayanapura                    9
                                 ..
Bapuji Layout                     1
1st Stage Radha Krishna Layout    1
BEML Layout 5th stage             1
singapura paradise                1
Abshot Layout                     1
Name: location, Length: 1040, dtype: int64

In [36]:
df["location"]=df["location"].apply(lambda x : "other" if x in location_count_less_10 else x)

In [37]:
df["location"].value_counts()

other                  2755
Whitefield              542
Sarjapur  Road          399
Electronic City         304
Kanakpura Road          273
                       ... 
BTM 1st Stage            10
Basapura                 10
Sector 1 HSR Layout      10
Kalkere                  10
Nagadevanahalli          10
Name: location, Length: 255, dtype: int64

In [38]:
#calulating outlier for the sqft

(df["total_sqft"]/df["bhk"]).describe()

count    13073.000000
mean       573.254923
std        389.887823
min          0.250000
25%        472.000000
50%        551.000000
75%        625.000000
max      26136.000000
dtype: float64

In [39]:
df=df[(df["total_sqft"]/df["bhk"])>300]

In [40]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12147.0,12147.0,12147.0,12147.0,12147.0
mean,1596.728917,2.54392,110.876677,2.633654,6225.053321
std,1269.496189,1.059459,153.495355,0.9612,4072.524858
min,340.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4200.0
50%,1305.0,2.0,69.5,3.0,5274.725275
75%,1702.0,3.0,115.0,3.0,6845.926914
max,52272.0,16.0,3600.0,16.0,176470.588235


In [41]:
df.shape

(12147, 6)

## removing the outlier for the price_per_sqft

In [42]:
df["price_per_sqft"].describe()

count     12147.000000
mean       6225.053321
std        4072.524858
min         267.829813
25%        4200.000000
50%        5274.725275
75%        6845.926914
max      176470.588235
Name: price_per_sqft, dtype: float64

In [43]:
def remove_outlier_sqft(df1):
    df_output=pd.DataFrame()
    for key,subdf in df.groupby("location"):
        m=np.mean(subdf["price_per_sqft"])
        st=np.std(subdf["price_per_sqft"])
        
        gen_df=subdf[((subdf["price_per_sqft"]> m-st) & (subdf["price_per_sqft"] <+ (m+st)))]
        df_output=pd.concat([df_output,gen_df], ignore_index=True)
    return df_output


df=remove_outlier_sqft(df)
df.describe()
    

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,9902.0,9902.0,9902.0,9902.0,9902.0
mean,1504.658553,2.463745,91.190201,2.561705,5644.231486
std,877.522553,0.965232,88.405381,0.877728,2242.443504
min,350.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4246.105574
50%,1290.0,2.0,67.0,2.0,5176.359705
75%,1650.0,3.0,100.0,3.0,6410.25641
max,30400.0,16.0,2200.0,16.0,24509.803922


In [44]:
def bhk_outlier_remover(df):
    exclude_indices= np.array([])
    for location,location_df in df.groupby("location"):
        bhk_stats= {}
        for bhk,bhk_df in location_df.groupby("bhk"):
            bhk_stats[bhk]={
                    "mean":np.mean(bhk_df.price_per_sqft),
                    "stf":np.std(bhk_df.price_per_sqft),
                    "count":bhk_df.shape[0]
                }

                
            for bhk,bhk_df in location_df.groupby("bhk"):
                stats=bhk_stats.get(bhk-1)
                if stats and stats["count"]>5:
                    exclude_indices = np.append(exclude_indices,bhk_df[bhk_df.price_per_sqft<(stats["mean"])].index.values)
                    
    return df.drop(exclude_indices,axis="index")
                

In [45]:
df=bhk_outlier_remover(df)

In [46]:
df.shape

(7105, 6)

In [47]:
df

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...
9893,other,1200.0,2.0,70.0,2,5833.333333
9894,other,1800.0,1.0,200.0,1,11111.111111
9897,other,1353.0,2.0,110.0,2,8130.081301
9898,other,812.0,1.0,26.0,1,3201.970443


In [48]:
df.to_csv("cleaned_data.csv")

In [49]:
df.skew(numeric_only=True)

total_sqft        10.740167
bath               2.607347
price              6.831899
bhk                2.331129
price_per_sqft     2.055605
dtype: float64

In [50]:
df.dtypes

location           object
total_sqft        float64
bath              float64
price             float64
bhk                 int32
price_per_sqft    float64
dtype: object

In [51]:
x=df.drop(columns=["price"])
y=df.price

In [52]:
print(x)
print("...........................................................................................")
print(y)

                 location  total_sqft  bath  bhk  price_per_sqft
0     1st Block Jayanagar      2850.0   4.0    4    15017.543860
1     1st Block Jayanagar      1630.0   3.0    3    11901.840491
2     1st Block Jayanagar      1875.0   2.0    3    12533.333333
3     1st Block Jayanagar      1200.0   2.0    3    10833.333333
4     1st Block Jayanagar      1235.0   2.0    2    11983.805668
...                   ...         ...   ...  ...             ...
9893                other      1200.0   2.0    2     5833.333333
9894                other      1800.0   1.0    1    11111.111111
9897                other      1353.0   2.0    2     8130.081301
9898                other       812.0   1.0    1     3201.970443
9901                other      3600.0   5.0    4    11111.111111

[7105 rows x 5 columns]
...........................................................................................
0       428.0
1       194.0
2       235.0
3       130.0
4       148.0
        ...  
9893     70.0
9894 

In [53]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [54]:
print(x_train.shape,x_test.shape)

(5684, 5) (1421, 5)


## Applying the Linear Regression

In [55]:
column_trans=make_column_transformer((OneHotEncoder(sparse=False),["location"]),remainder="passthrough")

In [56]:
scaler=StandardScaler()

In [57]:
lr=LinearRegression()

In [58]:
pipe=make_pipeline(column_trans,scaler,lr)

In [59]:
pipe.fit(x_train,y_train)



In [60]:
y_ptrd_lr=pipe.predict(x_test)

In [61]:
r2_score(y_test,y_ptrd_lr)

0.8553992961197286

## Applying Lasso Regression

In [62]:
lasso=Lasso()

In [63]:
pipe=make_pipeline(column_trans,scaler,lasso)

In [64]:
pipe.fit(x_train,y_train)



In [65]:
y_predict_lasso=pipe.predict(x_test)

In [66]:
r2_score(y_test,y_predict_lasso)

0.8467316970248024

## Applying the Ridge

In [67]:
ridge=Ridge()

In [68]:
pipe=make_pipeline(column_trans,scaler,ridge)

In [69]:
pipe.fit(x_train,y_train)



In [70]:
ridge_test=pipe.predict(x_test)

In [71]:
r2_score(y_test,ridge_test)

0.8554939148884615