In [2]:
# Import the libraries we need

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

In [3]:
# Load the raw dataset from data.csv (path is relative to the working directory)

df = pd.read_csv('data.csv')
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [4]:
# Shape of the dataset as (rows, columns)

df.shape

(13320, 9)

In [5]:
# Column dtypes, non-null counts and memory usage of the dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
# Frequency of every distinct value, column by column

for column_name in df.columns:
    counts = df[column_name].value_counts()
    print(counts)
    print('----------------------')

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
----------------------
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
----------------------
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
----------------------
size
2 B

In [7]:
# Count the missing (NaN) values in each column

df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
# Discard the columns that could mislead the model

unused_columns = ['area_type', 'availability', 'society', 'balcony']
df = df.drop(unused_columns, axis=1)
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [9]:
# Collect stat of data

df.describe(include='all')

Unnamed: 0,location,size,total_sqft,bath,price
count,13319,13304,13320.0,13247.0,13320.0
unique,1305,31,2117.0,,
top,Whitefield,2 BHK,1200.0,,
freq,540,5199,843.0,,
mean,,,,2.69261,112.565627
std,,,,1.341458,148.971674
min,,,,1.0,8.0
25%,,,,2.0,50.0
50%,,,,2.0,72.0
75%,,,,3.0,120.0


In [10]:
# Fill missing locations with the most frequent location.
# The mode is computed from the data rather than hard-coded, so the cell stays correct if the dataset changes.

most_common_location = df['location'].mode().iloc[0]
df['location'] = df['location'].fillna(most_common_location)

In [11]:
# Fill missing sizes with the most frequent size.
# The mode is computed from the data rather than hard-coded, so the cell stays correct if the dataset changes.

most_common_size = df['size'].mode().iloc[0]
df['size'] = df['size'].fillna(most_common_size)

In [12]:
# Fill missing bathroom counts with the most frequent bathroom count.
# The mode is computed from the data rather than hard-coded, so the cell stays correct if the dataset changes.

most_common_bath = df['bath'].mode().iloc[0]
df['bath'] = df['bath'].fillna(most_common_bath)

In [13]:
# Inspect the distinct raw values of total_sqft (the column is stored as strings)

df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [14]:
# Some values are ranges written as 'x - y'; we replace those with the mean of x and y.
# Everything else is converted to float, and rows whose value cannot be parsed as a number are removed.

def convert(x):
    """Parse a raw ``total_sqft`` value into a float.

    A value of the form ``'low - high'`` is replaced by the mean of its two
    ends; any other value is passed to ``float()``.

    Parameters
    ----------
    x : str or number
        Raw cell value. Non-string input is accepted so that the function
        can be applied again to a column that has already been converted.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when the value is not numeric
        (including a range whose ends are not both numbers).
    """
    parts = str(x).split('-')
    try:
        if len(parts) == 2:
            # Both ends are parsed inside the try block so that a malformed
            # range yields None instead of raising ValueError.
            return (float(parts[0]) + float(parts[1])) / 2
        return float(x)
    except (TypeError, ValueError):
        return None

# Parse total_sqft into floats, then keep only the rows where parsing succeeded
parsed_sqft = df['total_sqft'].apply(convert)
df = df.assign(total_sqft=parsed_sqft).dropna(subset=['total_sqft'])

In [15]:
# Keep only the leading number of size (e.g. '2 BHK', '4 Bedroom') as a new integer column bhk, then drop size

bedroom_counts = df['size'].str.split().str.get(0).astype(int)
df = df.assign(bhk=bedroom_counts).drop(columns='size')

In [16]:
# Trim leading/trailing whitespace so that otherwise identical location names collapse into one category
# NOTE(review): str.strip() leaves internal whitespace untouched, so names such as 'Sarjapur  Road' (two spaces) are kept as-is

df['location'] = df['location'].str.strip()
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   304
Kanakpura Road                    271
Thanisandra                       236
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
Kannur                              1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1288, dtype: int64

In [17]:
# Group every location with fewer than 10 listings under the label 'other' to keep the number of categories small

location_value = df['location'].value_counts()
location_under_10 = location_value[location_value < 10]

# Test membership against the index (the location names) explicitly:
# `x in some_series` checks the index, not the values, which is easy to misread.
is_rare_location = df['location'].isin(location_under_10.index)
df['location'] = df['location'].mask(is_rare_location, 'other')
df['location'].value_counts()

location
other                   2736
Whitefield               540
Sarjapur  Road           399
Electronic City          304
Kanakpura Road           271
                        ... 
Nagappa Reddy Layout      10
BTM 1st Stage             10
Basapura                  10
Sector 1 HSR Layout       10
Nagadevanahalli           10
Name: count, Length: 255, dtype: int64

In [18]:
# Price per square foot, used below to spot outliers in price

df['price_per_feet'] = df['price'] / df['total_sqft']

# Helper function to find outliers

def detect_outliers_iqr(data, lower_pct=5, upper_pct=95, k=1.5):
    """Return the values of ``data`` that fall outside a percentile-based fence.

    The fence is ``[lower - k * spread, upper + k * spread]`` where ``lower``
    and ``upper`` are the ``lower_pct``-th and ``upper_pct``-th percentiles of
    ``data`` and ``spread = upper - lower``.

    Note: despite the function name, the defaults use the 5th and 95th
    percentiles rather than the quartiles of the textbook IQR rule, which
    makes the fence much wider. Pass ``lower_pct=25, upper_pct=75`` for the
    classic rule.

    Parameters
    ----------
    data : iterable of numbers
        Values to inspect.
    lower_pct, upper_pct : float, optional
        Percentiles (0-100) that delimit the central spread.
    k : float, optional
        Multiplier applied to the spread to build the fence.

    Returns
    -------
    list
        The outlying values in ascending order; empty when ``data`` is empty.
    """
    values = sorted(data)
    if not values:
        # np.percentile is undefined for empty input; there is nothing to flag.
        return []

    lower, upper = np.percentile(values, [lower_pct, upper_pct])
    spread = upper - lower
    lwr_bound = lower - (k * spread)
    upr_bound = upper + (k * spread)

    return [value for value in values if value < lwr_bound or value > upr_bound]

# Outliers in price per square foot
outliner_price = detect_outliers_iqr(df['price_per_feet'])
print("Outliers from Price: ", outliner_price)

# Outliers in the number of bedrooms
outliner_bhk = detect_outliers_iqr(df['bhk'])
print("Outliers from BHK: ", outliner_bhk)

# Outliers in the number of bathrooms
outliner_bath = detect_outliers_iqr(df['bath'])
print("Outliers from Bath: ", outliner_bath)

Outliers from Price:  [0.34050179211469533, 0.3429090909090909, 0.3499579377478668, 0.35, 0.350424197713021, 0.35515695067264574, 0.35625, 0.3627450980392157, 0.37058152793614596, 0.375, 0.3888888888888889, 0.43333333333333335, 0.44, 0.47619047619047616, 0.4799465240641711, 0.48484848484848486, 0.5, 0.5034965034965035, 0.5111111111111111, 0.5208333333333334, 0.5315, 0.56, 0.5977011494252874, 0.7653061224489796, 0.7692307692307693, 1.7647058823529411, 2.0, 3.6333333333333333, 6.25, 6.7272727272727275, 23.0, 120.0]
Outliers from BHK:  [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 13, 14, 16, 18, 19, 27, 43]
Outliers from Bath:  [12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 13.0, 13.0, 13.0, 14.0, 15.0, 16.0, 16.0, 18.0, 27.0, 40.0]


In [19]:
# Remove the outlier rows found above. Many price-per-square-foot values were flagged,
# so only the rows above 0.75 are removed for that column.

too_many_baths = df['bath'] >= 10
too_many_bedrooms = df['bhk'] >= 10
too_expensive = df['price_per_feet'] > 0.75

# Keep the rows that trip none of the three conditions; the helper column is no longer needed afterwards
rows_to_remove = too_many_baths | too_many_bedrooms | too_expensive
df = df[~rows_to_remove].drop(columns='price_per_feet')

In [20]:
# Renumber the rows from 0 after the removals, discarding the old index

df = df.reset_index(drop=True)
df

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.00,4
2,Uttarahalli,1440.0,2.0,62.00,3
3,Lingadheeranahalli,1521.0,3.0,95.00,3
4,Kothanur,1200.0,2.0,51.00,2
...,...,...,...,...,...
13220,Whitefield,3453.0,4.0,231.00,5
13221,other,3600.0,5.0,400.00,4
13222,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2
13223,Padmanabhanagar,4689.0,4.0,488.00,4


In [21]:
# Save the cleaned data.
# index=False keeps the row index out of the file, so reloading it does not produce a spurious 'Unnamed: 0' column.

df.to_csv('Cleaned_Data.csv', index=False)

In [22]:
# Separate the target (price) from the feature columns

y = df['price']
X = df.drop('price', axis=1)

In [23]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [24]:
# One-hot encode the categorical 'location' column and pass the remaining columns through unchanged.
# handle_unknown='ignore' encodes a location that was not seen during fit as all zeros
# instead of raising an error at prediction time.

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
column_trans = make_column_transformer((ohe, ['location']),
                                       remainder='passthrough')

In [25]:
# Standardization step for the pipeline (StandardScaler with default settings)
# NOTE(review): tree ensembles such as RandomForestRegressor are generally insensitive to feature scaling — confirm this step is needed

sc = StandardScaler()

In [26]:
# Model that we'll use.
# random_state is fixed (same seed as the train/test split) so that training and the reported score are reproducible.

rfr = RandomForestRegressor(random_state=0)

In [27]:
# Build the pipeline: column transformer -> scaler -> random forest regressor

pipe = make_pipeline(column_trans,sc,rfr)

# Fit the whole pipeline on the training split

pipe.fit(X_train,y_train)

In [28]:
# Predict prices for the held-out test split

y_pred=pipe.predict(X_test)

In [29]:
# Check the R2 score of the predictions against the true test prices

print(r2_score(y_test,y_pred))

0.6361652583313264


In [30]:
# Save the fitted pipeline to disk.
# The context manager guarantees the file is flushed and closed, even if pickling fails.

with open('House_Price_Predicitor.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)