In [1]:

#Import Libraries


import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize']=(20,10)


__Pandas is an open-source library that provides high-performance data manipulation in Python. It is built on top of the NumPy package, so NumPy is required to use Pandas. The name Pandas is derived from "panel data", an econometrics term for multidimensional data sets.__

__Before Pandas, Python was capable of data preparation but offered only limited support for data analysis. Pandas filled this gap and enhanced Python's data-analysis capabilities. It supports the five main steps of data processing and analysis, irrespective of the origin of the data: load, manipulate, prepare, model, and analyze.__

                                                    ***
__NumPy is written mostly in C and is an extension module for Python. It is a Python package for numerical computation on single- and multi-dimensional arrays. Calculations on NumPy arrays are faster than the same calculations on ordinary Python lists.__



In [2]:
#Import Bengaluru Dataset

df1=pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
#Dataset Records and Features

df1.shape

(13320, 9)

In [4]:
# Pandas head() method is used to return top n rows of a data frame.

df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
df1.head(13320)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [6]:
#total null value

df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
# Drop Unnecessary Features Like 'society'

# errors="ignore" keeps this cell safe to re-run: on a second execution the
# 'society' column is already gone and a plain drop would raise KeyError.
df1 = df1.drop(columns="society", errors="ignore")

In [8]:
#Unique values

df1['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [9]:
df1.isnull().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [10]:
# drop all null values

df2=df1.dropna()
df2.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [11]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12710 entries, 0 to 13319
Data columns (total 8 columns):
area_type       12710 non-null object
availability    12710 non-null object
location        12710 non-null object
size            12710 non-null object
total_sqft      12710 non-null object
bath            12710 non-null float64
balcony         12710 non-null float64
price           12710 non-null float64
dtypes: float64(3), object(5)
memory usage: 893.7+ KB


In [12]:
# Unique Values Of 'total_Sqft'

df2.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [13]:
#Define Function Checking Float Values For 'total_sqft'  

def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, else False.

    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as KeyboardInterrupt are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text
        # such as '2100 - 2850'; OverflowError: integer too large for a float.
        return False
    return True

In [14]:
# Show rows whose 'total_sqft' value cannot be parsed as a float (e.g. ranges such as '2100 - 2850')

df2[~df2['total_sqft'].apply(is_float)].head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
30,Super built-up Area,19-Dec,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.0
122,Super built-up Area,18-Mar,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0
137,Super built-up Area,19-Mar,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005
165,Super built-up Area,18-Dec,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.49
188,Super built-up Area,Ready To Move,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.8


In [15]:
#This Function Convert sqft to Number 

def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    * ``'2100 - 2850'`` (two numbers separated by ``-``) -> their average.
    * ``'1056'`` or any other value ``float()`` accepts -> that number.
    * anything else (e.g. ``'34.46Sq. Meter'``, ``None``) -> ``None``.

    Unparsable input never raises; it yields ``None``.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. '-5' splits into ['', '5']);
                # fall through and try to parse the whole value instead.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None

In [16]:
#Average Value

convert_sqft_to_num('2100 - 2850')

2475.0

In [17]:
# df3 is df2 with 'total_sqft' parsed to numbers by convert_sqft_to_num;
# assign() returns a new frame, so df2 itself is left untouched.
df3 = df2.assign(total_sqft=lambda frame: frame['total_sqft'].apply(convert_sqft_to_num))
df3.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200.0,2.0,1.0,51.0


In [18]:
df3.loc[15]

area_type       Super built-up  Area
availability           Ready To Move
location                 Mysore Road
size                           2 BHK
total_sqft                      1175
bath                               2
balcony                            2
price                           73.5
Name: 15, dtype: object

In [19]:
# add price_per_sqft from price/total_sqft

# df7 is df3 plus a derived 'price_per_sqft' column; df3 is left untouched.
# NOTE(review): the 100000 factor presumably converts 'price' from lakhs to rupees - confirm units.
df7 = df3.assign(price_per_sqft=lambda frame: frame['price'] * 100000 / frame['total_sqft'])
df7.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,4250.0


In [20]:
# Remove outliers: within each location keep only rows whose price_per_sqft lies within one standard deviation of that location's mean

def remove_outlier(df5):
    """Drop per-location ``price_per_sqft`` outliers.

    For every ``location`` group, keep only the rows whose ``price_per_sqft``
    satisfies ``mean - std < value <= mean + std``, where ``std`` is the
    population standard deviation (ddof=0) of that group.

    Parameters
    ----------
    df5 : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        0..n-1 index. If nothing survives, an empty frame with the same
        columns as ``df5`` is returned.
    """
    kept_groups = []
    for _, group in df5.groupby('location'):
        pps = group['price_per_sqft']
        m = pps.mean()
        st = pps.std(ddof=0)  # ddof=0 matches np.std's default (population std)
        kept = group[(pps > (m - st)) & (pps <= (m + st))]
        if not kept.empty:
            kept_groups.append(kept)
    if not kept_groups:
        # pd.concat([]) raises, so hand back an empty frame that keeps the schema.
        return df5.iloc[0:0].reset_index(drop=True)
    # One concat at the end instead of growing a frame inside the loop (quadratic copying).
    return pd.concat(kept_groups, ignore_index=True)
    

In [21]:
df7 = remove_outlier(df7)
df7.shape

(9515, 9)

In [22]:
# NOTE(review): second outlier pass - the per-location mean/std are recomputed on the
# already-filtered frame, so this can trim further rows; confirm the repeat is intentional.
df7 = remove_outlier(df7)

In [23]:
# Drop Unnecessary Features Like 'area_type','availability'

# Reassign instead of mutating in place, and tolerate already-dropped columns,
# so re-running this cell does not raise KeyError.
df7 = df7.drop(columns=['availability', 'area_type'], errors='ignore')

__The strip() method returns a copy of the string by removing both the leading and the trailing characters (based on the string argument passed). The strip() method removes characters from both left and right based on the argument (a string specifying the set of characters to be removed).__

In [24]:
df7.location = df7.location.str.strip()
location_count = df7['location'].value_counts(ascending=False)

In [25]:
location_stats_less_than_9 = location_count[location_count<=9]

In [26]:
# Relabel every location that appears in location_stats_less_than_9 (whose index holds the location names) as 'other'.
df7['location'] = df7['location'].mask(df7['location'].isin(location_stats_less_than_9.index), 'other')

In [27]:
df7 = df7[df7.location != 'other']

In [28]:
df7.shape

(4993, 7)

In [29]:
df8 = df7.copy()

In [30]:
df8.shape

(4993, 7)

__A dummy variable is a variable that takes the values 0 and 1, where the values indicate the absence or presence of something. Where a categorical variable has more than two categories, it can be represented by a set of dummy variables, with one variable for each category. Numeric variables can also be dummy coded to explore nonlinear effects. Dummy variables are also known as indicator variables, design variables, contrasts, one-hot coding, and binary basis variables.__

In [31]:
location = pd.get_dummies(df8.location)

In [32]:
df8 = pd.concat([df8,location],axis='columns')

In [33]:
df8.shape

(4993, 161)

In [34]:
# Drop the raw 'location' text (its one-hot columns were concatenated onto df8)
# together with 'balcony' and 'price_per_sqft', in a single call.
df5 = df8.drop(columns=['location', 'balcony', 'price_per_sqft'])
df5

Unnamed: 0,size,total_sqft,bath,price,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Uttarahalli,Varthur,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur
26,4 BHK,2825.0,4.0,250.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27,3 BHK,1875.0,3.0,167.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,2 BHK,1394.0,2.0,100.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,2 BHK,1077.0,2.0,93.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,3 BHK,1590.0,3.0,131.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6247,6 Bedroom,2500.0,5.0,185.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6248,3 BHK,2500.0,3.0,138.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6249,2 BHK,1160.0,2.0,64.08,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6250,3 BHK,2503.0,3.0,138.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


__Lambda functions are defined using the keyword lambda. They can have any number of arguments but only one expression. A lambda function cannot contain any statements, and it returns a function object which can be assigned to any variable. They are generally used for one-line expressions. Regular functions are created using the def keyword. They can have any number of arguments and any number of expressions. They can contain any statements and are generally used for large blocks of code. The split() method splits a string into a list.__

In [35]:
df5['bhk'] = df5['size'].apply(lambda x: int(x.split(' ')[0]))
df5.bhk.unique()

array([4, 3, 2, 6, 5, 1, 8, 9, 7], dtype=int64)

In [36]:
# Keep only the leading room count from labels such as '2 BHK' / '4 Bedroom'.
# Taking the first whitespace-separated token does not depend on how many digits
# the count has, and astype(str) lets the cell be re-run after 'size' is already numeric.
# NOTE(review): 'size' then appears to carry the same number as the 'bhk' column
# derived from the same labels - confirm that both are meant to be model features.
df5['size'] = df5['size'].astype(str).str.split().str[0].astype('int')
df5

Unnamed: 0,size,total_sqft,bath,price,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Varthur,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur,bhk
26,4,2825.0,4.0,250.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
27,3,1875.0,3.0,167.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
28,2,1394.0,2.0,100.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
29,2,1077.0,2.0,93.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
30,3,1590.0,3.0,131.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6247,6,2500.0,5.0,185.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,6
6248,3,2500.0,3.0,138.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
6249,2,1160.0,2.0,64.08,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2
6250,3,2503.0,3.0,138.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3


In [37]:
# dropping the price

x=df5.drop("price",axis=1)
y=df5["price"]

In [38]:
# StandardScaler is also imported in the first cell; the import is repeated here.
from sklearn.preprocessing import StandardScaler
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before train_test_split is called,
# so test-set statistics influence the scaling - consider fitting on the training split only.
sc=StandardScaler()
x=sc.fit_transform(x)  # x is rebound to the scaler's output (an array by default - TODO confirm)

__The train-test split procedure is used to estimate the performance of machine learning algorithms when they are used to make predictions on data not used to train the model.__
                                                        ***
__train_test_split is a function in Sklearn model selection for splitting data arrays into two subsets: for training data and for testing data. With this function, you don't need to divide the dataset manually. By default, Sklearn train_test_split will make random partitions for the two subsets.__

1]__Supervised Learning:__

__In supervised learning the data is labeled, and the aim is for the algorithm to "learn" by comparing its actual output with the "taught" (labeled) outputs to find errors and modify the model accordingly. Supervised learning uses the learned patterns to predict label values on additional unlabeled data. Examples of supervised machine learning algorithms include linear regression for regression problems, random forest for classification and regression problems, and support vector machines for classification problems.__

                                                    ***
2]__Unsupervised Learning:__

__Unsupervised learning is a type of machine learning algorithm used to draw inferences from datasets consisting of input data without labeled responses. The most common unsupervised learning method is cluster analysis, which is used in exploratory data analysis to find hidden patterns or groupings in the data.__


In [39]:
from sklearn.model_selection import train_test_split
# Fixed seed so the 60/40 split - and every score computed from it - is reproducible across runs.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.4,random_state=1)


1] __Multilayer perceptron (MLP)__

__A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN). MLP uses a supervised learning technique called backpropagation for training. Its multiple layers and non-linear activations distinguish an MLP from a linear perceptron and allow it to distinguish data that is not linearly separable. MLPs are used for tabular datasets, for both classification and regression prediction problems.__


In [40]:
from lazypredict.Supervised import LazyRegressor
reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None )
models,predictions = reg.fit(xtrain, xtest, ytrain, ytest)

100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:51<00:00,  1.37s/it]


In [41]:
models

Unnamed: 0_level_0,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MLPRegressor,0.92,16.17,9.13
XGBRegressor,0.92,16.22,1.67
BayesianRidge,0.92,17.06,0.23
RidgeCV,0.91,17.08,0.27
TransformedTargetRegressor,0.91,17.1,0.19
LinearRegression,0.91,17.1,0.21
Ridge,0.91,17.1,0.09
Lars,0.91,17.11,0.21
LassoCV,0.91,17.11,0.58
LarsCV,0.91,17.12,0.62


In [42]:
# R^2 score of the MLP regressor on the train and test splits

from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(random_state=1, max_iter=1000).fit(xtrain, ytrain)
# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, so the labels say R^2.
print("Train R^2 (%) ",regr.score(xtrain,ytrain)*100)
print("Test R^2 (%) ",regr.score(xtest, ytest)*100)

Train Accuracy  97.0452922652164
Test Accuracy  94.6028991811611
