### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Getting the data

In [2]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.shape

(13320, 9)

In [6]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [7]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
for column in df.columns:
    print(df[column].value_counts())
    print('++'*20)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
++++++++++++++++++++++++++++++++++++++++
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
++++++++++++++++++++++++++++++++++++++++
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
+++++++++++++

In [9]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

**Columns to be dropped**

In [10]:
df2 = df.drop(columns=['availability','society','balcony'],axis=1)

In [11]:
df2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [12]:
df2.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13319 non-null  object 
 2   size        13304 non-null  object 
 3   total_sqft  13320 non-null  object 
 4   bath        13247 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [14]:
df2.isnull().sum()

area_type      0
location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

**Dropping NaN rows**

In [15]:
df3 = df2.dropna()

In [16]:
df3.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [17]:
df3.shape

(13246, 6)

**Creating a new column bhk having only the integer**

In [18]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split()[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split()[0]))


In [19]:
df3.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2


**Houses with more than 20 bhk**

In [20]:
df3[df3.bhk > 20]

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
1718,Super built-up Area,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Plot Area,Munnekollal,43 Bedroom,2400,40.0,660.0,43


**Converting the range in total_sqft to single value**

In [21]:
df3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [22]:
# Function to convert the range of total_sqft

def convert_range(x):
    num = x.split('-')
    
    if len(num) == 2:
        return (float(num[0]) + float(num[1]))/2
    try:
        return float(x)
    except:
        return None

In [23]:
df3['total_sqft'] = df3['total_sqft'].apply(convert_range)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['total_sqft'] = df3['total_sqft'].apply(convert_range)


In [24]:
df3.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [25]:
df3.isnull().sum()

area_type      0
location       0
size           0
total_sqft    46
bath           0
price          0
bhk            0
dtype: int64

In [26]:
df3.shape

(13246, 7)

**Dropping na values**

In [27]:
df4 = df3.dropna()

In [28]:
df4.shape

(13200, 7)

In [29]:
df4.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [30]:
df4.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


**Adding Price per sqft column**

In [31]:
df4['price_per_sqft'] = df4['price']*100000 / df4['total_sqft']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['price_per_sqft'] = df4['price']*100000 / df4['total_sqft']


In [32]:
df4.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


**Location column Outliers**

In [33]:
df4['location'] = df4['location'].apply(lambda x: x.strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['location'] = df4['location'].apply(lambda x: x.strip())


In [34]:
location_count = df4.location.value_counts()

In [35]:
location_count

Whitefield                   533
Sarjapur  Road               392
Electronic City              304
Kanakpura Road               264
Thanisandra                  235
                            ... 
Rajanna Layout                 1
Subramanyanagar                1
Lakshmipura Vidyaanyapura      1
Malur Hosur Road               1
Abshot Layout                  1
Name: location, Length: 1287, dtype: int64

**Making location as 'other' if location_count is less than 10**

In [36]:
location_count_lessthan_10 = location_count[location_count <= 10]

In [37]:
location_count_lessthan_10

BTM 1st Stage                10
Gunjur Palya                 10
Nagappa Reddy Layout         10
Sector 1 HSR Layout          10
Thyagaraja Nagar             10
                             ..
Rajanna Layout                1
Subramanyanagar               1
Lakshmipura Vidyaanyapura     1
Malur Hosur Road              1
Abshot Layout                 1
Name: location, Length: 1047, dtype: int64

In [38]:
df4['location'] = df4['location'].apply(lambda x: 'other' if x in location_count_lessthan_10 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['location'] = df4['location'].apply(lambda x: 'other' if x in location_count_lessthan_10 else x)


In [39]:
df4.location.value_counts()

other              2872
Whitefield          533
Sarjapur  Road      392
Electronic City     304
Kanakpura Road      264
                   ... 
Doddaballapur        11
Tindlu               11
Marsur               11
HAL 2nd Stage        11
Kodigehalli          11
Name: location, Length: 241, dtype: int64

**price per bhk outliers**

In [40]:
df4.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.759
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4267.701
50%,1275.0,2.0,71.85,3.0,5438.331
75%,1672.0,3.0,120.0,3.0,7317.073
max,52272.0,40.0,3600.0,43.0,12000000.0


In [41]:
# We can see that there is a flat with a BHK of 0.25 sqft

(df4['total_sqft']/df4['bhk']).describe()

count    13200.000000
mean       573.847262
std        388.079980
min          0.250000
25%        473.000000
50%        552.000000
75%        625.000000
max      26136.000000
dtype: float64

In [42]:
# Flats with BHK of less than 300sqft

df4[((df4['total_sqft']/df4['bhk']) <= 300)]

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,Plot Area,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,Plot Area,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Plot Area,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.980810
68,Plot Area,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,Plot Area,other,3 Bedroom,500.0,3.0,100.0,3,20000.000000
...,...,...,...,...,...,...,...,...
13281,Plot Area,Margondanahalli,5 Bedroom,1375.0,5.0,125.0,5,9090.909091
13300,Plot Area,Hosakerehalli,5 Bedroom,1500.0,6.0,145.0,5,9666.666667
13303,Plot Area,Vidyaranyapura,5 Bedroom,774.0,5.0,70.0,5,9043.927649
13306,Plot Area,other,4 Bedroom,1200.0,5.0,325.0,4,27083.333333


In [43]:
# Taking only flats with BHK more the 300sqft

df5 = df4[((df4['total_sqft']/df4['bhk']) >= 300)]

In [44]:
df5.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12456.0,12456.0,12456.0,12456.0,12456.0
mean,1590.189927,2.562781,111.18796,2.649004,6308.502826
std,1260.404795,1.080275,152.203367,0.976046,4168.127339
min,300.0,1.0,9.0,1.0,267.829813
25%,1115.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [45]:
df5.shape

(12456, 8)

**Price per sqft outliers**

In [46]:
df5.price_per_sqft.describe()

count     12456.000000
mean       6308.502826
std        4168.127339
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [47]:
# function to remove price_per_sqft outlier

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

In [48]:
df6 = remove_pps_outliers(df5)

In [49]:
df6.shape

(10242, 8)

In [50]:
df6.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10242.0,10242.0,10242.0,10242.0,10242.0
mean,1504.023111,2.474419,90.991421,2.572642,5657.735993
std,876.798065,0.981908,86.147833,0.897242,2266.368844
min,300.0,1.0,10.0,1.0,1250.0
25%,1108.0,2.0,49.0,2.0,4244.796706
50%,1282.0,2.0,67.0,2.0,5172.846776
75%,1650.0,3.0,100.0,3.0,6426.000592
max,30400.0,16.0,2200.0,16.0,24509.803922


**bhk outliers**

In [51]:
# function to remove bhk outlier

def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

In [52]:
df7 = remove_bhk_outliers(df6)

In [53]:
df7.shape

(7317, 8)

In [54]:
df7.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Super built-up Area,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,Super built-up Area,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,Super built-up Area,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,Built-up Area,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,Super built-up Area,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


**Removing size and price_per_sqft columns**

**Cleaned Data**

In [55]:
# Cleaned data

df8 = df7.drop(['size','price_per_sqft'],axis=1)

In [56]:
df8.head()

Unnamed: 0,area_type,location,total_sqft,bath,price,bhk
0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,428.0,4
1,Super built-up Area,1st Block Jayanagar,1630.0,3.0,194.0,3
2,Super built-up Area,1st Block Jayanagar,1875.0,2.0,235.0,3
3,Built-up Area,1st Block Jayanagar,1200.0,2.0,130.0,3
4,Super built-up Area,1st Block Jayanagar,1235.0,2.0,148.0,2


In [57]:
df8.isnull().sum()

area_type     0
location      0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [58]:
df8.to_csv("Cleaned Bengaluru housing data.csv")

**Applying get_dummies for location and area_type**

In [59]:
location_dummies = pd.get_dummies(df8.location)

In [60]:
location_dummies.head()

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
area_dummies = pd.get_dummies(df8.area_type)

In [62]:
area_dummies.head()

Unnamed: 0,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,1,0,0,0
4,0,0,0,1


In [63]:
df9 = pd.concat([df8,location_dummies,area_dummies],axis=1)

In [64]:
df9.head()

Unnamed: 0,area_type,location,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,428.0,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Super built-up Area,1st Block Jayanagar,1630.0,3.0,194.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Super built-up Area,1st Block Jayanagar,1875.0,2.0,235.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Built-up Area,1st Block Jayanagar,1200.0,2.0,130.0,3,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Super built-up Area,1st Block Jayanagar,1235.0,2.0,148.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [65]:
df9.describe()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
count,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,...,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0,7317.0
mean,1493.516501,2.452098,98.839331,2.499932,0.000957,0.002187,0.001093,0.00082,0.00082,0.00328,...,0.001503,0.01189,0.00328,0.001093,0.005057,0.156348,0.180812,0.00574,0.083778,0.729671
std,860.566085,1.01509,93.090156,0.926439,0.030917,0.046714,0.03305,0.028626,0.028626,0.057181,...,0.038747,0.108399,0.057181,0.03305,0.070935,0.36321,0.384888,0.075551,0.277073,0.44416
min,300.0,1.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1096.0,2.0,50.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1260.0,2.0,73.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1680.0,3.0,112.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,30000.0,16.0,2200.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**Final DataFrame**

In [66]:
final_df = df9.drop(['location','area_type'],axis=1)

In [67]:
final_df

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10233,1200.0,2.0,70.0,2,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
10234,1800.0,1.0,200.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
10237,1353.0,2.0,110.0,2,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
10238,812.0,1.0,26.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


### Train Test Split

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
X = final_df.drop(['price'],axis=1)
y = final_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

## Applying different Regression Models

### Linear Regression Model

In [70]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)
lr_pred = lr.predict(X_test)
lr_pred

array([107.14648438, 784.92401123, 305.36523438, ...,  38.28399658,
       -24.64819336,  96.84222412])

### Lasso Regression Model

In [71]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X_train,y_train)
lasso_pred = lasso.predict(X_test)
lasso_pred

array([102.75777696, 217.3963742 , 325.30368517, ...,  41.43993613,
        57.55154333,  86.20835434])

### Random Forest Regression Model

In [72]:
from sklearn.ensemble import RandomForestRegressor
rr = RandomForestRegressor()
rr.fit(X_train,y_train)
rr_pred = rr.predict(X_test)
rr_pred

array([ 88.7575    , 524.52666667, 209.465     , ...,  41.11014286,
        47.45857857,  81.86669311])

### Ridge Regression Model

In [73]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train,y_train)
ridge_pred = ridge.predict(X_test)
ridge_pred

array([106.79617476, 725.95827759, 300.03294953, ...,  39.55443416,
         8.10138404,  96.46124914])

### Support Vector Regression

In [74]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train,y_train)
svr_pred = svr.predict(X_test)
svr_pred

array([ 78.53915524, 218.18554725, 256.69815138, ...,  42.78430649,
        53.28220978,  62.82205369])

### Model Evaluation

In [75]:
from sklearn.metrics import explained_variance_score,r2_score,mean_absolute_error,mean_squared_error

In [76]:
scores_df = pd.DataFrame({
    'Linear Regression':[r2_score(y_test,lr_pred),mean_absolute_error(y_test,lr_pred),mean_squared_error(y_test,lr_pred),np.sqrt(mean_squared_error(y_test,lr_pred))],
    'Lasso Regression':[r2_score(y_test,lasso_pred),mean_absolute_error(y_test,lasso_pred),mean_squared_error(y_test,lasso_pred),np.sqrt(mean_squared_error(y_test,lasso_pred))],
    'Random Forest Regression':[r2_score(y_test,rr_pred),mean_absolute_error(y_test,rr_pred),mean_squared_error(y_test,rr_pred),np.sqrt(mean_squared_error(y_test,rr_pred))],
    'Ridge Regression':[r2_score(y_test,ridge_pred),mean_absolute_error(y_test,ridge_pred),mean_squared_error(y_test,ridge_pred),np.sqrt(mean_squared_error(y_test,ridge_pred))],
    'Support Vector Regression':[r2_score(y_test,svr_pred),mean_absolute_error(y_test,svr_pred),mean_squared_error(y_test,svr_pred),np.sqrt(mean_squared_error(y_test,svr_pred))]
}, index=['R2_SCORE','MAE','MSE','RMSE'])

scores_df

Unnamed: 0,Linear Regression,Lasso Regression,Random Forest Regression,Ridge Regression,Support Vector Regression
R2_SCORE,0.841862,0.719534,0.84973,0.846255,0.627445
MAE,18.825612,23.851742,15.549865,17.962313,24.846711
MSE,1131.73965,2007.202565,1075.429611,1100.297748,2666.249219
RMSE,33.641338,44.801814,32.793743,33.170736,51.635736


In [77]:
d = {
    'Linear Regression':lr_pred,
    'Lasso Regression':lasso_pred,
    'Random Forest Regression':rr_pred,
    'Ridge Regression':ridge_pred,
    'Support Vector Regression':svr_pred
}

In [78]:
for i,j in d.items():
    print(i,':')
    print('\tExplained Variance Score:',explained_variance_score(y_test,j))
    print('\tr2 Score:',r2_score(y_test,j))
    print('\tMAE:',mean_absolute_error(y_test,j))    
    print('\tMSE:',mean_squared_error(y_test,j))
    print('\tRMSE:',np.sqrt(mean_squared_error(y_test,j)))
    print('\tPERFORMANCE IN PERCENTAGE:',round(r2_score(y_test,j)*100, 3),'%')

Linear Regression :
	Explained Variance Score: 0.8418744094074885
	r2 Score: 0.8418620645448485
	MAE: 18.8256115522541
	MSE: 1131.7396497639174
	RMSE: 33.64133840625128
	PERFORMANCE IN PERCENTAGE: 84.186 %
Lasso Regression :
	Explained Variance Score: 0.7196373864458516
	r2 Score: 0.7195336667734014
	MAE: 23.851741507564103
	MSE: 2007.2025654240358
	RMSE: 44.801814309512466
	PERFORMANCE IN PERCENTAGE: 71.953 %
Random Forest Regression :
	Explained Variance Score: 0.8502599834034764
	r2 Score: 0.8497302639658014
	MAE: 15.54986470289559
	MSE: 1075.4296111175145
	RMSE: 32.79374347520445
	PERFORMANCE IN PERCENTAGE: 84.973 %
Ridge Regression :
	Explained Variance Score: 0.8462808569088609
	r2 Score: 0.8462554402281913
	MAE: 17.962313342494852
	MSE: 1100.2977478392659
	RMSE: 33.17073631741186
	PERFORMANCE IN PERCENTAGE: 84.626 %
Support Vector Regression :
	Explained Variance Score: 0.6406546810905848
	r2 Score: 0.6274451045636702
	MAE: 24.84671095566687
	MSE: 2666.249218856925
	RMSE: 51.635

### From the above evaluation we can see that the Random Forest Regression Model performed the best

In [79]:
# DO NOT RUN CODE - TIME TO EXECUTE = 50 MINUTES

# dic = dict()
# for i in range(1,301):
#     rr = RandomForestRegressor(n_estimators=i,random_state=101)
#     rr.fit(X_train,y_train)
#     rr_pred = rr.predict(X_test)
#     dic[r2_score(y_test,rr_pred)] = i
    
# print(max(dic.items()))

# AFTER RUNNING THIS CODE IT IS FOUND THAT n_estimators=145 AND random_state=101 GIVES THE BEST RANDOM FOREST REGRESSOR MODEL

In [80]:
rr = RandomForestRegressor(n_estimators=145,random_state=101)
rr.fit(X_train,y_train)
rr_pred = rr.predict(X_test)

In [81]:
print('\tExplained Variance Score:',explained_variance_score(y_test,rr_pred))
print('\tr2 Score:',r2_score(y_test,rr_pred))
print('\tMAE:',mean_absolute_error(y_test,rr_pred))    
print('\tMSE:',mean_squared_error(y_test,rr_pred))
print('\tRMSE:',np.sqrt(mean_squared_error(y_test,rr_pred)))
print('\tPERFORMANCE IN PERCENTAGE:',round(r2_score(y_test,rr_pred)*100, 3),'%')

	Explained Variance Score: 0.8515963838996697
	r2 Score: 0.8510904969958224
	MAE: 15.568760841650644
	MSE: 1065.6948839721115
	RMSE: 32.64498252369132
	PERFORMANCE IN PERCENTAGE: 85.109 %


### Testing

In [82]:
'''Building a function to convert the string inputs into numbers for the RandomForestRegressor model and printing the 
predicted price'''

def predict_price(total_sqft,bath,bhk,location,area_type):
    
    loc_index = np.where(final_df.drop('price',axis=1).columns == location)[0][0]
    area_index = np.where(final_df.drop('price',axis=1).columns == area_type)[0][0]
    
    x = np.zeros(len(final_df.drop('price',axis=1).columns))
    x[0] = total_sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1
    
    if area_index >= 0:
        x[area_index] = 1
    
    return round(rr.predict([x])[0], 3)

In [83]:
# Function to convert location into 'other' if it is not is in list of locations

def check_location():
    location = input('\nEnter LOCATION: ')
    
    if location in final_df.columns[4:-5]:
        return location
    else:
        return 'other'

In [84]:
# Function to check if entered area type is valid or not    
def check_area_type():
    area_type = input('\nPick AREA TYPE from - Super built-up  Area, Built-up  Area, Carpet  Area, Plot  Area : \n')
    
    if area_type in final_df.columns[-4:]:
        return area_type
    else:
        return None

In [85]:
# Code to predict the house price
    
if __name__ == "__main__":
    
    total_sqft = float(input('Enter the TOTAL SQUARE FEET: '))
    bath = int(input('\nEnter the total number of BATHROOMS: '))
    bhk = int(input('\nEnter the total number of BEDROOMS: '))
    location = check_location()
    area_type = check_area_type()
    
    print('\n'+'*'*20)
    print('AMOUNT PREDICTED:\n')
    print(predict_price(total_sqft, bath, bhk, location, area_type),'LAKHS')
    print('*'*20)

Enter the TOTAL SQUARE FEET: 2850

Enter the total number of BATHROOMS: 4

Enter the total number of BEDROOMS: 4

Enter LOCATION: 1st Block Jayanagar

Pick AREA TYPE from - Super built-up  Area, Built-up  Area, Carpet  Area, Plot  Area : 
Super built-up  Area

********************
AMOUNT PREDICTED:

400.324 LAKHS
********************


### Tested Values

In [86]:
print(predict_price(2850, 4, 4, '1st Block Jayanagar','Super built-up  Area'),'LAKHS')

400.324 LAKHS


In [87]:
print(predict_price(1630, 3, 3, '1st Block Jayanagar','Super built-up  Area'),'LAKHS')

157.727 LAKHS


In [88]:
print(predict_price(1875, 2, 3, '1st Block Jayanagar','Super built-up  Area'),'LAKHS')

196.514 LAKHS


In [89]:
print(predict_price(1875, 3, 3, '1st Phase JP Nagar','Super built-up  Area'),'LAKHS')

152.027 LAKHS


In [90]:
print(predict_price(1500, 5, 5, '1st Block Jayanagar','Built-up  Area'),'LAKHS')

124.646 LAKHS


In [91]:
print(predict_price(2065, 4, 3, '1st Block Jayanagar','Super built-up  Area'),'LAKHS')

175.458 LAKHS


In [92]:
print(predict_price(1450, 2, 3, '2nd Phase Judicial Layout','Super built-up  Area'),'LAKHS')

60.269 LAKHS


In [93]:
print(predict_price(1150, 2, 2, '2nd Phase Judicial Layout','Super built-up  Area'),'LAKHS')

44.0 LAKHS


In [94]:
print(predict_price(1500, 4, 4, '2nd Stage Nagarbhavi','Plot  Area'),'LAKHS')

210.851 LAKHS


In [95]:
print(predict_price(2400, 8, 6, '2nd Stage Nagarbhavi','Plot  Area'),'LAKHS')

339.69 LAKHS


In [96]:
print(predict_price(1206, 2, 2, '5th Block Hbr Layout','Super built-up  Area'),'LAKHS')

56.496 LAKHS


In [97]:
print(predict_price(1270, 2, 3, '5th Block Hbr Layout','Super built-up  Area'),'LAKHS')

68.47 LAKHS


In [98]:
print(predict_price(918, 2, 2, '7th Phase JP Nagar','Built-up  Area'),'LAKHS')

57.659 LAKHS


In [99]:
print(predict_price(850,2, 2, '7th Phase JP Nagar','Built-up  Area'),'LAKHS')

40.834 LAKHS


In [100]:
print(predict_price(750, 1,1, '7th Phase JP Nagar','Super built-up  Area'),'LAKHS')

43.865 LAKHS


In [101]:
print(predict_price(1000, 1, 1, '7th Phase JP Nagar','Plot  Area'),'LAKHS')

62.054 LAKHS


In [102]:
print(predict_price(500, 1, 1, '8th Phase JP Nagar','Carpet  Area'),'LAKHS')

29.349 LAKHS


In [103]:
print(predict_price(460, 1, 1, 'Attibele','Carpet  Area'),'LAKHS')

13.277 LAKHS


In [104]:
print(predict_price(1200, 2, 4, 'Banashankari Stage II','Plot  Area'),'LAKHS')

146.342 LAKHS


In [105]:
print(predict_price(1222, 2, 2, 'Banaswadi','Super built-up  Area'),'LAKHS')

60.604 LAKHS
