## Mumbai House Price Prediction using Linear Regression

In [79]:
# import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim

In [80]:
# Path to the raw listings CSV, relative to the notebook's working directory
data = "./data/mumbai_house_prices.csv"
# Read the whole file into a DataFrame
house_price = pd.read_csv(filepath_or_buffer=data)

In [81]:
# Print the dataset summary (columns, non-null counts, dtypes).
# DataFrame.info() writes the report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         76038 non-null  int64  
 1   type        76038 non-null  object 
 2   locality    76038 non-null  object 
 3   area        76038 non-null  int64  
 4   price       76038 non-null  float64
 5   price_unit  76038 non-null  object 
 6   region      76038 non-null  object 
 7   status      76038 non-null  object 
 8   age         76038 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.2+ MB
None


In [82]:
# Preview the first five rows of the raw data
house_price.head(n=5)

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [83]:
# Drop the string-valued location columns (locality, region): they are not
# encoded numerically in this notebook, so the regression below cannot use them.
house_price.drop(columns=['locality', 'region'], inplace=True)

In [84]:
# Dimensions of the frame as (rows, columns) after dropping the location columns
house_price.shape

(76038, 7)

In [85]:
# Show the distinct values of each categorical column, one array per line
for column in ("type", "age", "status"):
    print(house_price[column].unique())

['Apartment' 'Villa' 'Studio Apartment' 'Independent House' 'Penthouse']
['New' 'Resale' 'Unknown']
['Ready to move' 'Under Construction']


In [86]:
# Encode the categorical columns as integer codes.
# Each result is assigned back to its column instead of calling
# replace(..., inplace=True) on house_price["..."]: that form mutates an
# intermediate Series and, with pandas Copy-on-Write, leaves house_price unchanged.
# astype("int64") pins the dtype rather than relying on replace() downcasting an
# object column, and fails loudly if any label was left unmapped.
type_codes = {"Studio Apartment":0, "Apartment":1, "Independent House":2, "Villa": 3, "Penthouse": 4}
# NOTE(review): "Unknown" shares code 0 with "New" - confirm this is intended.
age_codes = {"New":0, "Resale":1, "Unknown": 0}
status_codes = {"Ready to move":0, "Under Construction":1}

house_price["type"] = house_price["type"].replace(type_codes).astype("int64")
house_price["age"] = house_price["age"].replace(age_codes).astype("int64")
house_price["status"] = house_price["status"].replace(status_codes).astype("int64")

In [87]:
# Preview the first five rows after encoding the categorical columns
house_price.head(n=5)

Unnamed: 0,bhk,type,area,price,price_unit,status,age
0,3,1,685,2.5,Cr,0,0
1,2,1,640,52.51,L,1,0
2,2,1,610,1.73,Cr,1,0
3,2,1,876,59.98,L,1,0
4,2,1,659,94.11,L,1,0


In [88]:
# Express every price in crore (Cr): rows whose price_unit is "L" are divided
# by 100; rows with any other unit are left as they are.
# A boolean mask replaces the original row-by-row loop, which was slow on the
# full dataset and relied on the index being exactly 0..n-1.
lakh_per_crore = 100
quoted_in_lakh = house_price["price_unit"] == "L"
house_price.loc[quoted_in_lakh, "price"] = house_price.loc[quoted_in_lakh, "price"] / lakh_per_crore

# The unit column carries no further information once all prices share one unit
house_price.drop(["price_unit"], axis=1, inplace=True)

In [89]:
# Preview the first five rows after the price conversion
house_price.head(n=5)

Unnamed: 0,bhk,type,area,price,status,age
0,3,1,685,2.5,0,0
1,2,1,640,0.5251,1,0
2,2,1,610,1.73,1,0
3,2,1,876,0.5998,1,0
4,2,1,659,0.9411,1,0


In [90]:
# split into train and test dataset
# A fixed random_state makes the sampled test rows - and therefore every number
# reported further down - reproducible across kernel restarts.
test_ratio = 0.1
split_seed = 42
test_size = int(test_ratio*len(house_price))
test_indices = house_price.sample(n=test_size, random_state=split_seed).index

# Rows picked for testing are removed from the training frame
X_train = house_price.drop(test_indices)
X_test = house_price.loc[test_indices]
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# pop() removes the target column from the feature frames and returns it
y_train = X_train.pop("price").tolist()
y_test = X_test.pop("price").tolist()

print("The size of X_train is: ", X_train.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_test is: ", len(y_test))
print(X_train.head())
print(y_train[0:5])
print(X_test.head())
print(y_test[0:5])

The size of X_train is:  (68435, 5)
The size of X_test is:  (7603, 5)
The size of y_train is:  68435
The size of y_test is:  7603
   bhk  type  area  status  age
0    3     1   685       0    0
1    2     1   640       1    0
2    2     1   610       1    0
3    2     1   876       1    0
4    2     1   659       1    0
[2.5, 0.5251, 1.73, 0.5998, 0.9411]
   bhk  type  area  status  age
0    3     1  1420       1    0
1    1     1   610       0    1
2    1     1   550       0    0
3    3     1  1159       1    0
4    1     1   684       0    1
[2.25, 0.38, 0.26, 2.25, 0.325]


In [91]:
# Per-column statistics come from the training data only
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0, ddof=0)

# Standardise both splits with the training mean and standard deviation
X_train = X_train.sub(X_mean, axis="columns").div(X_std, axis="columns")
X_test = X_test.sub(X_mean, axis="columns").div(X_std, axis="columns")

print(X_train.head())

        bhk      type      area    status       age
0  1.066462  0.028621 -0.506048 -0.831494 -0.666097
1 -0.016409  0.028621 -0.573270  1.202655 -0.666097
2 -0.016409  0.028621 -0.618086  1.202655 -0.666097
3 -0.016409  0.028621 -0.220724  1.202655 -0.666097
4 -0.016409  0.028621 -0.544888  1.202655 -0.666097


In [94]:
# Zero-initialised weights (one row per feature column) and a zero bias
W = np.zeros(shape=(X_train.shape[1], 1))
b = 0

# Targets become column vectors with one row per sample
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

In [95]:
def fit(X, y, W, b, learning_rate = 0.01, num_iterations = 1000):
    """Fit linear-regression parameters with batch gradient descent.

    Each iteration predicts ``Z = X @ W + b`` and moves ``W`` and ``b`` against
    the gradient of half the mean squared error. Every 100th iteration the
    mean absolute error of the pre-update prediction is printed; because that
    is not the quantity being minimised, it is not guaranteed to decrease
    from one report to the next.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Feature matrix with one row per sample.
    y : array-like, shape (m,) or (m, 1)
        Targets. They are reshaped to a column so that ``Z - y`` can never
        broadcast to an (m, m) matrix when a flat vector is passed.
    W : array-like, shape (n,) or (n, 1)
        Initial weights. The caller's array is not modified.
    b : float
        Initial bias.
    learning_rate : float, optional
        Step size applied to both gradients.
    num_iterations : int, optional
        Number of gradient steps.

    Returns
    -------
    tuple
        ``(W, b)`` where ``W`` is an (n, 1) array and ``b`` is a scalar.

    Raises
    ------
    ValueError
        If ``X`` has no rows or ``y`` does not have one target per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    W = np.asarray(W, dtype=float).reshape(-1, 1)

    m = X.shape[0]
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != m:
        raise ValueError("y must contain exactly one target per row of X")

    for i in range(num_iterations):
        Z = np.dot(X, W) + b
        diff = Z - y
        dw = (1/m) * np.dot(X.T, diff)
        db = (1/m) * np.sum(diff)

        # Plain subtraction builds new objects, so the caller's W is left intact
        W = W - learning_rate * dw
        b = b - learning_rate * db

        if i % 100 == 0:
            # Reported loss is the MAE of the prediction made before this update
            loss = (1/m) * np.sum(np.abs(diff))
            print("Loss after iteration %i: %f" %(i, loss))

    return W, b

In [96]:
# Train with the default learning rate and iteration count, starting from the
# current values of W and b
W, b = fit(X=X_train, y=y_train, W=W, b=b)

Loss after iteration 0: 1.683023
Loss after iteration 100: 0.858385
Loss after iteration 200: 0.776200
Loss after iteration 300: 0.760707
Loss after iteration 400: 0.758236
Loss after iteration 500: 0.758177
Loss after iteration 600: 0.758719
Loss after iteration 700: 0.759449
Loss after iteration 800: 0.760275
Loss after iteration 900: 0.761099


In [97]:
# Learned weights followed by the learned bias, one per line
print(W, b, sep="\n")

[[ 0.27081373]
 [-0.03391198]
 [ 1.47639864]
 [ 0.24035162]
 [ 0.11277351]]
1.6829499588961028


In [98]:
def predict(X, W, b):
    """Return the linear prediction ``X . W + b`` with negative values set to 0.

    X is the feature input, W the weight array and b the bias; the result is
    the array produced by ``np.dot(X, W) + b`` after the clamp.
    """
    prediction = b + np.dot(X, W)
    # Any prediction below zero is overwritten with 0
    below_zero = prediction < 0
    prediction[below_zero] = 0
    return prediction

# Spot check on the first test row: predicted price, then the actual price
price_pred = predict(X=X_test.loc[0], W=W, b=b)
print(price_pred, y_test[0], sep="\n")

[3.05865121]
[2.25]


In [99]:
# Mean absolute error on the training data, in Cr
z = predict(X_train, W, b)
mae = np.abs(z - y_train).sum() / y_train.shape[0]
print(mae)

0.7577421374818527


In [100]:
# Mean absolute error on the testing data, in Cr
z = predict(X_test, W, b)
mae = np.abs(z - y_test).sum() / y_test.shape[0]
print(mae)

0.7507560010771124


In [101]:
# Testing with our own data
# The sample is built as its own Series instead of being written over row 0 of
# X_test, so the held-out test set stays intact for any later evaluation.
sample = pd.Series({"bhk": 2, "type": 2, "area": 720, "status": 0, "age": 1}, dtype=float)

# Normalizing the sample with the training statistics, in the training column order
sample = (sample.reindex(X_mean.index) - X_mean) / X_std

z = predict(sample, W, b)
print("The price of house in Cr: ", z)

The price of house in Cr:  [0.76483971]
