## Mumbai House Price Prediction using Linear Regression

In [34]:
# import all the required libraries
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim

In [45]:
# Obtain the raw csv
data = "../../assets/data/mumbai_house_prices.csv"
house_price = pd.read_csv(data)
house_price = house_price.sample(frac=0.2)

# Reset index
house_price.reset_index(drop=True, inplace=True)

print(house_price)


       bhk       type                                     locality  area  \
0        3  Apartment                              Rajhans Kshitij  1285   
1        1  Apartment                        Ariha Ariha Signature   416   
2        1  Apartment          Runwal Runwal Forests Tower 9 To 11   600   
3        4  Apartment                          Vasant Vasant Vihar  2200   
4        1  Apartment                     Dharti Riddheshwar Tower   671   
...    ...        ...                                          ...   ...   
15203    1  Apartment  MICL Aaradhya Highpark Project 1 Of Phase I   480   
15204    2  Apartment                             Welcome C G Park   700   
15205    4  Apartment               JP Codename Dream Home Tower C  1060   
15206    3  Apartment                           Kalpataru Parkcity  1150   
15207    1  Apartment                         Vihang Vihang Valley   550   

       price price_unit          region              status      age  
0      88.00    

In [46]:
# printing the info for dataset
print(house_price.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15208 entries, 0 to 15207
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         15208 non-null  int64  
 1   type        15208 non-null  object 
 2   locality    15208 non-null  object 
 3   area        15208 non-null  int64  
 4   price       15208 non-null  float64
 5   price_unit  15208 non-null  object 
 6   region      15208 non-null  object 
 7   status      15208 non-null  object 
 8   age         15208 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1.0+ MB
None


In [47]:
house_price.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Rajhans Kshitij,1285,88.0,L,Vasai,Ready to move,Resale
1,1,Apartment,Ariha Ariha Signature,416,83.0,L,Goregaon West,Under Construction,New
2,1,Apartment,Runwal Runwal Forests Tower 9 To 11,600,95.0,L,Kanjurmarg,Under Construction,New
3,4,Apartment,Vasant Vasant Vihar,2200,2.25,Cr,Thane West,Ready to move,Unknown
4,1,Apartment,Dharti Riddheshwar Tower,671,58.0,L,Kamothe,Ready to move,Resale


In [48]:
# create a dictionary to map region to their latitude and longitude
geo = Nominatim(user_agent="Geopy Library", timeout=10)  # Adjust timeout value if error is raised
unique_regions = house_price["region"].unique()
print("Total number of unique values: ", len(unique_regions))
lat_long_dict = {}
unknown_regions = []

for r in unique_regions:
    loc = geo.geocode(r + ", Mumbai")
    if (loc == None):
        unknown_regions.append(r)
        continue
    else:
        latitude = loc.latitude
        longitude = loc.longitude
        lat_long_dict[r] = [latitude, longitude]

print("Geopy could not find the following regions: ", unknown_regions, len(unknown_regions))

# This cell might take about 2 minutes to execute as it is geocoding each region

Total number of unique values:  162
Geopy could not find the following regions:  ['Mira Road East', 'Ambernath East', 'Neral', 'Koper Khairane', 'Badlapur West', 'Bhayandar East', 'Ambernath West', 'Karjat', 'Ulhasnagar', 'Maneklal Estate', 'Ville Parle East', 'Nala Sopara', 'Karanjade', 'Patlipada', 'Badlapur East', 'Bhayandar West', 'Kasheli', 'Anjurdive', 'Dronagiri', 'Saphale', 'Taloje', 'Titwala', 'Koproli', 'Napeansea Road', 'Virar West', 'Usarghar Gaon', 'Nilje Gaon', 'Umroli', 'Vichumbe', 'Sector 17 Ulwe', 'Kewale', 'Virar East', 'Sector 19 Kharghar', 'Kalher', 'Sector 7 Kharghar', 'Vasai east', 'Owale', 'Palava', 'Vangani', 'Vevoor', 'Sector 20 Kamothe', 'Nalasopara East'] 42


In [49]:
# we remove all rows which contain unidentified regions and for all rows with identified regions, we add their latitiude and longitude to the data set.
del_idx = []
for i in range(len(house_price)):
    region = house_price.loc[i, "region"]
    if region in unknown_regions:
        del_idx.append(i)
    else:
        lat_long = lat_long_dict[region]
        house_price.at[i, "latitude"] = lat_long[0] 
        house_price.at[i, "longitude"] = lat_long[1]


In [50]:
house_price.drop(del_idx, inplace=True)
house_price = house_price.reset_index(drop=True)

In [51]:
house_price.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age,latitude,longitude
0,3,Apartment,Rajhans Kshitij,1285,88.0,L,Vasai,Ready to move,Resale,19.318937,72.897951
1,1,Apartment,Ariha Ariha Signature,416,83.0,L,Goregaon West,Under Construction,New,19.163328,72.8412
2,1,Apartment,Runwal Runwal Forests Tower 9 To 11,600,95.0,L,Kanjurmarg,Under Construction,New,19.129687,72.92837
3,4,Apartment,Vasant Vasant Vihar,2200,2.25,Cr,Thane West,Ready to move,Unknown,19.026011,73.010167
4,1,Apartment,Dharti Riddheshwar Tower,671,58.0,L,Kamothe,Ready to move,Resale,19.016434,73.080655


In [52]:
house_price.shape

(12308, 11)

In [53]:
# remove columns locality and region since they do not hold with the idea of linear regression
house_price.drop(['locality', 'region'], axis=1, inplace=True)

In [54]:
# print all unique values of categorical columns
print(house_price.type.unique())
print(house_price.age.unique())
print(house_price.status.unique())

['Apartment' 'Studio Apartment' 'Villa' 'Independent House' 'Penthouse']
['Resale' 'New' 'Unknown']
['Ready to move' 'Under Construction']


In [55]:
# dealing with categorical data
house_price["type"].replace({"Studio Apartment":0, "Apartment":0.2, "Independent House":0.4, "Villa": 0.6, "Penthouse": 1}, inplace=True)
house_price["age"].replace({"New":0, "Resale":1, "Unknown": 0.5}, inplace=True)
house_price["status"].replace({"Ready to move":0, "Under Construction":1}, inplace=True)

In [56]:
# calculating price in Crores for each using price and price_unit
for i in range(len(house_price)):  
    price_unit = house_price.loc[i, "price_unit"]

    if price_unit == "L":
        house_price.at[i, "price"] = (house_price.at[i, "price"] / 100)
  
house_price.drop(["price_unit"], axis=1, inplace=True)

In [57]:
house_price.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,3,0.2,1285,0.88,0,1.0,19.318937,72.897951
1,1,0.2,416,0.83,1,0.0,19.163328,72.8412
2,1,0.2,600,0.95,1,0.0,19.129687,72.92837
3,4,0.2,2200,2.25,0,0.5,19.026011,73.010167
4,1,0.2,671,0.58,0,1.0,19.016434,73.080655


In [58]:
import random

# Define the ratios for train, test, and validation sets
test_ratio = 0.2
val_ratio = 0.2

# Shuffle the indices of the dataset
indices = list(house_price.index)
random.shuffle(indices)

# Calculate the number of samples for the testing and validation sets
test_size = int(test_ratio * len(house_price))
val_size = int(val_ratio * len(house_price))

# Split the indices into training, testing, and validation sets
test_indices = indices[:test_size]
val_indices = indices[test_size:test_size+val_size]
train_indices = indices[test_size+val_size:]

# Create training, validation, and testing sets
X_train = house_price.loc[train_indices]
X_val = house_price.loc[val_indices]
X_test = house_price.loc[test_indices]

# Extract target variable
y_train = X_train.pop("price").tolist()
y_val = X_val.pop("price").tolist()
y_test = X_test.pop("price").tolist()

print("The size of X_train is: ", X_train.shape)
print("The size of X_val is: ", X_val.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_val is: ", len(y_val))
print("The size of y_test is: ", len(y_test))

The size of X_train is:  (7386, 7)
The size of X_val is:  (2461, 7)
The size of X_test is:  (2461, 7)
The size of y_train is:  7386
The size of y_val is:  2461
The size of y_test is:  2461


In [59]:
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)

# Normalizing the data
X_train = (X_train - X_mean)/X_std
X_val = (X_val - X_mean)/X_std
X_test = (X_test - X_mean)/X_std

print(X_train.head())

            bhk      type      area    status       age  latitude  longitude
1884  -1.188607  0.016345 -0.659658 -0.836825  0.164917  1.195190   1.477070
12229 -0.130125  0.016345 -0.469614 -0.836825 -0.987473  0.718443  -0.048327
3862  -1.188607  0.016345 -0.639297  1.194993  0.164917 -0.541864  -0.626380
502   -1.188607  0.016345 -0.666446 -0.836825  1.317307 -0.851758   0.592144
815   -0.130125  0.016345 -0.537487 -0.836825 -0.987473 -1.380500   0.700361


In [28]:
y_train = np.array(y_train)
y_train = y_train.reshape(y_train.shape[0], 1)
y_test = np.array(y_test)
y_test = y_test.reshape(y_test.shape[0], 1)
y_val = np.array(y_val)
y_val = y_val.reshape(y_val.shape[0], 1)

In [29]:
# Defining our kernel functions in terms of inner products
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

In [30]:
def polynomial_kernel(x, y, p=3):
    return (1 + np.dot(x, y)) ** p

In [31]:
def rbf_kernel(x, y, sigma=5.0):
    return np.exp(-(np.linalg.norm(x-y)**2) / (2 * (sigma**2)))

In [74]:
# Defining the kernel matrix
num_samples = len(X_train)

K = np.zeros((num_samples,num_samples))

for i in range(num_samples):
    for j in range(num_samples):
        K[i][j] = np.dot(X_train.loc[i], X_train.loc[j])
        break

KeyError: 2

In [None]:
def fit(X, y, W, b, learning_rate=0.1, num_iterations=1000, l2_lambda=0.01):
    m = X.shape[0]

    for i in range(num_iterations):
        Z = np.dot(X, W) + b
        diff = Z - y
        dw = (1/m) * np.dot(X.T, diff) + (l2_lambda / m) * W  # L2 regularization
        db = (1/m) * np.sum(diff)

        W = W - learning_rate * dw
        b = b - learning_rate * db
        
        if i % 100 == 0:
            loss = (1/m) * np.sum(np.abs(diff))
            # Add regularization term to loss
            regularization_loss = (l2_lambda / (2 * m)) * np.sum(np.square(W))
            total_loss = loss + regularization_loss
            print("Loss after iteration %i: %f" % (i, total_loss))
        
    return W, b

In [19]:
W = np.zeros((X_train.shape[1], 1))
b = 0
y_train = np.array(y_train)
y_train = y_train.reshape(y_train.shape[0], 1)
y_test = np.array(y_test)
y_test = y_test.reshape(y_test.shape[0], 1)
y_val = np.array(y_val)
y_val = y_val.reshape(y_val.shape[0], 1)

In [74]:
def fit(X, y, W, b, learning_rate=0.1, num_iterations=1000, l2_lambda=0.01):
    m = X.shape[0]

    for i in range(num_iterations):
        Z = np.dot(X, W) + b
        diff = Z - y
        dw = (1/m) * np.dot(X.T, diff) + (l2_lambda / m) * W  # L2 regularization
        db = (1/m) * np.sum(diff)

        W = W - learning_rate * dw
        b = b - learning_rate * db
        
        if i % 100 == 0:
            loss = (1/m) * np.sum(np.abs(diff))
            # Add regularization term to loss
            regularization_loss = (l2_lambda / (2 * m)) * np.sum(np.square(W))
            total_loss = loss + regularization_loss
            print("Loss after iteration %i: %f" % (i, total_loss))
        
    return W, b

In [92]:
def predict(X, W, b):
    z = np.dot(X, W) + b
    # For all negative values, replace them with 0
    z[z < 0] = 0
    return z

In [76]:
# Tuning hyperparameters using validation set
W, b = fit(X_train, y_train, W, b, 0.1, 1000)

# Calculate the mean absolute error on validation data in Cr
z = predict(X_train, W, b)
mae = np.sum(np.abs(z - y_train)) / len(y_train)
print(mae)

# Reset parameters
W = np.zeros((X_train.shape[1], 1))
b = 0

Loss after iteration 0: 1.913733
Loss after iteration 100: 0.711913
Loss after iteration 200: 0.715573
Loss after iteration 300: 0.716325
Loss after iteration 400: 0.716437
Loss after iteration 500: 0.716453
Loss after iteration 600: 0.716455
Loss after iteration 700: 0.716456
Loss after iteration 800: 0.716456
Loss after iteration 900: 0.716456
0.6822603615645657


In [77]:
# Train on both training and validation data
W, b = fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)), W, b, 0.1, 1000)

Loss after iteration 0: 1.910651
Loss after iteration 100: 0.703065
Loss after iteration 200: 0.706368
Loss after iteration 300: 0.707122
Loss after iteration 400: 0.707236
Loss after iteration 500: 0.707253
Loss after iteration 600: 0.707256
Loss after iteration 700: 0.707256
Loss after iteration 800: 0.707256
Loss after iteration 900: 0.707256


In [78]:
# Calculate the mean absolute error on test data in Cr
z = predict(X_test, W, b)
mae = np.sum(np.abs(z - y_test)) / len(y_test)
print(mae)

0.6803586287923034


In [79]:
print(W)
print(b)

[[ 0.09478   ]
 [ 0.00430646]
 [ 1.69715741]
 [ 0.16633868]
 [-0.05334328]
 [-0.17034898]
 [-0.64124027]]
1.9132840473456896


In [97]:
# Testing with our own data
X_test.loc[0, "bhk"] = 2
X_test.loc[0, "type"] = 0.2
X_test.loc[0, "area"] = 594
X_test.loc[0, "age"] = 1
X_test.loc[0, "status"] = 0
X_test.loc[0, "latitude"] = 19.432762
X_test.loc[0, "longitude"] = 72.84564

# Normalizing the sample
X_test.loc[0] = (X_test.loc[0] - X_mean) / X_std

z = predict(X_test.loc[0], W, b)
print("The price of house in Cr: ", z)

The price of house in Cr:  [0.68984355]
