## Mumbai House Price Prediction using K Nearest Neighbors (KNN)

In [1]:
# import all the required libraries
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim

In [2]:
# Path of the raw Mumbai listings CSV, relative to this notebook
data = "../assets/data/mumbai_house_prices.csv"

# Read every listing into a DataFrame
house_price = pd.read_csv(filepath_or_buffer=data)

In [3]:
# printing the info for dataset
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         76038 non-null  int64  
 1   type        76038 non-null  object 
 2   locality    76038 non-null  object 
 3   area        76038 non-null  int64  
 4   price       76038 non-null  float64
 5   price_unit  76038 non-null  object 
 6   region      76038 non-null  object 
 7   status      76038 non-null  object 
 8   age         76038 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.2+ MB
None


In [4]:
# Preview the first rows of the raw listings as loaded from the CSV
house_price.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [5]:
# create a dictionary to map region to their latitude and longitude
# This cell might take about 2 minutes to execute as it is geocoding each region
# NOTE(review): every unique region triggers one request to the Nominatim service;
# check the service's usage policy / consider caching the result before re-running often.
geo = Nominatim(user_agent="Geopy Library", timeout=10)  # Adjust timeout value if error is raised
unique_regions = house_price["region"].unique()
print("Total number of unique values: ", len(unique_regions))

lat_long_dict = {}    # region name -> [latitude, longitude]
unknown_regions = []  # region names the geocoder returned nothing for

for r in unique_regions:
    # Append the city so the lookup is restricted to the Mumbai area
    loc = geo.geocode(r + ", Mumbai")
    if loc is None:
        # geocode() yields None when no match is found
        unknown_regions.append(r)
    else:
        lat_long_dict[r] = [loc.latitude, loc.longitude]

print("Geopy could not find the following regions: ", unknown_regions, len(unknown_regions))

Total number of unique values:  228
Geopy could not find the following regions:  ['Mira Road East', 'Badlapur East', 'Badlapur West', 'Ambernath West', 'Ulhasnagar', 'Kewale', 'Nala Sopara', 'Karanjade', 'Neral', 'Karjat', 'Dronagiri', 'Navade', 'Owale', 'Ville Parle East', 'Vangani', 'Bhayandar East', 'Ambernath East', 'Nilje Gaon', 'Titwala', 'Koper Khairane', 'Napeansea Road', 'Koproli', 'Anjurdive', 'Taloje', 'Vasai West', 'Vasai east', 'Nalasopara East', 'Saphale', 'Kasheli', 'Panch Pakhdi', 'Hiranandani Estates', 'Vichumbe', 'Sector 17 Ulwe', 'Sector 23 Ulwe', 'Sector 20 Kamothe', 'Sector 30 Kharghar', 'Virar East', 'Sector 8 New panvel', 'Bhayandar West', 'Sector 20 Ulwe', 'Virar West', 'Palava', 'Greater Khanda', 'Sector-35D Kharghar', 'Umroli', 'Sector-9 Ulwe', 'Sector-3 Ulwe', 'kasaradavali thane west', 'Sector 19 Kharghar', 'Kalher', 'Sector 21 Kharghar', 'Usarghar Gaon', 'Patlipada', 'Vevoor', 'Sector 7 Kharghar', 'Badlapur', 'Khanda Colony', 'Gauripada', 'Warai', 'Khatiwal

In [6]:
# We collect all rows that belong to unidentified regions (they are dropped in the
# next cell) and, for all rows with identified regions, add their latitude and
# longitude to the data set.
region_column = house_price["region"]

# Index labels of the rows whose region could not be geocoded.
# Vectorized membership test instead of a per-row `in list` check.
del_idx = house_price.index[region_column.isin(unknown_regions)].tolist()

# Series.map leaves NaN wherever the region has no entry in the mapping, which is
# exactly what the rows of unknown regions should hold until they are dropped.
house_price["latitude"] = region_column.map(
    {name: coords[0] for name, coords in lat_long_dict.items()}
)
house_price["longitude"] = region_column.map(
    {name: coords[1] for name, coords in lat_long_dict.items()}
)


In [7]:
# Discard the rows flagged in del_idx and renumber the remaining rows from 0
house_price = house_price.drop(index=del_idx).reset_index(drop=True)

In [8]:
# Preview the listings after latitude/longitude were added and unknown-region rows dropped
house_price.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age,latitude,longitude
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New,19.117249,72.833968
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New,19.013755,72.846294
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New,19.229456,72.84799
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New,18.990978,73.065553
4,2,Apartment,Bhoomi Simana Wing A Phase 1,826,3.3,Cr,Parel,Under Construction,New,19.009482,72.837661


In [9]:
# (rows, columns) left after the rows of unidentified regions were removed
house_price.shape

(61217, 11)

In [10]:
# Remove the locality and region columns: they are free-text names that are not
# meaningful for a distance metric. The type column is kept and encoded numerically
# further below.
house_price = house_price.drop(columns=['locality', 'region'])

In [11]:
# print all unique values of categorical columns
print(house_price.type.unique())
print(house_price.age.unique())
print(house_price.status.unique())

['Apartment' 'Villa' 'Independent House' 'Studio Apartment' 'Penthouse']
['New' 'Resale' 'Unknown']
['Ready to move' 'Under Construction']


In [12]:
# dealing with categorical data: every category is replaced by a number so the
# column can take part in the distance computation
CATEGORY_ENCODINGS = {
    "type": {"Studio Apartment": 0, "Apartment": 0.2, "Independent House": 0.4, "Villa": 0.6, "Penthouse": 1},
    "age": {"New": 0, "Resale": 1, "Unknown": 0.5},
    "status": {"Ready to move": 0, "Under Construction": 1},
}

for column, encoding in CATEGORY_ENCODINGS.items():
    # Skip columns that are already numeric so re-running the cell is harmless
    if pd.api.types.is_numeric_dtype(house_price[column]):
        continue
    # Assign the mapped column back to the frame. Calling an inplace method on
    # house_price[column] acts on a temporary object and is not guaranteed to
    # modify house_price (it does not under pandas Copy-on-Write).
    house_price[column] = house_price[column].map(encoding)

# map() yields NaN for any value missing from its encoding; fail loudly instead of
# letting NaN leak into the distance metric
assert not house_price[list(CATEGORY_ENCODINGS)].isna().any().any(), "unmapped category value"

In [13]:
# calculating price in Crores for each row using price and price_unit
LAKH_PER_CRORE = 100  # prices given in "L" are divided by this to express them in Cr

# Boolean mask of the rows priced in lakh; the conversion is applied to all of them
# at once instead of looping over the frame row by row
priced_in_lakh = house_price["price_unit"] == "L"
house_price.loc[priced_in_lakh, "price"] = house_price.loc[priced_in_lakh, "price"] / LAKH_PER_CRORE

# Every price is now in Crores, so the unit column carries no information anymore
house_price = house_price.drop(columns=["price_unit"])

In [14]:
# Preview the fully numeric frame: categories encoded, price in Crores, price_unit removed
house_price.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,3,0.2,685,2.5,0,0.0,19.117249,72.833968
1,2,0.2,640,0.5251,1,0.0,19.013755,72.846294
2,2,0.2,610,1.73,1,0.0,19.229456,72.84799
3,2,0.2,876,0.5998,1,0.0,18.990978,73.065553
4,2,0.2,826,3.3,1,0.0,19.009482,72.837661


In [15]:
# split into train and test dataset
test_ratio = 0.1
SEED = 42  # fixed seed so the split, and every metric computed from it, is reproducible

test_size = int(test_ratio*len(house_price))
# Draw the test rows at random; their index labels identify them in house_price
test_indices = house_price.sample(n=test_size, random_state=SEED).index

# Everything that was not sampled forms the training set
X_train = house_price.drop(test_indices)
X_test = house_price.loc[test_indices]
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Separate the target column from the features (pop removes it from the frame)
y_train = X_train.pop("price").tolist()
y_test = X_test.pop("price").tolist()

# The two splits must partition the data set
assert len(X_train) + len(X_test) == len(house_price)

print("The size of X_train is: ", X_train.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_test is: ", len(y_test))
print(X_train.head())
print(y_train[0:5])
print(X_test.head())
print(y_test[0:5])

The size of X_train is:  (55096, 7)
The size of X_test is:  (6121, 7)
The size of y_train is:  55096
The size of y_test is:  6121
   bhk  type  area  status  age   latitude  longitude
0    3   0.2   685       0  0.0  19.117249  72.833968
1    2   0.2   640       1  0.0  19.013755  72.846294
2    2   0.2   610       1  0.0  19.229456  72.847990
3    2   0.2   876       1  0.0  18.990978  73.065553
4    2   0.2   826       1  0.0  19.009482  72.837661
[2.5, 0.5251, 1.73, 0.5998, 3.3]
   bhk  type  area  status  age   latitude  longitude
0    3   0.2  1480       0  0.5  19.026011  73.010167
1    3   0.2  1640       0  1.0  19.021368  73.018939
2    2   0.2   556       1  0.0  19.118720  72.907348
3    1   0.2   569       1  0.5  19.026011  73.010167
4    2   0.2  1100       1  0.0  19.046521  72.863283
[1.8, 3.0, 1.53, 0.66, 1.42]


In [16]:
# Per-feature mean and population standard deviation (ddof=0), taken from the
# training split only
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0, ddof=0)

# Standardize both splits with the training statistics
X_train = X_train.sub(X_mean).div(X_std)
X_test = X_test.sub(X_mean).div(X_std)

print(X_train.head())

        bhk      type      area    status       age  latitude  longitude
0  0.934887  0.015356 -0.539377 -0.831511 -0.984436  0.133046  -1.290505
1 -0.123500  0.015356 -0.601767  1.202630 -0.984436 -0.990866  -1.159361
2 -0.123500  0.015356 -0.643360  1.202630 -0.984436  1.351573  -1.141314
3 -0.123500  0.015356 -0.274566  1.202630 -0.984436 -1.238220   1.173420
4 -0.123500  0.015356 -0.343888  1.202630 -0.984436 -1.037277  -1.251210


In [22]:
# Set hyperparameter k
K = 4

# The arithmetic below is positional, so both splits must list the features in the same order
assert list(X_train.columns) == list(X_test.columns)

# Loop-invariant conversions, done once: plain NumPy arrays avoid pandas label
# alignment on every iteration and avoid rebuilding the target array per test row
train_features = X_train.to_numpy(dtype=float)
test_features = X_test.to_numpy(dtype=float)
train_prices = np.asarray(y_train, dtype=float)
test_prices = np.asarray(y_test, dtype=float)

absolute_error_sum = 0.0

# Testing
for i, query in enumerate(test_features):
    # Manhattan (L1) distance between test instance i and every training instance
    distances = np.abs(train_features - query).sum(axis=1)

    # Positions of the k nearest neighbors; a stable sort makes ties deterministic
    nearest_indices = np.argsort(distances, kind="stable")[:K]

    # Inverse distances as weights; the small constant avoids division by zero
    # when a neighbor coincides with the test instance
    weights = 1 / (distances[nearest_indices] + 1e-6)

    # Weighted average price of the k nearest neighbors
    test_price = np.sum(train_prices[nearest_indices] * weights) / np.sum(weights)

    absolute_error_sum += abs(test_price - test_prices[i])

    if (i % 1000 == 0):
        print(f'Iteration {i} Completed')

# Calculate Mean Absolute Error (the result keeps the name MEA used by this notebook)
MEA = absolute_error_sum / len(test_features)
print("Mean Absolute Error is:", MEA)


Iteration 0 Completed
Iteration 1000 Completed
Iteration 2000 Completed
Iteration 3000 Completed
Iteration 4000 Completed
Iteration 5000 Completed
Iteration 6000 Completed
Mean Absolute Error is: 0.2584603324219878


In [20]:
import numpy as np

def test_instance(instance, k, X_train, y_train):
    epsilon = 1e-6
    distances = np.sum(np.abs(X_train - instance), axis=1)
    
    # Get indices of k nearest neighbors
    nearest_indices = np.argsort(distances)[:k]
    
    # Calculate inverse distances as weights
    weights = 1 / (distances[nearest_indices] + epsilon) 
    
    # Calculate weighted average of labels of k nearest neighbors
    test_price = np.sum(np.array(y_train)[nearest_indices] * weights) / np.sum(weights)
    
    return test_price

In [21]:
# Testing with our own data
# The sample lives in its own Series so the held-out test set is left untouched
# (writing it into row 0 of X_test would overwrite a real test sample).
sample_raw = pd.Series(
    {
        "bhk": 1,
        "type": 0.2,
        "area": 460,
        "age": 0,
        "status": 0,
        "latitude": 19.18226,
        "longitude": 72.9512,
    },
    dtype=float,
)

# Put the features in the training column order and make sure none is missing
sample_raw = sample_raw.reindex(X_train.columns)
assert not sample_raw.isna().any(), "sample is missing a feature used by the model"

# Normalizing the sample with the training statistics
sample = (sample_raw - X_mean) / X_std

test_price = test_instance(sample, K, X_train, y_train)
print("The price of house in Cr: ", round(test_price, 3))

The price of house in Cr:  1.055
