## Mumbai House Price Prediction using K Nearest Neighbors (KNN)

In [1]:
# import all the required libraries
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim

In [2]:
# obtain the raw csv 
data = "../assets/data/mumbai_house_prices.csv"
house_price = pd.read_csv(data)

In [3]:
# printing the info for dataset
print(house_price.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         76038 non-null  int64  
 1   type        76038 non-null  object 
 2   locality    76038 non-null  object 
 3   area        76038 non-null  int64  
 4   price       76038 non-null  float64
 5   price_unit  76038 non-null  object 
 6   region      76038 non-null  object 
 7   status      76038 non-null  object 
 8   age         76038 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.2+ MB
None


In [4]:
house_price.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [5]:
# create a dictionary to map region to their latitude and longitude
geo = Nominatim(user_agent="Geopy Library", timeout=10)  # Adjust timeout value if error is raised
unique_regions = house_price["region"].unique()
print("Total number of unique values: ", len(unique_regions))
lat_long_dict = {}
unknown_regions = []

for r in unique_regions:
    loc = geo.geocode(r + ", Mumbai")
    if (loc == None):
        unknown_regions.append(r)
        continue
    else:
        latitude = loc.latitude
        longitude = loc.longitude
        lat_long_dict[r] = [latitude, longitude]

print("Geopy could not find the following regions: ", unknown_regions, len(unknown_regions))

Total number of unique values:  228
Geopy could not find the following regions:  ['Mira Road East', 'Badlapur East', 'Badlapur West', 'Ambernath West', 'Ulhasnagar', 'Kewale', 'Nala Sopara', 'Karanjade', 'Neral', 'Karjat', 'Dronagiri', 'Navade', 'Owale', 'Ville Parle East', 'Vangani', 'Bhayandar East', 'Ambernath East', 'Nilje Gaon', 'Titwala', 'Koper Khairane', 'Napeansea Road', 'Koproli', 'Anjurdive', 'Taloje', 'Vasai West', 'Vasai east', 'Nalasopara East', 'Saphale', 'Kasheli', 'Panch Pakhdi', 'Hiranandani Estates', 'Vichumbe', 'Sector 17 Ulwe', 'Sector 23 Ulwe', 'Sector 20 Kamothe', 'Sector 30 Kharghar', 'Virar East', 'Sector 8 New panvel', 'Bhayandar West', 'Sector 20 Ulwe', 'Virar West', 'Palava', 'Greater Khanda', 'Sector-35D Kharghar', 'Umroli', 'Sector-9 Ulwe', 'Sector-3 Ulwe', 'kasaradavali thane west', 'Sector 19 Kharghar', 'Kalher', 'Sector 21 Kharghar', 'Usarghar Gaon', 'Patlipada', 'Vevoor', 'Sector 7 Kharghar', 'Badlapur', 'Khanda Colony', 'Gauripada', 'Warai', 'Khatiwal

In [6]:
# we remove all rows which contain unidentified regions and for all rows with identified regions, we add their latitiude and longitude to the data set.
del_idx = []
for i in range(len(house_price)):
    region = house_price.loc[i, "region"]
    if region in unknown_regions:
        del_idx.append(i)
    else:
        lat_long = lat_long_dict[region]
        house_price.at[i, "latitude"] = lat_long[0] 
        house_price.at[i, "longitude"] = lat_long[1]


In [7]:
house_price.drop(del_idx, inplace=True)
house_price = house_price.reset_index(drop=True)

In [8]:
house_price.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age,latitude,longitude
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New,19.117249,72.833968
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New,19.013755,72.846294
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New,19.229456,72.84799
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New,18.990978,73.065553
4,2,Apartment,Bhoomi Simana Wing A Phase 1,826,3.3,Cr,Parel,Under Construction,New,19.009482,72.837661


In [9]:
house_price.shape

(61217, 11)

In [10]:
# remove columns type, locality and region since they are not meaningful for distance metric
house_price.drop(['locality', 'region'], axis=1, inplace=True)

In [11]:
# print all unique values of categorical columns
print(house_price.type.unique())
print(house_price.age.unique())
print(house_price.status.unique())

['Apartment' 'Villa' 'Independent House' 'Studio Apartment' 'Penthouse']
['New' 'Resale' 'Unknown']
['Ready to move' 'Under Construction']


In [12]:
# dealing with categorical data
house_price["type"].replace({"Studio Apartment":0, "Apartment":0.2, "Independent House":0.4, "Villa": 0.6, "Penthouse": 1}, inplace=True)
house_price["age"].replace({"New":0, "Resale":1, "Unknown": 0.5}, inplace=True)
house_price["status"].replace({"Ready to move":0, "Under Construction":1}, inplace=True)

In [13]:
# calculating price in Crores for each using price and price_unit
for i in range(len(house_price)):  
    price_unit = house_price.loc[i, "price_unit"]

    if price_unit == "L":
        house_price.at[i, "price"] = (house_price.at[i, "price"] / 100)
  
house_price.drop(["price_unit"], axis=1, inplace=True)

In [14]:
house_price.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,3,0.2,685,2.5,0,0.0,19.117249,72.833968
1,2,0.2,640,0.5251,1,0.0,19.013755,72.846294
2,2,0.2,610,1.73,1,0.0,19.229456,72.84799
3,2,0.2,876,0.5998,1,0.0,18.990978,73.065553
4,2,0.2,826,3.3,1,0.0,19.009482,72.837661


In [15]:
# split into train and test dataset
test_ratio = 0.1
test_size = int(test_ratio*len(house_price))
test_indices = house_price.sample(test_size).index
X_train = house_price.drop(test_indices)
X_test = house_price.loc[test_indices]
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = X_train.pop("price").tolist()
y_test = X_test.pop("price").tolist()
print("The size of X_train is: ", X_train.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_test is: ", len(y_test))
print(X_train.head())
print(y_train[0:5])
print(X_test.head())
print(y_test[0:5])

The size of X_train is:  (55096, 7)
The size of X_test is:  (6121, 7)
The size of y_train is:  55096
The size of y_test is:  6121
   bhk  type  area  status  age   latitude  longitude
0    3   0.2   685       0  0.0  19.117249  72.833968
1    2   0.2   640       1  0.0  19.013755  72.846294
2    2   0.2   610       1  0.0  19.229456  72.847990
3    2   0.2   826       1  0.0  19.009482  72.837661
4    5   0.6  2921       1  0.0  19.205903  72.864658
[2.5, 0.5251, 1.73, 3.3, 1.99]
   bhk  type  area  status  age   latitude  longitude
0    1   0.2   505       1  0.0  19.193022  73.104343
1    2   0.2  1085       0  1.0  19.142934  72.855207
2    3   0.2  1775       0  1.0  19.119331  72.999510
3    1   0.2   740       1  0.0  19.025773  73.059185
4    3   0.2  1701       0  1.0  19.026919  72.875934
[0.33, 2.1, 2.25, 0.65, 3.5]


In [16]:
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)

# Normalizing the data
X_train = (X_train - X_mean)/X_std
X_test = (X_test - X_mean)/X_std

print(X_train.head())

        bhk       type      area    status       age  latitude  longitude
0  0.936505   0.019087 -0.540581 -0.830918 -0.983107  0.144198  -1.296173
1 -0.122996   0.019087 -0.603318  1.203489 -0.983107 -0.998068  -1.165123
2 -0.122996   0.019087 -0.645143  1.203489 -0.983107  1.382625  -1.147089
3 -0.122996   0.019087 -0.344003  1.203489 -0.983107 -1.045237  -1.256906
4  3.055506  13.676246  2.576778  1.203489 -0.983107  1.122673  -0.969884


In [17]:
def test_instance(instance, k, train, label):
    train_copy = train.copy()

    for i in range(len(train_copy)):
        train_copy.at[i, "distance"] = np.sum(np.absolute(train_copy.iloc[i] - instance))
        train_copy.at[i, "index"] = int(i)

    sorted_train = train_copy.sort_values(by="distance")

    total_price = 0
    for i in range(k):
        total_price+=label[int(sorted_train.iloc[i]["index"])]
    
    return total_price/k

In [18]:
# set hyperparameter k
K = 10
num_instance = 15
MEA = 0
# testing only for first 15 instances since each inference requires around 30 seconds to one minute to run
for i in range(num_instance):
    test_price = test_instance(X_test.loc[i], K, X_train, y_train)
    loss = abs(test_price - y_test[i])
    print("Loss at instance", i, "is", loss)
    MEA += loss

MEA /= num_instance
print("Mean Absolute Error is:", MEA)

Loss at instance 0 is 0.12037999999999999
Loss at instance 1 is 0.0990000000000002
Loss at instance 2 is 0.2530000000000001
Loss at instance 3 is 0.06610000000000005
Loss at instance 4 is 0.5150000000000006
Loss at instance 5 is 0.03900000000000037
Loss at instance 6 is 0.10865999999999992
Loss at instance 7 is 0.0907800000000002
Loss at instance 8 is 0.10999999999999999
Loss at instance 9 is 0.48699999999999966
Loss at instance 10 is 0.40000000000000036
Loss at instance 11 is 0.3490000000000002
Loss at instance 12 is 0.06231000000000003
Loss at instance 13 is 0.383
Loss at instance 14 is 0.14783000000000002
Mean Absolute Error is: 0.2154040000000001


In [19]:
# Testing with our own data
X_test.loc[0, "bhk"] = 2
X_test.loc[0, "type"] = 0.2
X_test.loc[0, "area"] = 720
X_test.loc[0, "age"] = 1
X_test.loc[0, "status"] = 0
X_test.loc[0, "latitude"] = 191746.73
X_test.loc[0, "longitude"] = 729431.08

# Normalizing the sample
X_test.loc[0] = (X_test.loc[0] - X_mean) / X_std

test_price = test_instance(X_test.loc[0], K, X_train, y_train)
print("The price of house in Cr: ", round(test_price, 3))

The price of house in Cr:  0.708
