## Mumbai House Price Prediction using K Nearest Neighbors (KNN)

In [1]:
# Import all the required libraries
import numpy as np
import pandas as pd

In [2]:
# Obtain the modified csv 
# Refer assets/scripts/house-price-dataset.py for preprocessing steps
data = "../assets/data/modified_mumbai_house_prices.csv"
house_price = pd.read_csv(data)

In [3]:
# printing the info for dataset
print(house_price.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   bhk        76038 non-null  int64  
 1   type       76038 non-null  float64
 2   area       76038 non-null  int64  
 3   price      76038 non-null  float64
 4   status     76038 non-null  int64  
 5   age        76038 non-null  float64
 6   latitude   76038 non-null  float64
 7   longitude  76038 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 4.6 MB
None


In [4]:
house_price.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,3,0.25,685,2.5,1,1.0,19.112122,72.867676
1,2,0.25,640,0.5251,0,1.0,18.969048,72.821182
2,2,0.25,610,1.73,0,1.0,18.563005,73.906578
3,2,0.25,876,0.5998,0,1.0,18.999653,73.126328
4,2,0.25,659,0.9411,0,1.0,18.969048,72.821182


In [5]:
house_price.shape

(76038, 8)

In [6]:
# split into train and test dataset
test_ratio = 0.05
test_size = int(test_ratio*len(house_price))
test_indices = house_price.sample(test_size, random_state=42).index
X_train = house_price.drop(test_indices)
X_test = house_price.loc[test_indices]
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = X_train.pop("price").tolist()
y_test = X_test.pop("price").tolist()
print("The size of X_train is: ", X_train.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_test is: ", len(y_test))
print(X_train.head())
print(y_train[0:5])
print(X_test.head())
print(y_test[0:5])

The size of X_train is:  (72237, 7)
The size of X_test is:  (3801, 7)
The size of y_train is:  72237
The size of y_test is:  3801
   bhk  type  area  status  age   latitude  longitude
0    3  0.25   685       1  1.0  19.112122  72.867676
1    2  0.25   640       0  1.0  18.969048  72.821182
2    2  0.25   610       0  1.0  18.563005  73.906578
3    2  0.25   876       0  1.0  18.999653  73.126328
4    2  0.25   659       0  1.0  18.969048  72.821182
[2.5, 0.5251, 1.73, 0.5998, 0.9411]
   bhk  type  area  status  age   latitude  longitude
0    1  0.25   650       0  1.0  19.064692  73.129295
1    3  0.25  1800       1  0.0  19.110324  73.006050
2    1  0.25   650       0  1.0  18.969048  72.821182
3    1  0.25   916       1  1.0  18.969048  72.821182
4    2  0.25   811       1  1.0  18.969048  72.821182
[0.41, 2.8, 0.27, 0.537, 1.4]


In [7]:
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)

# Normalizing the data
X_train = (X_train - X_mean)/X_std
X_test = (X_test - X_mean)/X_std

print(X_train.head())

        bhk      type      area    status       age  latitude  longitude
0  1.068390  0.028729 -0.506153  0.830872  0.919331  0.475835  -0.132442
1 -0.016673  0.028729 -0.573139 -1.203556  0.919331 -0.575141  -0.439650
2 -0.016673  0.028729 -0.617797 -1.203556  0.919331 -3.557795   6.732132
3 -0.016673  0.028729 -0.221833 -1.203556  0.919331 -0.350324   1.576606
4 -0.016673  0.028729 -0.544856 -1.203556  0.919331 -0.575141  -0.439650


In [8]:
# Set hyperparameter k
K = 17
MEA = 0

# Testing
for i in range(X_test.shape[0]):
    # Calculate distances for all instances in X_train and instance i from X_test
    distances = np.sum(np.abs(X_train - X_test.loc[i]), axis=1)
    
    # Get indices of k nearest neighbors
    nearest_indices = np.argsort(distances)[:K]
    
    # Calculate weighted average price of k nearest neighbors using inverse distances as weights
    weights = 1 / (distances[nearest_indices] + 1e-6) 
    
    test_price = np.sum(np.array(y_train)[nearest_indices] * weights) / np.sum(weights)

    MEA += abs(test_price - y_test[i])

    if (i % 1000 == 0):
        print(f'Test instance {i}/{X_test.shape[0]} completed')

# Calculate Mean Absolute Error
MEA /= X_test.shape[0]
print("Mean Absolute Error in Cr is:", MEA)

Test instance 0/3801 completed
Test instance 1000/3801 completed
Test instance 2000/3801 completed
Test instance 3000/3801 completed
Mean Absolute Error in Cr is: 0.30653924057122495


In [9]:
def test_instance(instance, k, X_train, y_train):
    epsilon = 1e-6
    distances = np.sum(np.abs(X_train - instance), axis=1)
    
    # Get indices of k nearest neighbors
    nearest_indices = np.argsort(distances)[:k]
    
    # Calculate inverse distances as weights
    weights = 1 / (distances[nearest_indices] + epsilon) 
    
    # Calculate weighted average of labels of k nearest neighbors
    test_price = np.sum(np.array(y_train)[nearest_indices] * weights) / np.sum(weights)
    
    return test_price

In [10]:
# Testing with our own data
X_test.loc[0, "bhk"] = 4
X_test.loc[0, "type"] = 0.2
X_test.loc[0, "area"] = 1620
X_test.loc[0, "age"] = 1
X_test.loc[0, "status"] = 1
X_test.loc[0, "latitude"] = 19.1386
X_test.loc[0, "longitude"] = 72.8429

# Normalizing the sample
X_test.loc[0] = (X_test.loc[0] - X_mean) / X_std

test_price = test_instance(X_test.loc[0], K, X_train, y_train)
print("The price of house in Cr: ", round(test_price, 3))

The price of house in Cr:  6.238


In [11]:
# Verifying implementation using scikit learn
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

# Initialize the KNN model
knn_model = KNeighborsRegressor(n_neighbors=K, weights='distance')
knn_model.fit(X_train.values, y_train)
y_pred_knn = knn_model.predict(X_test.values)

# Evaluate the KNN model
mae_knn = mean_absolute_error(y_test, y_pred_knn)

print("Mean Absolute Error using KNN:", mae_knn)


Mean Absolute Error using KNN: 0.3111747897894902
