In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the housing dataset from a CSV resolved relative to the working
# directory, then preview the first rows as the cell's output.
csv_path = 'Housing.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [2]:
# Data Preprocessing

# Steps performed on `df`:
#   1. encode every yes/no column as integers,
#   2. standardize (z-score) price and area and drop their IQR outliers,
#   3. one-hot encode furnishingstatus.

# NOTE: the encoding is yes -> 0 and no -> 1, kept exactly as this cell has
# always produced it.
YES_NO_CODES = {'yes': 0, 'no': 1}

for column in df.columns:
    observed = df[column].dropna()
    # Only touch columns whose non-missing values are all 'yes'/'no'.
    if not observed.empty and observed.isin(list(YES_NO_CODES)).all():
        # Assign the mapped Series back to the frame. Calling
        # df[column].replace(..., inplace=True) is a chained in-place update:
        # with pandas Copy-on-Write it modifies a temporary Series and leaves
        # `df` unchanged. map() also yields a numeric column directly instead
        # of relying on replace() downcasting an object column.
        df[column] = df[column].map(YES_NO_CODES)

# Process price/area in the order they appear in the frame: rows dropped for
# the first column are excluded from the statistics of the second.
for column in [name for name in df.columns if name in ('price', 'area')]:
    # NOTE(review): the z-score uses the mean/std computed *before* this
    # column's outliers are removed, so the surviving values are not exactly
    # zero-mean / unit-variance. Order kept as in the original cell.
    df[column] = (df[column] - df[column].mean()) / df[column].std()
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    # Outliers lie strictly outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; rows with a
    # missing value compare False on both sides and are therefore kept.
    is_outlier = (df[column] < Q1 - 1.5 * IQR) | (df[column] > Q3 + 1.5 * IQR)
    df = df.loc[~is_outlier].copy()

# Adding One Hot Encoding to furnishingstatus
# dtype=int keeps the indicator columns as 0/1 integers; newer pandas releases
# would otherwise emit booleans.
df = pd.get_dummies(df, columns=['furnishingstatus'], dtype=int)

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
15,2.316712,0.452182,4,1,2,0,1,0,1,1,2,1,0,1,0
16,2.316712,0.741276,4,2,2,0,0,0,1,0,1,0,0,0,1
17,2.241864,1.65674,3,2,4,0,1,1,1,0,2,1,1,0,0
18,2.204439,-0.222371,3,2,2,0,0,1,1,0,2,1,1,0,0
19,2.185727,0.654548,3,2,2,0,1,1,1,0,1,0,0,1,0


In [3]:
# Positional split of the preprocessed frame: the first column is the target
# and every remaining column is a feature.
# NOTE(review): the first column appears to be `price` in the preview above;
# confirm if the column order of the CSV ever changes.
TRAIN_FRACTION = 0.7
SPLIT_SEED = 42

X = df.iloc[:, 1:]
Y = df.iloc[:, :1]  # one-column slice, so Y stays a DataFrame rather than a Series
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=TRAIN_FRACTION,
    random_state=SPLIT_SEED,
)

In [4]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor with the library's default settings on
# the training split; fit() returns the estimator, so it can be bound directly.
knn = KNeighborsRegressor().fit(x_train, y_train)

# Predictions for the held-out rows, scored in the next cell.
predicted = knn.predict(x_test)

In [5]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out split. The target column was z-scored in
# the preprocessing cell, so this value is in standardized units, not currency.
# Arguments follow the documented (y_true, y_pred) order; the original call had
# them swapped, which happens not to change MSE but breaks the metric contract.
print(mean_squared_error(y_test, predicted))

0.32659460804614765
