# Predicting House Prices Using k-Nearest Neighbors Regression

In [1]:
import graphlab

In [None]:
# Load the house sales data from the 'kc_house_data_small.gl/' directory
# (path is relative to the notebook's working directory) into an SFrame.
sales = graphlab.SFrame('kc_house_data_small.gl/')

In [3]:
# Show the loaded SFrame as this cell's output to inspect its columns.
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900,3.0,1.0,1180.0,5650,1.0,0
6414100192,2014-12-09 00:00:00+00:00,538000,3.0,2.25,2570.0,7242,2.0,0
5631500400,2015-02-25 00:00:00+00:00,180000,2.0,1.0,770.0,10000,1.0,0
2487200875,2014-12-09 00:00:00+00:00,604000,4.0,3.0,1960.0,5000,1.0,0
1954400510,2015-02-18 00:00:00+00:00,510000,3.0,2.0,1680.0,8080,1.0,0
2008000270,2015-01-15 00:00:00+00:00,291850,3.0,1.5,1060.0,9711,1.0,0
2414600126,2015-04-15 00:00:00+00:00,229500,3.0,1.0,1780.0,7470,1.0,0
1736800520,2015-04-03 00:00:00+00:00,662500,3.0,2.5,3560.0,9796,1.0,0
9297300055,2015-01-24 00:00:00+00:00,650000,4.0,3.0,2950.0,5000,2.0,0
6865200140,2014-05-29 00:00:00+00:00,485000,4.0,1.0,1600.0,4300,1.5,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,8,1860,1700,1965,0,98007,47.60065993
3,3,9,1980,970,1979,0,98126,47.57136955
0,4,7,1600,0,1916,0,98103,47.66478645

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.14529566,2210.0,8925.0
-122.37541218,2140.0,4000.0
-122.34281613,1610.0,4300.0


In [4]:
import numpy as np

In [5]:
def get_numpy_data(data_sframe, features, output):
    """Extract a feature matrix and an output vector as numpy arrays.

    Side effect: a column named 'constant' holding the value 1 is added to
    (or overwritten on) the ``data_sframe`` object passed in by the caller.

    Parameters
    ----------
    data_sframe : table-like object
        Must support column assignment, selection by a list of column names,
        selection by a single column name, and ``.to_numpy()`` on the results.
    features : list of str
        Names of the feature columns; 'constant' is prepended to this list.
    output : str
        Name of the column holding the target values.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)`` where the first column of
        ``feature_matrix`` is the constant column.
    """
    # Intercept term: stored on the frame so it can be selected like any column.
    data_sframe['constant'] = 1
    selected_columns = ['constant'] + features
    feature_matrix = data_sframe[selected_columns].to_numpy()
    output_array = data_sframe[output].to_numpy()
    return (feature_matrix, output_array)

In [6]:
def normalize_features(feature_matrix):
    """Scale every column of ``feature_matrix`` to unit Euclidean (2-)norm.

    Parameters
    ----------
    feature_matrix : numpy array
        Matrix whose columns are the features to normalize.

    Returns
    -------
    tuple
        ``(normalized_features, norms)`` where ``norms`` holds the 2-norm of
        each original column.  Columns whose norm is 0 (all entries zero) are
        returned unchanged (all zeros) instead of producing NaN; their entry
        in ``norms`` is still reported as 0.
    """
    norms = np.linalg.norm(feature_matrix, axis=0)
    # A zero norm means the whole column is zero; dividing by 1 keeps it at
    # zero and avoids the 0/0 -> NaN (and RuntimeWarning) of a plain division.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_features = feature_matrix / safe_norms
    return (normalized_features, norms)

In [7]:
# Two-stage split with a fixed seed so the partitions are the same on every run.
# NOTE(review): .8 is presumably the fraction of rows routed to the first
# returned SFrame in each call -- confirm against the GraphLab documentation.
(train_and_validation, test) = sales.random_split(.8, seed=1) # initial split: hold out the test set
(train, validation) = train_and_validation.random_split(.8, seed=1) # split the remainder into training and validation sets

In [8]:
# Number of rows in the validation split.
len(validation)

1435

## Considering the following features

In [9]:
# Columns used as model inputs; 'price' is the output to predict.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]

# Turn each split into a (feature matrix, output vector) pair of numpy arrays.
# get_numpy_data also adds a 'constant' column to each of the three SFrames.
features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')

In [10]:
# Normalize the training columns, then scale the test and validation columns
# by the *training* norms so all three sets share the same feature scale.
# NOTE: this cell rebinds features_train/features_test/features_valid; running
# it again recomputes `norms` from the already-normalized training matrix.
features_train, norms = normalize_features(features_train) # normalize training set features (columns)
features_test = features_test / norms # normalize test set by training set norms
features_valid = features_valid / norms # normalize validation set by training set norms

## Performing k-nearest neighbors regression

In [22]:
def knn(k,features_train,features_test):
    """Return the indices of the k training rows nearest to a query point.

    Parameters
    ----------
    k : int
        Number of neighbor indices to return.
    features_train : numpy array
        Training feature matrix, one row per training example.
    features_test : numpy array
        The query; it is subtracted from every row of ``features_train`` by
        broadcasting, so callers in this notebook pass a single feature
        vector (e.g. ``features_test[2]``).

    Returns
    -------
    numpy array
        The first ``k`` indices into ``features_train`` after sorting the
        rows by increasing Euclidean distance to the query.
    """
    offsets = features_train - features_test
    euclidean = np.sqrt(np.square(offsets).sum(axis=1))
    nearest_first = np.argsort(euclidean)
    return nearest_first[:k]

## Indices of the 4 training houses closest to the query house, i.e., the third house of the test set (`features_test[2]`)

In [28]:
# Look up the k=4 nearest training rows for the query features_test[2]
# and print their indices into features_train.
arrayi=knn(4,features_train,features_test[2])
print arrayi

[ 382 1149 4087 3142]


## Making a prediction by averaging the k nearest neighbors' outputs

In [29]:
def predicting(k,feat1,output_train,arrayi):
    """Predict an output as the mean of the selected training outputs.

    Parameters
    ----------
    k : int
        Not used by the computation; kept for the existing call signature.
    feat1 : numpy array
        Not used by the computation; kept for the existing call signature.
    output_train : numpy array
        Output values of the training set.
    arrayi : numpy array of int
        Indices into ``output_train`` of the neighbors to average.

    Returns
    -------
    The arithmetic mean of ``output_train[arrayi]``.
    """
    return output_train[arrayi].mean()

So, for the query house (the third house of the test set, `features_test[2]`), the predicted house value using k-nearest neighbors with `k=4` is:

In [30]:
# Find the 4 nearest training rows for the query features_test[2], then
# average their training outputs to obtain the prediction.
arrayi=knn(4,features_train,features_test[2])
means=predicting(4,features_train,output_train,arrayi)
print means

413987.5
