In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Explicit dtype for each of the 21 columns, keyed by column name.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [6]:
# Read the four CSV files with the column dtypes from dtype_dict.
# Judging by the file names: the full set plus its train / test / validation splits.
sales, train_data, test_data, val_data = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'kc_house_data_small.csv',
        'kc_house_data_small_train.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_validation.csv',
    )
]


In [11]:
def data_prep(df, features, output):
    """Build a design matrix (with an intercept column) and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the feature columns and the output column.
    features : list of str
        Names of the columns to use as model inputs, in the desired order.
    output : str
        Name of the column to use as the target.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        ``x`` has one row per row of ``df`` and ``len(features) + 1`` columns;
        column 0 is a constant 1 (intercept) followed by the feature columns.
        ``y`` is the 1-D array of ``df[output]``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on every pandas version and returns the same ndarray.
    feature_matrix = df[features].values
    intercept = np.ones(shape=(len(feature_matrix), 1), dtype=np.float32)
    x = np.concatenate((intercept, feature_matrix), axis=1)
    y = np.array(df[output])
    return (x, y)

def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array with one column per feature.

    Returns
    -------
    (normalized_features, norms) : tuple of numpy.ndarray
        ``normalized_features`` is ``features`` divided column-wise by
        ``norms``. ``norms`` holds the per-column L2 norms, except that a
        norm of 0 is reported as 1: an all-zero column stays all zeros
        instead of turning into NaN, and dividing other data by the returned
        ``norms`` never divides by zero.
    """
    norms = np.linalg.norm(features, axis=0)
    # A constant-zero column has norm 0; 0/0 would yield NaN and poison every
    # distance computed from the matrix, so treat its scale as 1.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features / norms
    return (normalized_features, norms)


In [21]:
# Model inputs: every training column except the target 'price' and the
# 'id', 'date' and 'zipcode' columns. Built once from the training columns so
# all three splits use the same features in the same order.
model_features = [f for f in train_data.columns if f not in ['price', 'id', 'date', 'zipcode']]

features_train, train_y = data_prep(df=train_data, features=model_features, output='price')
features_test, test_y = data_prep(df=test_data, features=model_features, output='price')
features_val, val_y = data_prep(df=val_data, features=model_features, output='price')

# The norms come from the training matrix only; the same norms are then
# applied to the test and validation matrices so all splits share one scale.
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_val = features_val / norms

In [12]:
# Every training column except the target 'price'.
features = [col for col in train_data.columns if col != 'price']

In [16]:
# Number of columns that have an explicit dtype declared.
len(dtype_dict)

21

In [17]:
# Training columns minus the target 'price' and the 'id', 'date', 'zipcode' columns.
features = [col for col in train_data.columns if col not in ('price', 'id', 'date', 'zipcode')]

In [19]:
# Number of selected feature columns.
len(features)

17

In [44]:
class K_NN_R:
    """k-nearest-neighbour regressor using Euclidean distance.

    The prediction for a query point is the mean of ``train_y`` over the
    ``k`` training rows closest to that point. The validation and test
    splits are stored only so that ``val_rss`` / ``test_rss`` can score them.

    Parameters
    ----------
    train_x, val_x, test_x : array-like, 2-D
        Feature matrices, one row per observation.
    train_y, val_y, test_y : array-like, 1-D
        Target values aligned with the rows of the matching feature matrix.
    k : int, default 1
        Number of neighbours to average. Must be at least 1.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, train_x, train_y, val_x, val_y, test_x, test_y, k=1):
        # k == 0 would divide by zero and a negative k would slice from the
        # wrong end of the sorted neighbours, so reject both up front.
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.test_y = test_y
        self.val_x = val_x
        self.val_y = val_y
        self.k = k

    def knn(self, x):
        """Return the indices of the (at most) ``k`` training rows nearest to ``x``.

        The list is ordered from nearest to farthest. It holds fewer than
        ``k`` entries when the training set has fewer than ``k`` rows.
        """
        diff = self.train_x - x
        dist = np.sqrt(np.sum(diff**2, axis=1))
        return list(np.argsort(dist)[:self.k])

    def predict(self, x):
        """Predict the target for a single feature vector ``x``."""
        nbr = self.knn(x)
        # Average over the neighbours actually found rather than dividing by
        # k, so the result is still a true mean when k exceeds the number of
        # training rows.
        return np.mean(np.asarray(self.train_y)[nbr])

    def predictions(self, features):
        """Predict the target for every row of the 2-D array ``features``.

        Returns a 1-D float array with one prediction per row.
        """
        n_rows = features.shape[0]
        return np.array([self.predict(features[i, :]) for i in range(n_rows)], dtype=float)

    def _rss(self, x, y):
        """Residual sum of squares of the predictions for ``x`` against ``y``."""
        return np.sum((y - self.predictions(features=x))**2)

    def val_rss(self):
        """Residual sum of squares on the validation split."""
        return self._rss(self.val_x, self.val_y)

    def test_rss(self):
        """Residual sum of squares on the test split."""
        return self._rss(self.test_x, self.test_y)
        
        

In [24]:
# Euclidean distance between training row 9 and test row 0.
diff = features_train[9, :] - features_test[0, :]
dist = np.sqrt(np.square(diff).sum())

In [25]:
# Display the current value of dist.
dist

0.059723593713980783

In [28]:
# Distances from test row 0 to each of the first 10 training rows, then the
# row positions ordered from nearest to farthest.
diff = features_train[:10, :] - features_test[0, :]
dist = np.sqrt(np.square(diff).sum(axis=1))
sorted_dist_idx = dist.argsort()

In [29]:
# Display the row positions ordered by increasing distance.
sorted_dist_idx

array([8, 3, 6, 7, 4, 9, 5, 0, 2, 1], dtype=int64)

In [30]:
# Distances from test row 2 to every training row, then the training row
# indices ordered from nearest to farthest.
diff = features_train - features_test[2, :]
dist = np.sqrt(np.square(diff).sum(axis=1))
sorted_dist_idx = dist.argsort()

In [34]:
# Indices of the four training rows with the smallest distance.
sorted_dist_idx[:4]

array([ 382, 1149, 4087, 3142], dtype=int64)

In [33]:
# Target value of training row 382.
train_y[382]

249000.0

In [35]:
# Mean target over training rows 382, 1149, 4087 and 3142.
sum(train_y[i] for i in (382, 1149, 4087, 3142)) / 4

413987.5

In [36]:
# k = 10 model built on the train / validation / test feature matrices and targets.
model_k10 = K_NN_R(
    train_x=features_train,
    train_y=train_y,
    val_x=features_val,
    val_y=val_y,
    test_x=features_test,
    test_y=test_y,
    k=10,
)

In [37]:
# Predictions of the k = 10 model for the first 10 test rows.
results = model_k10.predictions(features_test[:10,:])

In [38]:
# Display the predictions.
results

array([ 881300. ,  431860. ,  460595. ,  430200. ,  766750. ,  667420. ,
        350032. ,  512800.7,  484000. ,  457235. ])

In [39]:
# Positions of the predictions ordered from lowest to highest value.
np.argsort(results)

array([6, 3, 1, 9, 2, 8, 7, 5, 4, 0], dtype=int64)

In [42]:
# Candidate neighbour counts: 1 through 15 inclusive.
K = range(1,16)

In [45]:
# Validation RSS for every candidate k, keyed by k.
summary = {}
for k in K:
    model = K_NN_R(
        train_x=features_train,
        train_y=train_y,
        val_x=features_val,
        val_y=val_y,
        test_x=features_test,
        test_y=test_y,
        k=k,
    )
    rss = model.val_rss()
    summary[k] = rss
    

{1: 105453830251561.0,
 2: 83445073504025.5,
 3: 72692096019202.563,
 4: 71946721652091.688,
 5: 69846517419718.602,
 6: 68899544353180.836,
 7: 68341973450051.094,
 8: 67361678735491.5,
 9: 68372727958976.094,
 10: 69335048668556.742,
 11: 69523855215598.828,
 12: 69049969587246.172,
 13: 70011254508263.688,
 14: 70908698869034.344,
 15: 71106928385945.156}

In [47]:
# Candidate k values ordered by increasing validation RSS (best first).
sorted_summary = sorted(summary, key=lambda k_value: summary[k_value])

In [48]:
# Display the k values ranked by validation RSS.
sorted_summary

[8, 7, 9, 6, 12, 10, 11, 5, 13, 14, 15, 4, 3, 2, 1]

In [49]:
# Use the k with the lowest validation RSS rather than a hard-coded value, so
# this cell stays consistent with the sweep if the data or candidate range
# changes. With the recorded sweep results this selects k = 8.
best_k = min(summary, key=summary.get)
model = K_NN_R(
    train_x=features_train,
    train_y=train_y,
    val_x=features_val,
    val_y=val_y,
    test_x=features_test,
    test_y=test_y,
    k=best_k,
)

In [50]:
# Residual sum of squares of `model` on the test split.
test_rss = model.test_rss()

In [51]:
# Display the test RSS.
test_rss

133118823551516.81