In [58]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

In [59]:
housing = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
                     header=None, sep='\s+')
housing.columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PRATIO", \
                   "B", "LSTAT", "MDEV"]
housing.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,B,LSTAT,MDEV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [60]:
len(housing)

506

In [61]:
mask = np.random.rand(len(housing)) < 0.8
training = housing[mask]
testing = housing[~mask]
len(training)

417

In [62]:
len(testing)

89

In [63]:
nbrs = NearestNeighbors().fit(housing)

In [64]:
distances, indices = nbrs.kneighbors(housing)
indices                                           

array([[  0, 241,  62,  81,   6],
       [  1,  47,  49,  87,   2],
       [  2,  85,  87,  84,   5],
       ..., 
       [503, 504, 219,  88, 217],
       [504, 503, 219,  88, 217],
       [505, 502, 504, 503,  91]], dtype=int32)

In [65]:
distances

array([[  0.        ,  16.5628085 ,  17.09498324,  18.40127391,
         19.10555821],
       [  0.        ,  16.18433277,  20.59837827,  22.95753545,
         23.05885288],
       [  0.        ,  11.44014392,  15.34074743,  19.2322435 ,
         21.73264817],
       ..., 
       [  0.        ,   4.38093898,   9.44318468,  10.79865973,
         11.95458848],
       [  0.        ,   4.38093898,   8.88725757,  10.88003717,
         11.15236419],
       [  0.        ,   9.69512304,  13.73766871,  15.93946676,
         15.94577477]])

In [66]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)
x_columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PRATIO", "B", "LSTAT"]
y_column = ["MDEV"]
knn.fit(training[x_columns], training[y_column])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [67]:
predictions = knn.predict(testing[x_columns])
predictions

array([[ 20.62],
       [ 21.18],
       [ 23.96],
       [ 17.14],
       [ 17.24],
       [ 18.68],
       [ 28.88],
       [ 37.54],
       [ 26.68],
       [ 39.02],
       [ 25.96],
       [ 21.9 ],
       [ 21.9 ],
       [ 25.42],
       [ 23.7 ],
       [ 25.54],
       [ 20.84],
       [ 19.28],
       [ 18.04],
       [ 18.72],
       [ 15.5 ],
       [ 18.26],
       [ 20.86],
       [ 37.22],
       [ 32.8 ],
       [ 24.66],
       [ 24.84],
       [ 28.86],
       [ 37.92],
       [ 27.58],
       [ 27.58],
       [ 29.  ],
       [ 27.28],
       [ 22.52],
       [ 23.86],
       [ 23.58],
       [ 28.46],
       [ 23.86],
       [ 27.82],
       [ 21.96],
       [ 19.68],
       [ 31.26],
       [ 42.88],
       [ 42.88],
       [ 36.16],
       [ 33.28],
       [ 32.76],
       [ 29.5 ],
       [ 29.86],
       [ 34.82],
       [ 41.6 ],
       [ 29.96],
       [ 20.18],
       [ 22.64],
       [ 22.38],
       [ 20.66],
       [ 26.08],
       [ 19.58],
       [ 26.1 

In [68]:
columns = ["testing","prediction","diff"]
index = range(len(testing))
results = pd.DataFrame(index=index, columns=columns)

results['prediction'] = predictions

results = results.reset_index(drop=True)
testing = testing.reset_index(drop=True)
results['testing'] = testing["MDEV"]

results['diff'] = results['testing'] - results['prediction']
results['pct'] = results['diff'] / results['testing']
results.mean()

testing       22.159551
prediction    22.931011
diff          -0.771461
pct           -0.099104
dtype: float64