In [1]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

In [2]:
# Load the UCI housing table. The file is read with header=None, so the column
# names are supplied explicitly through `names=` instead of being patched on
# afterwards.
housing = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    # Raw string: "\s" is not a valid Python string escape, so the non-raw
    # form triggers an invalid-escape warning on current interpreters.
    sep=r"\s+",
    names=["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PRATIO",
           "B", "LSTAT", "MDEV"],
)
housing.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,B,LSTAT,MDEV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
# Row count of the full data set.
housing.shape[0]

506

In [4]:
# Random ~80/20 row split into training and testing frames.
# A fixed seed keeps the split (and every number derived from it) identical
# across kernel restarts; drawing from the unseeded global generator produced
# a different split on every run.
SPLIT_SEED = 42
mask = np.random.RandomState(SPLIT_SEED).rand(len(housing)) < 0.8
training = housing[mask]    # rows whose uniform draw fell below 0.8
testing = housing[~mask]    # the complementary rows
len(training)

421

In [5]:
# Row count of the held-out testing frame.
testing.shape[0]

85

In [6]:
# Unsupervised neighbour index built over every column of `housing`
# (the MDEV column is included), using the estimator's default settings.
nbrs = NearestNeighbors()
nbrs = nbrs.fit(housing)

In [7]:
# Query the fitted index with the same rows it was built from; `indices` holds
# the row positions of each row's nearest neighbours.
distances, indices = nbrs.kneighbors(housing, return_distance=True)
indices

array([[  0, 241,  62,  81,   6],
       [  1,  47,  49,  87,   2],
       [  2,  85,  87,  84,   5],
       ..., 
       [503, 504, 219,  88, 217],
       [504, 503, 219,  88, 217],
       [505, 502, 504, 503,  91]], dtype=int64)

In [9]:
from sklearn.neighbors import KNeighborsRegressor

# Target column, kept as a one-element list so the fitted target is 2-D.
y_column = ["MDEV"]
# Predictors: every column of `housing` except the target, in file order.
x_columns = [name for name in housing.columns if name not in y_column]

# 5-nearest-neighbour regressor fitted on the training rows only.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(training[x_columns], training[y_column])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [10]:
# Predict the target for the held-out rows, using the same feature columns the
# regressor was fitted on.
predictions = knn.predict(testing.loc[:, x_columns])
predictions

array([[ 21.7 ],
       [ 20.88],
       [ 17.82],
       [ 21.46],
       [ 24.06],
       [ 24.08],
       [ 20.7 ],
       [ 17.84],
       [ 14.86],
       [ 17.14],
       [ 23.28],
       [ 39.78],
       [ 26.34],
       [ 21.96],
       [ 24.66],
       [ 21.26],
       [ 30.94],
       [ 22.04],
       [ 26.02],
       [ 23.42],
       [ 23.62],
       [ 23.68],
       [ 19.62],
       [ 18.7 ],
       [ 19.64],
       [ 18.7 ],
       [ 24.24],
       [ 24.24],
       [ 20.32],
       [ 20.32],
       [ 42.92],
       [ 17.88],
       [ 24.58],
       [ 33.14],
       [ 21.54],
       [ 20.18],
       [ 30.36],
       [ 24.66],
       [ 26.34],
       [ 40.6 ],
       [ 32.54],
       [ 31.62],
       [ 24.64],
       [ 22.4 ],
       [ 34.42],
       [ 24.7 ],
       [ 29.24],
       [ 26.94],
       [ 27.62],
       [ 33.16],
       [ 20.  ],
       [ 22.28],
       [ 20.32],
       [ 20.52],
       [ 33.06],
       [ 25.18],
       [ 24.24],
       [ 17.48],
       [ 25.6 

In [11]:
# Side-by-side table of actual vs. predicted MDEV for the testing rows, plus
# the signed error and the signed error relative to the actual value.
columns = ["testing", "prediction", "diff"]
index = range(len(testing))

# Re-index the testing rows from 0 so they line up positionally with the
# prediction array.
testing = testing.reset_index(drop=True)
actual = testing["MDEV"]
predicted = np.ravel(predictions)  # flatten the (n, 1) prediction array

results = pd.DataFrame(index=index, columns=columns)
results = results.assign(testing=actual, prediction=predicted)
results = results.assign(diff=results["testing"] - results["prediction"])
results = results.assign(pct=results["diff"] / results["testing"])

# Column means; note that `diff` and `pct` are signed, so positive and
# negative errors offset each other in these averages.
results.mean()

testing       21.269412
prediction    21.205882
diff           0.063529
pct           -0.050408
dtype: float64