# Predicting Numbers of House Sales in England and Wales

## k-Nearest Neighbour Regressor

### Preamble

In [1]:
# Configure libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Display options for wide frames and HTML rendering in the notebook
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
# NOTE(review): sklearn.grid_search and sklearn.cross_validation are legacy
# modules (deprecated in scikit-learn 0.18, removed in 0.20). Their
# replacements live in sklearn.model_selection; migrating also affects code
# that reads GridSearchCV.grid_scores_ or uses the 'mean_squared_error' scorer.
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error



In [3]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs = 1, n_folds = 5, score_func = None):
    """Grid-search ``parameters`` for ``clf`` with cross-validation.

    Parameters
    ----------
    clf : estimator
        Estimator instance handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid, forwarded as ``param_grid``.
    X, y : array-like
        Features and target passed to ``GridSearchCV.fit``.
    n_jobs : int, default 1
        Forwarded to ``GridSearchCV``.
    n_folds : int, default 5
        Forwarded to ``GridSearchCV`` as ``cv``.
    score_func : str or callable, optional
        Forwarded as ``scoring``. A falsy value is passed on as ``None``,
        which is the same as not specifying a scorer at all.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Side effects
    ------------
    Prints the best parameters, the best score and the per-candidate scores.
    """
    gs = GridSearchCV(
        clf,
        param_grid=parameters,
        cv=n_folds,
        n_jobs=n_jobs,
        scoring=score_func if score_func else None,
    )
    gs.fit(X, y)

    # Legacy GridSearchCV exposes per-candidate results as `grid_scores_`;
    # newer versions only provide `cv_results_`. Prefer the legacy attribute
    # (unchanged output) and fall back instead of raising AttributeError.
    candidate_scores = getattr(gs, "grid_scores_", None)
    if candidate_scores is None:
        candidate_scores = getattr(gs, "cv_results_", None)

    print("BEST", gs.best_params_, gs.best_score_, candidate_scores)
    return gs.best_estimator_

### Read in the data

In [4]:
# Load the house-sales table and keep only the columns used in this notebook
names = ["year", "freq", "day", "month", "weekday", "week", "weekend"]
df_house = pd.read_csv("df_house data.csv")[names]
print(df_house.shape)

(8548, 7)


### Define train and test sets

In [5]:
# Fix the seed so the split -- and every score derived from it further down --
# is reproducible after Restart Kernel -> Run All.
SPLIT_SEED = 42
itrain, itest = train_test_split(range(df_house.shape[0]), train_size = 0.8,
                                 random_state = SPLIT_SEED)

# Boolean row mask: True marks a training row, False a test row.
# Start from all-True and switch the test rows off.
mask = np.ones(df_house.shape[0], dtype = bool)
mask[itest] = False
mask[:10]

array([False,  True,  True, False,  True,  True,  True,  True,  True, False], dtype=bool)

### Final preparation for machine learning

In [6]:
# Feature matrix: the calendar columns (everything loaded except "freq")
Xnames = ["year", "day", "month", "weekday", "week", "weekend"]
X = df_house.loc[:, Xnames]

# Target: log10 of (number of house sales + 1); the +1 keeps the logarithm
# defined when freq is 0
y = np.log10(1 + df_house["freq"])

In [7]:
# Print the dtype / non-null summary first, then end the cell with X.head():
# only the last expression of a cell is rendered, so with head() on the first
# line its preview was silently discarded.
X.info() # http://pandas.pydata.org/pandas-docs/stable/faq.html
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8548 entries, 0 to 8547
Data columns (total 6 columns):
year       8548 non-null int64
day        8548 non-null int64
month      8548 non-null int64
weekday    8548 non-null int64
week       8548 non-null int64
weekend    8548 non-null int64
dtypes: int64(6)
memory usage: 400.8 KB


### Get the train and test sets

In [8]:
# Apply the boolean row mask: True rows form the training set, the rest the test set
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]

# Dimensions of the training design matrix
n_samples, n_features = Xtrain.shape
Xtrain.head()

Unnamed: 0,year,day,month,weekday,week,weekend
1,1995,2,1,1,1,0
2,1995,3,1,2,1,0
4,1995,5,1,4,1,0
5,1995,6,1,5,1,0
6,1995,7,1,6,1,1


### k-Nearest Neighbours Regression

In [9]:
# Create a k-Nearest Neighbors Regression estimator with the library's default
# settings; n_neighbors is selected later by the grid search over knn_parameters.
knn_estimator = KNeighborsRegressor()

In [10]:
# Standardise every feature column to zero mean / unit standard deviation.
# The statistics come from the training rows only and are reused for the test
# rows, so nothing about the test set influences the scaling.
Xtrain_mean, Xtrain_std_dev = Xtrain.mean(), Xtrain.std()
Xtrain_normalized = Xtrain.sub(Xtrain_mean).div(Xtrain_std_dev)
Xtest_normalized = Xtest.sub(Xtrain_mean).div(Xtrain_std_dev)

In [11]:
%%time
# Define a grid of parameters over which to optimize the knn regressor
# We will figure out which number of neighbors is optimal
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, Xtrain_normalized, ytrain, score_func='mean_squared_error')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


BEST {'n_neighbors': 5} -0.12491246634550146 [mean: -0.17901, std: 0.02358, params: {'n_neighbors': 1}, mean: -0.15235, std: 0.01450, params: {'n_neighbors': 2}, mean: -0.12491, std: 0.01422, params: {'n_neighbors': 5}]
Wall time: 387 ms


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


### Evaluate the k-Nearest Neighbours Regressor

In [12]:
# Fit the best k-Nearest Neighbours regressor returned by the grid search and
# calculate R^2 values for the training and test sets.
# Note: the *_accuracy variables hold the values that are printed as R^2 below.
knn_reg = knn_best.fit(Xtrain_normalized, ytrain)
knn_training_accuracy = knn_reg.score(Xtrain_normalized, ytrain)
knn_test_accuracy = knn_reg.score(Xtest_normalized, ytest)
print("############# based on standard predict ################")
print("R^2 on training data: %0.4f" % (knn_training_accuracy))
print("R^2 on test data:     %0.4f" % (knn_test_accuracy))

############# based on standard predict ################
R^2 on training data: 0.9344
R^2 on test data:     0.8990


In [None]:
# Predicted vs. real number of sales on the test set.
# Column 0 = prediction, column 1 = actual; both are mapped back from the
# log10(x + 1) scale via 10**v - 1 and rounded to whole sales.
(10 ** np.column_stack((knn_reg.predict(Xtest_normalized), ytest)) - 1).round(0).astype(int)

In [None]:
# Calculate the Root Mean Squared Error on the test set. The value is in the
# units of the target, i.e. log10(freq + 1), not in raw sales counts.
# Arguments are given in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(ytest, knn_reg.predict(Xtest_normalized)))