In [1]:
import pandas as pd
import numpy as np
import pickle

from preprocessing import *
from scipy import spatial
from sklearn.ensemble import RandomForestRegressor as RFR
import forestci as fci

Failed to import duecredit due to No module named 'duecredit'


# Data
The data contains only columns that we determined to be relevant for the randomforestregressor as well as a couple of potential comparable features, each of these are specified below.

Since the model had some disappointing results for logarithmic price, we also see how well it performs on regular price

In [2]:
df = pickle.load(open('../Data/reduced_df.p','rb'))
df = df.rename(columns={'V1.x': 'Postcode5'})
# df['endprice'] = np.exp(df['endprice'])

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66239 entries, 57585 to 43850
Data columns (total 17 columns):
livingspace                             66239 non-null float64
Gemiddelde woningwaarde:x 1 000 euro    66239 non-null float64
housetype                               66239 non-null category
Postcode5                               66239 non-null int64
lotsurface                              66239 non-null float64
yearofconstruction                      66239 non-null float64
longitude                               66239 non-null float64
latitude                                66239 non-null float64
housesubtype                            66239 non-null category
rooms                                   66239 non-null float64
bathroom.badkamer                       66239 non-null int64
feature.zwembad                         66239 non-null bool
bathroom.aparte toilet                  66239 non-null float64
balcony.balkon                          66239 non-null bool
feature.sauna

# Relevant Features
These are the 10 features that are most relevant for the random forest regressor as determined in previous notebooks. The user can enter these features manually, but for the purpose of this test, we pick them from the data.

In [4]:
RF = ['livingspace', 'Gemiddelde woningwaarde:x 1 000 euro', 'housetype', 'Postcode5',
      'lotsurface', 'yearofconstruction', 'longitude', 'latitude', 'housesubtype', 'rooms']

For the purpose of this notebook, we only look at 1 potential upgradable feature, namely the presence of a seperate toilet in the bathroom.

In [5]:
PUF = ['bathroom.toilet']

Finally of course, we need the endprice as a target feature, to train our model and check our performance

In [6]:
TF = ['endprice']

We only keep the relevant features in our dataframe

In [7]:
df = df[RF+PUF+TF]
# df = df[RF+TF]

## Calculating values from user input
One of the most important features for our predictive algorithm is de average value of houses in the neighbourhood. The user might not be aware of this value, but fortunately, we can predict it quite accurately by finding the house with the closest longitude, latitude and Postcode5 and simply taking their value instead.

In [8]:
CF = ['longitude', 'latitude', 'Postcode5']

def calculate_avg_housevalue(long, lat, post, data=df):
    tree = spatial.KDTree(data[CF].values)
    _, index = tree.query(np.array([long, lat, post]))
    return data.iloc[index]['Gemiddelde woningwaarde:x 1 000 euro']

# The model
We use a a random forest regression which was also used in previous notebooks to find the most important features.

In [9]:
m = RFR(n_estimators=500, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=500, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)

In [10]:
X, y, _ = proc_df(df, 'endprice')

n_trn = len(X) // 2
n_valid = n_trn + (len(X) // 4)
X_train, X_valid, X_test = split_vals_test(X, n_trn, n_valid)
y_train, y_valid, y_test = split_vals_test(y, n_trn, n_valid)

In [11]:
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

[0.10222628675204434, 0.19078702056728092, 0.9529658867845587, 0.8449251019087404, 0.8675266252812308]


In [12]:
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

[0.10214532465905223, 0.19057170705627877, 0.9530403582816598, 0.8452749252837346, 0.8678086301872148]


# Predicting
Given the initial 10 values of our house + the number of seperate toilets in bathbathroom.toilet, we make a prediction of the end price with the current number of toilets and with one extra toilet.

Our validation set has 76 houses with 0 toilets and 25 houses with 1 toilet.

Below code demonstrates what would happen for a single house (e.g. one entered by the user)

First we look at our prediction for the house with the actual number of toilets it has (0)

In [13]:
pred_log_price = m.predict(X_test)
log_price = y_test
# error_margin = fci.random_forest_error(m, X_train, test, calibrate=False)[-1]

We now expect our prediction to be somewhere between pred_price + error_margin and pred_price - error_margin. Note that these error margins correspond to single sigma so a ~65% confidence interval, for a ~95% confidence interval, we would need to do the error_margins times 2.

print("Lower bound: ", pred_log_price-abs(error_margin))
print("Actual price: ", log_price)
print("Predicted price: ", pred_log_price)
print("Upper bound: ", pred_log_price+abs(error_margin))

Next we look at our predicion if it the user house had an extra toilet (1)

In [14]:
X_test2 = X_test.copy()
X_test2['bathroom.toilet'] += 1

In [15]:
pred_log_price_2 = m.predict(X_test2)
# error_margin_2 = fci.random_forest_error(m, X_train, X_test2)

print("Lower bound: ", pred_log_price)
print("Predicted price: ", pred_log_price)
print("Upper bound: ", pred_log_price)

In [22]:
(np.exp(pred_log_price_2) - np.exp(pred_log_price)).mean()

1638.5595855727677

In [17]:
fi = rf_feat_importance(m, X_train)

In [18]:
fi

Unnamed: 0,cols,imp
0,livingspace,0.426091
1,Gemiddelde woningwaarde:x 1 000 euro,0.155899
2,housetype,0.103665
4,lotsurface,0.082115
3,Postcode5,0.055502
5,yearofconstruction,0.048882
6,longitude,0.040367
7,latitude,0.038325
8,housesubtype,0.025422
9,rooms,0.02016


## Testing the model for the full validation set

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66239 entries, 57585 to 43850
Data columns (total 12 columns):
livingspace                             66239 non-null float64
Gemiddelde woningwaarde:x 1 000 euro    66239 non-null float64
housetype                               66239 non-null category
Postcode5                               66239 non-null int64
lotsurface                              66239 non-null float64
yearofconstruction                      66239 non-null float64
longitude                               66239 non-null float64
latitude                                66239 non-null float64
housesubtype                            66239 non-null category
rooms                                   66239 non-null float64
bathroom.toilet                         66239 non-null int64
endprice                                66239 non-null float64
dtypes: category(2), float64(8), int64(2)
memory usage: 5.7 MB


In [20]:
df.housetype.value_counts()

eengezinswoning    41473
appartement        17094
herenhuis           2792
villa               2065
bungalow            1394
woonboerderij        948
landhuis             370
grachtenpand          79
woonboot              13
landgoed               8
stacaravan             3
Name: housetype, dtype: int64