**All steps related to data cleaning for assignment 2**

In [1]:
import pandas as pd
import numpy as np
import re
import math
import pickle
import matplotlib.pyplot as plt

from sklearn import metrics
from preprocessing import *
from sklearn.ensemble import RandomForestRegressor as RFR

**Import & Sort by sale date**

In [2]:
# Read the preprocessed listings (both date columns are parsed on load) and
# put the rows in chronological order of their sale date.
houses = pd.read_csv(
    '../../Data/houses-preprocessed-buurten.csv',
    encoding='latin-1',
    parse_dates=['startdate', 'enddate'],
)
houses = houses.sort_values('enddate')

**Drop unneeded or unwanted columns**

In [7]:
# 'Unnamed: 0' is not needed, and `duration` must go to prevent data leakage
# (it is not known beforehand). `enddate` is dropped later on because it is
# still wanted for sorting.
houses = houses.drop(['Unnamed: 0', 'duration'], axis='columns')

# Rows without a value in the target column cannot be used.
target_missing = houses.endprice.isnull()
houses = houses.drop(houses[target_missing].index)

# Before `startprice` is removed, remember the index labels of every row whose
# end price was not equal to its start price.
price_differs = houses.startprice != houses.endprice
price_change_index = houses[price_differs].index
houses = houses.drop('startprice', axis='columns')

**Inspect the dataset for numerical columns that should be treated as categorical**

In [8]:
# Show the dtype of every column so that numerically typed columns which are
# really identifiers/codes can be spotted.
# NOTE(review): display_all is not defined in this notebook; presumably it comes
# from the star import of `preprocessing` - confirm.
display_all(houses.dtypes)

buurt_code                                                       object
V1.x                                                              int64
endprice                                                        float64
url                                                              object
realtor                                                          object
buurt_naam                                                       object
wijk_code                                                        object
bathroom.ligbad                                                   int64
bathroom.douche                                                   int64
bathroom.toilet                                                   int64
bathroom.jacuzzi                                                  int64
bathroom.sauna                                                    int64
bathroom.zitbad                                                   int64
bathroom.stoomcabine                                            

In [9]:
# These columns are stored as numbers but are labels rather than quantities,
# so every value is converted to its string representation.
for label_column in ('housenumber', 'postcodenummerdeel', 'Meest voorkomende postcode:code'):
    houses[label_column] = houses[label_column].map(str)

# Keep everything after the first four characters of the postcode as a
# separate feature; the full postcode is not needed afterwards.
houses['postcodealphadeel'] = houses['postcode'].map(lambda code: code[4:])
houses = houses.drop('postcode', axis='columns')

**Inspect for suspicious values and set to null**

In [4]:
# Summarise a column to look for suspicious values: placeholders such as
# 9999..., or negative values where they do not make sense.
# NOTE(review): display_all is presumably provided by the `preprocessing`
# star import - confirm.
display_all(houses['enddate'].describe())

count                   78800
unique                    754
top       2017-02-01 00:00:00
freq                      778
first     2013-01-01 00:00:00
last      2018-05-03 00:00:00
Name: enddate, dtype: object

Columns that looked suspicious during inspection:
- tuinoppervlakte
- autoplekken
- bedrooms (this turned out to be benign)
- bathroom.aparte toilet
- lotsurface

In [11]:
# Placeholder values per column that are replaced by a missing value.
placeholder_values = {
    'tuinoppervlakte': [999],
    'autoplekken': [99],
    'bathroom.aparte toilet': [9],
    'lotsurface': [999999, 99999],
}

for column_name, placeholders in placeholder_values.items():
    is_placeholder = houses[column_name].isin(placeholders)
    houses.loc[is_placeholder, column_name] = None

**Creating features from the startdate column**

In [12]:
# The return value is not used, so the new columns are expected to be added to
# `houses` in place.
# NOTE(review): add_datepart is presumably provided by the `preprocessing`
# star import; confirm whether it also removes the original 'startdate' column.
add_datepart(houses, 'startdate') # creates features such as dayofweek, month, year, quarter, etc.

**Create features from the realtors dataset**

# --- Load the realtor table and the individual reviews ----------------------
# NOTE(review): '../Datarealtors.csv' may be missing a path separator (the
# houses file is read from '../../Data/...') - confirm the intended location.
realtors = pd.read_csv('../Datarealtors.csv', encoding='latin-1')
realtors = realtors.drop(['house'], axis=1)
reviews = pd.read_csv('reviews.csv', encoding='latin-1')

# --- Aggregate the review scores per realtor ---------------------------------
# Columns are selected with a *list*: tuple-style selection on a GroupBy
# (grouped['a', 'b']) is deprecated and no longer accepted by pandas 2.x.
mean_columns = ['Deskundigheid', 'Lokale marktkennis', 'Prijs/kwaliteit', 'Service/begeleiding']
std_columns = [
    'Bereikbaarheid en communicatie', 'Deskundigheid', 'Lokale marktkennis', 'Prijs/kwaliteit',
    'Service/begeleiding', 'Onderhandeling en resultaat',
]
reviews_by_realtor = reviews.groupby('url')

review_mean = reviews_by_realtor[mean_columns].mean()  # mean of all reviews grouped by realtor
review_mean.columns = review_mean.columns + '_mean'
review_std = reviews_by_realtor[std_columns].std()  # standard deviation of all reviews grouped by realtor
review_std.columns = review_std.columns + '_std'

# Default (inner) joins: only realtors that have review statistics are kept.
realtors = pd.merge(realtors, review_mean, left_on='realtor', right_index=True)
realtors = pd.merge(realtors, review_std, left_on='realtor', right_index=True)

# --- Add placeholder rows for realtors that occur in `houses` only ----------
# Each placeholder row carries the realtor key and missing values everywhere
# else. Building it from the key column and re-indexing to the existing columns
# avoids a hard-coded column count and the assumption that 'realtor' is the
# first column. It also avoids routing the rows through an object-dtype NumPy
# array, which turned every realtor column into object dtype once appended
# (presumably causing numeric review statistics to be handled as categoricals
# further on). It works when no realtor is missing as well.
missing = houses[~houses.realtor.isin(realtors.realtor)].realtor.unique()
realtors_extra = pd.DataFrame({'realtor': missing}).reindex(columns=realtors.columns)
# pd.concat replaces DataFrame.append, which is removed in pandas 2.x;
# ignore_index=True renumbers the rows 0..n-1 like the former set_index call.
realtors = pd.concat([realtors, realtors_extra], ignore_index=True)

# --- Attach the realtor features to every listing ----------------------------
# pd.merge discards the index of `houses` and renumbers the rows, which would
# leave the previously stored `price_change_index` labels pointing at the wrong
# rows. The labels are therefore carried through the merge in a temporary
# column and restored as the index afterwards.
houses = (
    houses.assign(_row_label=houses.index)
    .merge(realtors, on='realtor', how='inner')
    .set_index('_row_label')
)
houses.index.name = None

**Convert all categorical features to the pandas 'cat' data type**

In [14]:
# Converting the categorical features to the pandas 'cat' data type makes them
# easy to process by the random forest algorithm.
# The return value is not used, so the conversion is expected to happen in
# place on `houses`.
# NOTE(review): train_cats is presumably provided by the `preprocessing` star
# import - confirm which columns it considers categorical.
train_cats(houses)

**Finally transform the target variable using np.log**

In [15]:
# Model the natural logarithm of the end price instead of the price itself.
# Upside: we found this to give more accurate predictions.
# Downside: the transformed numbers are less round than the untransformed
# ones, so they no longer look like a "normal" house price.
houses['endprice'] = np.log(houses['endprice'])

**Drop the sale date, then impute all missing values using the mean for numerical features and mode for categorical ones**

In [16]:
# The sale date was only kept for ordering the rows; remove it now.
houses = houses.drop('enddate', axis='columns')

In [17]:
# Split `houses` into the model inputs and the target column 'endprice'.
# The call returns three values, which are pickled further down as X, y and nas.
# NOTE(review): proc_df is presumably provided by the `preprocessing` star
# import; confirm there which fill value it uses for missing numerical values
# (the heading above says mean) and what exactly `nas` contains.
X, y, nas = proc_df(houses, 'endprice')

**Pickle files for later**

In [18]:
# Persist the model inputs, the target, the values returned as `nas` and the
# price-change index for later use. Each file is written inside a `with` block
# so that its handle is flushed and closed deterministically instead of being
# left open until garbage collection.
for pickle_path, payload in (
    ('X.p', X),
    ('y.p', y),
    ('nas.p', nas),
    ('pci.p', price_change_index),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)

In [19]:
# Persist the fully cleaned frame as well. The `with` block guarantees that the
# file handle is flushed and closed once the dump has finished.
with open('houses.p', 'wb') as houses_file:
    pickle.dump(houses, houses_file)