# House prices

## Configuration variables

In [1]:
DATASETS_FOLDER = './datasets/'
TRAIN_FILENAME = 'house_prices_train.csv'
TEST_FILENAME = 'house_prices_test.csv'
OUTPUT_FOLDER = './output/'

## Prepare the notebook

Enable some Jupyter magick:

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Import scientific libraries:

In [3]:
import numpy as np
import pandas as pd

Useful functions:

In [4]:
def save_predictions(name, preds):
    filename = 'house_prices_{}.csv'.format(name)
    with open(os.path.join(OUTPUT_FOLDER, filename), 'w') as f:
        print("Id,SalePrice", file=f)
        for i, p in preds:
            print('{:d},{:f}'.format(i, p), file=f)

## Load the data

The data should be downloaded from the followink links and placed in `{{DATASETS_FOLDER}}` with the indicated name:

- [{{TRAIN_FILENAME}}](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv)
- [{{TEST_FILENAME}}](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv)

 Read the CSV files:

In [5]:
import os

train_csv = pd.read_csv(os.path.join(DATASETS_FOLDER, TRAIN_FILENAME))
test_csv = pd.read_csv(os.path.join(DATASETS_FOLDER, TEST_FILENAME))
all_csv = pd.concat([train_csv, test_csv])

Create a dataframe with all the data:

In [6]:
train = train_csv.copy()
test = test_csv.copy()
data = all_csv.copy()

Prepare a function to re-split train and test data after the preprocessing:

In [7]:
def split_train_test(df):
    train = df[~pd.isnull(df['SalePrice'])]
    train_labels = train['SalePrice'].ravel()
    train_data = train.drop('SalePrice', axis=1).as_matrix()
    test_data = df[pd.isnull(df['SalePrice'])].drop('SalePrice', axis=1).as_matrix()
    return train_data, train_labels, test_data

## Explore the data

Length of the data:

In [8]:
print("Train set: {}".format(len(train)))
print("Test set: {}".format(len(test)))

Train set: 1460
Test set: 1459


## Data preprocessing

In [152]:
from sklearn.preprocessing import Imputer, LabelEncoder

data = all_csv.copy()

# columns_types = data.columns.to_series().groupby(data.dtypes).groups

cat_columns = ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond',
'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation',
'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope',
'LotConfig', 'LotShape', 'MSSubClass', 'MSZoning', 'MasVnrType', 'MiscFeature',
'MoSold', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle',
'SaleCondition', 'SaleType', 'Street', 'Utilities']

lin_columns = ['BsmtUnfSF', '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1',
'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'EnclosedPorch', 'Fireplaces',
'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath',
'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea', 'MiscVal',
'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'ScreenPorch',
'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd',
'YrSold']

for column in cat_columns:
    data[column].fillna('NAN')
    onehot = pd.get_dummies(data[column])
    data = pd.concat([data, onehot], axis=1, join_axes=[data.index])
    data.drop(column, axis=1, inplace=True)
    
for column in lin_columns:
    data[column] = data[column].fillna(data[column].mean())

## Build the Model

Split train/test data and labels:

In [153]:
train_data, train_labels, test_data = split_train_test(data)

Prepare the loss function as in kaggle:

In [216]:
from sklearn.metrics import mean_squared_error
def scorer(model, X, y):
    preds = np.abs(model.predict(X))
    return sqrt(mean_squared_error(np.log(preds), np.log(y)))

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV, KFold

params = {
    'n_estimators': [5, 10, 20],
    'criterion': ['mse', 'mae'],
    'max_features': ['auto', 'sqrt', 'log2'],
}

grid_search = GridSearchCV(ExtraTreesRegressor(), param_grid=params, cv=10, scoring=scorer)
grid_search.fit(train_data, train_labels)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Extract and train the best classifier found:

In [218]:
from sklearn.base import clone

model = grid_search.best_estimator_
model = clone(model)
model.fit(train_data, train_labels)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)

### Neural network

In [212]:
import tensorflow as tf
from keras import backend as K

sess = tf.InteractiveSession()
K.set_session(sess)

In [213]:
from sklearn.model_selection import train_test_split

X, V, y_, yv_ = train_test_split(train_data, train_labels, test_size=0.2)

In [219]:
%%capture
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization

model = Sequential()
model.add(BatchNormalization(input_shape=(315,)))
model.add(Dense(512))
model.add(BatchNormalization())
model.add(Dense(128))
model.add(Dense(1))
model.add(Activation('relu'))
model.summary()

def lrmse(y, y_):
    return tf.sqrt(tf.reduce_mean(tf.square(tf.sub(tf.log(y+e),tf.log(y_+e)))))

model.compile(optimizer=RMSprop(0.1), loss='msle', metrics=[lrmse])

with tf.device('/gpu:0'):
    model.fit(X, y_, validation_data=(V, yv_), nb_epoch=30)

In [220]:
with tf.device('/gpu:0'):
    preds = np.abs(model.predict(V))
    print(sqrt(mean_squared_error(np.log(preds.ravel() + 1e-50), np.log(yv_))))

0.225081043953


In [191]:
%%capture
with tf.device('/gpu:0'):
    model.fit(V, yv_, nb_epoch=30)

## Predictions on test data

Make the predictions:

In [163]:
preds = model.predict(test_data)
preds = preds.ravel()

Save predictions to a file:

In [164]:
import time
from datetime import datetime

ctime = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
save_predictions(ctime, zip(test_csv['Id'], preds))