In [27]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

from math import sqrt
import os
from pathlib import Path

import pandas as pd
import numpy as np
from IPython.display import display, FileLink
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# House prices predictions: LightGBM

For more information about the feature engineering in this notebook, see the [Linear Regression walkthrough](./linear-regression.ipynb).

## 1. Download dataset

In [4]:
# Directory that holds the competition files and, later, the submission CSV.
PATH = Path('./data')
# exist_ok=True lets this cell be re-run once the directory already exists.
PATH.mkdir(exist_ok=True)

In [5]:
!kaggle competitions download -c house-prices-advanced-regression-techniques -p {PATH}

data_description.txt: Downloaded 13KB of 13KB to data
train.csv.gz: Downloaded 89KB of 89KB to data
train.csv: Downloaded 450KB of 450KB to data
test.csv.gz: Downloaded 82KB of 82KB to data
test.csv: Downloaded 441KB of 441KB to data
sample_submission.csv.gz: Downloaded 15KB of 15KB to data
sample_submission.csv: Downloaded 31KB of 31KB to data


## 2. Prepare dataset

In [5]:
# Load the training set from the data directory.
df_raw = pd.read_csv(PATH / 'train.csv')

### Extract and prepare target values

In [6]:
# pop() removes the target from the feature frame so it cannot be used as an input.
sale_price = df_raw.pop('SalePrice')
# The model is trained on the natural log of the price; predictions are
# converted back with np.exp before the submission is written.
sale_price_log = np.log(sale_price)

### Find columns

In [7]:
# Take the Id column out of the feature frame, keeping its values.
house_ids = df_raw.pop('Id')

In [8]:
# A column with more distinct values than this is treated as continuous.
MAX_N_UNIQUE = 50

# First pass: pick continuous columns purely by their number of distinct values.
continuous_columns = {
    name
    for name, values in df_raw.items()
    if len(values.unique()) > MAX_N_UNIQUE
}

In [9]:
# Second pass: add columns by hand that should also be handled as continuous
# (presumably numeric counts/areas that have too few distinct values to pass
# the MAX_N_UNIQUE threshold -- confirm against the data description).
#
# set(...) makes the cell safe to re-run after `continuous_columns` has already
# become a list, and sorted(...) gives a column order that does not depend on
# string hashing, so it is the same after every kernel restart.
continuous_columns = sorted(set(continuous_columns) | {
    'LowQualFinSF', 'BsmtHalfBath', 'BsmtFullBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', '3SsnPorch',
    'PoolArea', 'MiscVal', 'YrSold', 'Fireplaces'})

In [10]:
# Every column that is not continuous is handled as a categorical feature.
categorical_columns = [
    name
    for name in df_raw.columns
    if name not in continuous_columns
]

In [11]:
# Sanity check: the two lists together account for every column exactly once.
assert len(categorical_columns) + len(continuous_columns) == len(df_raw.columns)

### Prepare categorical

In [12]:
# Convert each categorical column to an ordered pandas category dtype.
for name in categorical_columns:
    as_category = df_raw[name].astype('category')
    df_raw[name] = as_category.cat.as_ordered()

In [13]:
# Quality measures (Excellent, Good, Average, Fair, Poor)
df_raw.ExterQual.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.ExterCond.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.BsmtQual.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.BsmtExposure.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.BsmtFinType1.cat.set_categories(['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf'], ordered=True, inplace=True)
df_raw.BsmtFinType2.cat.set_categories(['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf'], ordered=True, inplace=True)
df_raw.HeatingQC.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.KitchenQual.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.FireplaceQu.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.GarageFinish.cat.set_categories(['Fin', 'Rfn', 'Unf'], ordered=True, inplace=True)
df_raw.GarageQual.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.GarageCond.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa', 'Po'], ordered=True, inplace=True)
df_raw.PoolQC.cat.set_categories(['Ex', 'Gd', 'TA', 'Fa'], ordered=True, inplace=True)

### Prepare continuous

In [14]:
# Median used for each continuous column that had missing values; kept so the
# test set can be filled with the same numbers.
nas = {}

for name in continuous_columns:
    missing = df_raw[name].isna()
    if not missing.any():
        # Nothing to impute for this column.
        continue

    fill_value = df_raw[name].median()

    # Keep an indicator column so the model can still see which rows were imputed.
    df_raw[f'{name}_na'] = missing
    df_raw[name] = df_raw[name].fillna(fill_value)

    nas[name] = fill_value

In [15]:
# Show the median recorded for each imputed column.
nas

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1980.0}

In [16]:
# LotArea values before standardisation.
df_raw['LotArea'].head()

0     8450
1     9600
2    11250
3     9550
4    14260
Name: LotArea, dtype: int64

In [17]:
# Standardise the continuous features. The fitted scaler is kept so the same
# transformation can be applied to the test set later.
scaler = StandardScaler().fit(df_raw[continuous_columns])
df_raw[continuous_columns] = scaler.transform(df_raw[continuous_columns])

In [18]:
# The same LotArea rows after standardisation.
df_raw['LotArea'].head()

0   -0.207142
1   -0.091886
2    0.073480
3   -0.096897
4    0.375148
Name: LotArea, dtype: float64

### Numericalise

In [19]:
# Replace every categorical column with its integer codes in a new frame.
# Missing values have code -1, so adding 1 maps them to 0 and shifts the real
# levels to 1..n.
df_numeric = df_raw.assign(**{
    name: df_raw[name].cat.codes + 1
    for name in categorical_columns
})

### Create validation set

In [20]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df_numeric.values,
    sale_price_log,
    test_size=0.2,
    random_state=42,
)

## 3. Train and evaluate model

In [45]:
# LightGBM regressor created with its default hyperparameters.
model = LGBMRegressor()

In [46]:
# Fit on the training split only. The call is the cell's last expression, so
# the fitted estimator and its settings are displayed.
model.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

### Basic evaluation

In [47]:
# Print the model's score on the training split, then on the validation split.
for features, target in ((X_train, y_train), (X_val, y_val)):
    print(model.score(features, target))

0.9882413001301917
0.8965714266397214


In [48]:
# Predicted log sale prices for the validation split.
preds = model.predict(X_val)

In [49]:
# Root mean squared error, measured on the log-price scale.
squared_errors = (y_val - preds) ** 2
rms = sqrt(squared_errors.mean())
print(f'RMSE: {rms}')

RMSE: 0.13892865061650198


## 4. Submit predictions

We now need to prepare the test set using exactly the same preparation we used with the training data.

In [50]:
# Load the test set from the data directory.
df_test_raw = pd.read_csv(PATH / 'test.csv')

In [51]:
# Keep the test-set Ids for the submission file. This rebinds `house_ids`,
# replacing the training Ids stored under the same name.
house_ids = df_test_raw.pop('Id')

In [52]:
# Encode the test categoricals with the category levels of the training frame
# so that every level maps to the same integer code in both sets.
for name in categorical_columns:
    train_levels = df_raw[name].cat.categories
    df_test_raw[name] = pd.Categorical(
        df_test_raw[name], categories=train_levels, ordered=True)

In [53]:
# For the columns imputed during training, add the same `_na` indicator and
# fill the gaps with the median recorded from the training data.
for name, train_median in nas.items():
    df_test_raw[f'{name}_na'] = df_test_raw[name].isna()
    df_test_raw[name] = df_test_raw[name].fillna(train_median)

In [54]:
# Handle any other nas
df_test_raw[continuous_columns] = df_test_raw[continuous_columns].fillna(df_test_raw[continuous_columns].median())

In [55]:
# Apply the scaler that was fitted on the training data; it is not refitted here.
df_test_raw[continuous_columns] = scaler.transform(df_test_raw[continuous_columns])

In [56]:
# Replace every categorical column with its integer codes in a new frame.
# Missing or unlisted values have code -1, so adding 1 maps them to 0.
df_test = df_test_raw.assign(**{
    name: df_test_raw[name].cat.codes + 1
    for name in categorical_columns
})

In [57]:
# Predictions are on the log-price scale, like the training target.
test_preds = model.predict(df_test)

In [58]:
# np.exp undoes the log transform applied to the target before training.
submission = pd.DataFrame({'Id': house_ids, 'SalePrice': np.exp(test_preds)})
submission.to_csv(f'{PATH}/sub_lgbm.csv', index=False)

In [59]:
# Show a link to the written submission file in the cell output.
FileLink(PATH / 'sub_lgbm.csv')

Slightly better performance than a linear model, bumped up to 1999th place.

<img src="./images/submission-3.png">