In [1]:
import os
import sys
# Put the parent directory on the import path (only once) so that modules
# living one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Unless warning options were supplied when Python was started, silence all
# warnings for the rest of the session.
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter("ignore")
    
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one, so cells containing several bare expressions show all of their results.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Load Data

In [2]:
# Read the raw house-price table from the CSV file next to this notebook.
df = pd.read_csv('./house_prices.csv')

# Target: the column the model has to predict.
y = df['Price']

# Features: every remaining column.
x = df.drop(columns=['Price'])

# Optional outlier removal (currently disabled).
# NOTE(review): `outliers` looks like a 1-D mask here, so `.all(axis=1)` would
# probably fail if this were re-enabled -- confirm before uncommenting.
# from scipy import stats
# outliers = (np.abs(stats.zscore(y)) < 3)
# x = x[outliers.all(axis=1)]
# y = y[outliers.all(axis=1)]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

## Build Random Forest Model

In [3]:

from functions import currency
from functions import build_score_randomforest

## Numeric Features

In [4]:
# Keep only the features stored as 64-bit integers or floats.
numeric_dtypes = ['int64', 'float64']
col_num = [name for name, dtype in x.dtypes.items() if dtype in numeric_dtypes]

# Numeric-only view of the feature table, used by the experiments below.
x_num = x[col_num]


## How do we deal with missing data?

### Approach 1: Drop columns with missing values

In [5]:
# Approach 1: drop every numeric feature that has at least one missing value.
x_num_train, x_num_test, y_train, y_test = train_test_split(x_num, y, test_size=0.2, random_state=1)

# Decide which columns to drop ONCE and apply the same decision to both splits.
# Calling dropna(axis='columns') on each split independently can remove
# different columns from train and test (a column may have gaps in only one of
# them), which leaves the model with mismatched feature sets.
# The split partitions x_num, so a column with a gap anywhere in x_num has a
# gap in the train split, the test split, or both.
cols_with_missing = x_num.columns[x_num.isna().any()]
x_num_train = x_num_train.drop(columns=cols_with_missing)
x_num_test = x_num_test.drop(columns=cols_with_missing)

print('MAE from Approach 1 (Drop features with missing values):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

MAE from Approach 1 (Drop features with missing values):


176556.1092096132

### Approach 2: Fill missing data values with imputation

In [6]:
# Approach 2: keep every numeric feature and fill the gaps instead.
# The split uses a fixed random_state, so it is computed once and every
# imputation strategy below starts from the same untouched train/test frames
# (ffill/fillna return new frames and leave their input unchanged).
x_num_train_raw, x_num_test_raw, y_train, y_test = train_test_split(x_num, y, test_size=0.2, random_state=1)

# Replace with the previous row's value (forward fill).
# NOTE(review): forward fill cannot fill a gap in the first row of a split --
# confirm the scoring helper tolerates any NaN left behind.
x_num_train = x_num_train_raw.ffill()
x_num_test = x_num_test_raw.ffill()
print('MAE from Approach 2 (Replace missing features with forward fill):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

# Replace with 0
x_num_train = x_num_train_raw.fillna(0)
x_num_test = x_num_test_raw.fillna(0)
print('MAE from Approach 2 (Replace missing features with 0):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

# Replace with mean
# The means come from the training split only and are reused for the test
# split, so no information from the test rows leaks into the imputation.
train_means = x_num_train_raw.mean()
x_num_train = x_num_train_raw.fillna(train_means)
x_num_test = x_num_test_raw.fillna(train_means)
print('MAE from Approach 2 (Replace missing features with the column mean):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

MAE from Approach 2 (Replace missing features with foward fill):


172541.71958447297

MAE from Approach 2 (Replace missing features with foward fill):


167656.98217318885

MAE from Approach 2 (Replace missing features with foward fill):


166170.5766405428

In [7]:
# Going forward, let us replace all missing numeric values in the full
# train/test feature tables with the column mean.

x_num_train, x_num_test, y_train, y_test = train_test_split(x_num, y, test_size=0.2, random_state=1)

# Actually perform the imputation: without these fillna calls the freshly split
# frames still contain their missing values and the assignment below would
# change nothing. Means are taken from the training split only and reused for
# the test split.
train_means = x_num_train.mean()
x_num_train = x_num_train.fillna(train_means)
x_num_test = x_num_test.fillna(train_means)

# Work on explicit copies so the assignment does not write into a slice of the
# original feature table. The split above uses the same parameters as the one
# that produced x_train/x_test, so the row indices line up.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[col_num] = x_num_train[col_num]
x_test[col_num] = x_num_test[col_num]

## Notes

### Ordinal 

Non-numeric values, i.e. strings.
You would not use string data on its own, but you may be able to convert the string into other useful information, e.g. converting a date string to the day of the week (DOW).

### Non-Ordinal

Numeric values, i.e. ints and floats.