## Notes

### Ordinal 

Non-numeric values, e.g. strings.
You would not use string data on its own, but you may be able to convert a string into other useful information, e.g. converting a date string to the day of the week (DOW).

### Non-Ordinal

Numeric values, e.g. ints and floats.

In [79]:
import os
import sys
# Put the parent directory on sys.path (once) so that modules stored one level
# above this notebook's working directory can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Silence all warnings, unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Load Data

In [80]:
# Read the raw house-price table from the CSV stored next to this notebook.
df = pd.read_csv('./house_prices.csv')

# Features are every column except the target; the target is the sale price.
x = df.drop(columns=['Price'])
y = df['Price']

# TODO: outlier removal (dropping rows whose target z-score is >= 3 via
# scipy.stats.zscore) was sketched here but is currently disabled.

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

## Build Random Forest Model

In [81]:

from functions import currency
from functions import build_score_randomforest

## Numeric Features

In [82]:
# Numeric features: the columns stored as 64-bit integers or 64-bit floats.
x_num = x.select_dtypes(include=['int64', 'float64'])
col_num = list(x_num.columns)

## How do we deal with missing data?

### Approach 1: Drop columns with missing values

In [83]:
# Approach 1: discard every numeric feature that has at least one missing value.
x_num_train, x_num_test, y_train, y_test = train_test_split(x_num, y, test_size=0.2, random_state=1)

# Decide which columns to drop once and apply that decision to both splits.
# Calling dropna(axis='columns') on each split independently can leave the two
# frames with different feature sets when a column has gaps in only one of them,
# which would break (or silently misalign) model fitting and prediction.
cols_with_missing = x_num_train.columns[x_num_train.isna().any() | x_num_test.isna().any()]
x_num_train = x_num_train.drop(columns=cols_with_missing)
x_num_test = x_num_test.drop(columns=cols_with_missing)

print('MAE from Approach 1 (Drop features with missing values):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

MAE from Approach 1 (Drop features with missing values):


176556.1092096132

### Approach 2: Fill missing data values with imputation

In [84]:
# Approach 2: keep every numeric feature and fill the gaps instead of dropping columns.
# The split is deterministic (fixed random_state), so it is done once and each
# imputed variant is derived from the same untouched train/test frames.
x_num_train_raw, x_num_test_raw, y_train, y_test = train_test_split(x_num, y, test_size=0.2, random_state=1)

# 2a. Forward fill: copy the previous row's value into each gap.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
# NOTE: a gap in the very first row of a split has no predecessor and stays missing.
x_num_train = x_num_train_raw.ffill()
x_num_test = x_num_test_raw.ffill()
print('MAE from Approach 2 (Replace missing features with forward fill):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

# 2b. Replace with 0
x_num_train = x_num_train_raw.fillna(0)
x_num_test = x_num_test_raw.fillna(0)
print('MAE from Approach 2 (Replace missing features with 0):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

# 2c. Replace with mean
# The column means come from the training split only and are reused for the
# test split, so no information from the test rows leaks into the imputation.
train_means = x_num_train_raw.mean()
x_num_train = x_num_train_raw.fillna(train_means)
x_num_test = x_num_test_raw.fillna(train_means)
print('MAE from Approach 2 (Replace missing features with training mean):')
build_score_randomforest(x_num_train, y_train, x_num_test, y_test)

MAE from Approach 2 (Replace missing features with foward fill):


172541.71958447297

MAE from Approach 2 (Replace missing features with foward fill):


167656.98217318885

MAE from Approach 2 (Replace missing features with foward fill):


166170.5766405428

(10864, 12)

(10864,)

(2716, 12)

(2716,)

In [85]:
# Going forward, replace all missing numeric values in x_train / x_test with the
# column mean computed on the training split.

x_num_train, x_num_test, y_train, y_test = train_test_split(x_num, y, test_size=0.2, random_state=1)

# The column assignment below aligns on the row index, so it is only correct if
# this split selected exactly the same rows as the split that produced
# x_train / x_test. Fail loudly if that ever stops being true.
assert x_train.index.equals(x_num_train.index), "x_train and x_num_train rows differ"
assert x_test.index.equals(x_num_test.index), "x_test and x_num_test rows differ"

# Means come from the training rows only and are reused for the test rows.
train_means = x_num_train.mean()

# Work on explicit copies so the assignment never writes into a frame that is
# still tied to the object returned by train_test_split.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[col_num] = x_num_train.fillna(train_means)
x_test[col_num] = x_num_test.fillna(train_means)

## Non-numeric Features

In [86]:
# List the dtype of every feature column; the `object` columns are the non-numeric features.
x.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Latitude         float64
Longitude        float64
Regionname        object
Propertycount    float64
dtype: object

In [87]:
# Categorical candidates: object-typed columns with fewer than 10 distinct values.
col_obj = x.select_dtypes(include='object')
is_low_cardinality = col_obj.nunique() < 10
col_obj = col_obj.loc[:, is_low_cardinality]

col_cat = col_obj.columns.tolist()
col_cat

['Type', 'Method', 'Regionname']

In [91]:
# Label encoding: map each low-cardinality categorical column to integer codes.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode into copies so x_train / x_test keep their original category values.
xle_train, xle_test = x_train.copy(), x_test.copy()
xle_train.shape
xle_test.shape

# The single encoder is re-fitted on the training values of each column in turn,
# and the mapping it learned is then applied to the matching test column.
# NOTE: LabelEncoder.transform raises ValueError for a value it was not fitted on.
for cat_name in col_cat:
    le.fit(x_train[cat_name])
    xle_train[cat_name] = le.transform(x_train[cat_name])
    xle_test[cat_name] = le.transform(x_test[cat_name])

# Model inputs: the numeric columns plus the encoded categorical columns.
model_cols = col_num + col_cat
xle_train[model_cols].shape
xle_test[model_cols].shape

print('MAE from Label Encoding Categorical Columns:')
build_score_randomforest(xle_train[model_cols], y_train, xle_test[model_cols], y_test)

(10864, 20)

(2716, 20)

(10864, 15)

(2716, 15)

MAE from Label Encoding Categorical Columns:


156274.91509818358

## Build a Gradient Boosting Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Model inputs: the numeric columns plus the label-encoded categorical columns.
gbr_features = col_num + col_cat

gbr = GradientBoostingRegressor(
    n_estimators=5000,
    learning_rate=.01,
    max_depth=5,
    random_state=1,
)
gbr.fit(xle_train[gbr_features], y_train)

# Score on the held-out rows with mean absolute error.
y_test_predict = gbr.predict(xle_test[gbr_features])
mae = mean_absolute_error(y_test, y_test_predict)

print('MAE from Gradient Boosting Model:', mae, sep='\n')

In [None]:
# Extreme gradient boosting (XGBoost), configured with the same n_estimators,
# learning_rate, max_depth and random_state as the scikit-learn
# GradientBoostingRegressor so the two scores can be compared.

from xgboost import XGBRegressor

# NOTE: this rebinds `gbr`, replacing any model previously stored under that name.
gbr = XGBRegressor(n_estimators=5000, learning_rate=.01, max_depth=5, random_state=1)
gbr.fit(xle_train[col_num+col_cat], y_train)
y_test_predict = gbr.predict(xle_test[col_num+col_cat])
mae = mean_absolute_error(y_test, y_test_predict)

# Label the score with the model that produced it; the previous label was
# identical to the scikit-learn cell's, making the two results indistinguishable.
print('MAE from XGBoost Model:')
print(mae)