# 2. Feature Engineering

Second attempt at building a model to predict house prices. 

In [1]:
import os
import platform
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Check which platform is running the notebook
if platform.system() == 'Windows':
    PROJECT_PATH = "\\".join(os.getcwd().split('\\')[:-1])
else:
    # Assuming a Unix based platform
    PROJECT_PATH = "/".join(os.getcwd().split('/')[:-1])

DATA_PATH = os.path.join(PROJECT_PATH, 'data')
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'train.csv')

# Load the training dataset
house_prices_train = pd.read_csv(TRAIN_DATA_PATH)
house_prices_train = house_prices_train.drop('Id', axis=1)
house_prices_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Split into training and testing set

In [2]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42
test_size = 0.3
target_feature = 'SalePrice'

X = house_prices_train.drop(target_feature, axis=1)
y = house_prices_train[target_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=test_size)

In [3]:
def remove_missing_features(X, threshold=0.8, verbose=False):
    features = X.columns
    features_to_remove = []
    dataset_size = X.shape[0]
    for f in features:
        missing_count = X[X[f].isna()].shape[0]
        missing_ratio = missing_count / dataset_size
        if missing_ratio > threshold:
            features_to_remove.append(f) 
            if verbose:
                print("{:14}{:.3f}%".format(f, missing_ratio * 100))
    return X.drop(features_to_remove, axis=1)

X_train_df = remove_missing_features(X_train, threshold=0.5, verbose=True)

Alley         93.542%
PoolQC        99.511%
Fence         80.235%
MiscFeature   96.086%


## Remove Features with a single value and highly correlated features

In [4]:
def remove_single_values(X, verbose=False):
    features = X.columns
    features_to_remove = []
    for f in features:
        value_count = X[f].nunique()
        if value_count == 1:
            features_to_remove.append(f)
            if verbose:
                print('Removing ', f)
    return X.drop(features_to_remove, axis=1)

def remove_highly_correlate_features(X, threshold=0.9):
    corr = X.corr().abs()
    
    # Select upper triangle of correlation matrix
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
    to_remove = [column for column in upper.columns if any(upper[column] > threshold)]
    return X.drop(to_remove, axis=1)

X_train_df = remove_single_values(X_train_df)
X_train_df = remove_highly_correlate_features(X_train_df)

## Remove target feature outliers

In [5]:
quantiles = y_train.quantile([0.25, 0.75]).values

# Q1 at index 0 and Q3 at index 1
iqr = quantiles[1] - quantiles[0]
outlier_threshold = quantiles[1] + (1.5 * iqr)
outlier_threshold

342500.0

In [6]:
feature_inliers_indices = np.where(y_train < outlier_threshold)[0]

# Extract the inliers from the dataset
X_train_df = X_train_df.iloc[feature_inliers_indices]
y_train = y_train.iloc[feature_inliers_indices]

# Validate they are the same size 
assert X_train_df.shape[0] == y_train.shape[0]

## Feature Engineering


### Numeric Features

In [7]:
X_train_df.shape

(981, 75)

In [8]:
numeric_features = X_train_df.select_dtypes(['int64', 'float64'])
numeric_features.shape

(981, 36)

In [9]:
numeric_features.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [10]:
numeric_cols = numeric_features.columns.tolist()

Creating new features:

In [11]:
overall_score = (numeric_features.OverallQual + numeric_features.OverallCond) / 2

In [12]:
upper_total_ft = numeric_features['1stFlrSF'] + numeric_features['2ndFlrSF']

In [13]:
future_garage = (numeric_features.GarageYrBlt == numeric_features.YearBuilt).astype(int)

In [14]:
build_in_20_centry = (numeric_features.YearBuilt < 2000).astype(int)
build_in_20_centry.value_counts()

1    734
0    247
Name: YearBuilt, dtype: int64

### Categorical Features

In [15]:
categorical_features = X_train_df.select_dtypes(['object'])
categorical_features.shape

(981, 39)

In [16]:
categorical_cols = categorical_features.columns
categorical_cols

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

Get the ordinal features

In [17]:
# Define the ordinal features
ordinal_cols = [
    'LotShape',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
]

# Extract the categorical features from the ordinal features
categorical_cols = categorical_cols[~categorical_cols.isin(ordinal_cols)]

## Processing the data

In [18]:
from sklearn.model_selection import cross_val_score

def cross_val_regression(clf, X, y, cv=3):
    neg_mse_scores = cross_val_score(clf, X, y, scoring='neg_mean_squared_error', cv=cv)
    neg_log_scores = cross_val_score(clf, X, y, scoring='neg_mean_squared_log_error', cv=cv)
    return {
        'rmse': (np.sqrt(-neg_mse_scores)).mean(),
        'rmlse': (np.sqrt(-neg_log_scores)).mean()
    }

In [19]:
from sklearn.base import TransformerMixin

class CreateNumericFeatures(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        overall_score = (numeric_features['OverallQual'] + numeric_features['OverallCond']) / 2
        upper_total_ft = numeric_features['1stFlrSF'] + numeric_features['2ndFlrSF']
        future_garage = (numeric_features['GarageYrBlt'] == numeric_features['YearBuilt']).astype(int)
        build_in_20_centry = (numeric_features['YearBuilt'] < 2000).astype(int)
        X.drop(['OverallQual', 'OverallCond', '1stFlrSF', '2ndFlrSF', 'GarageYrBlt', 'YearBuilt'], axis=1)
        X['OverallScore'] = overall_score
        X['UpperFlrSF'] = upper_total_ft
        X['GarageLater'] = future_garage
        X['BuildIn20Centuary'] = build_in_20_centry
        return X
    

class CombineGarageFeatures(TransformerMixin):
    def __init__(self):
        self.ratings = {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            'NA': 0
        }
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        convert_rating = lambda x : self.ratings.get(x, 0) 
        garage_qual = X['GarageQual'].apply(convert_rating)
        gargage_condition = X['GarageQual'].apply(convert_rating)
        garage_rating = (garage_qual + gargage_condition) / 2
        X['GarageRating'] = garage_rating
        X.drop(['GarageQual', 'GarageQual'], axis=1)
        return X
    
    
class ExteriorCondition(TransformerMixin):
    def __init__(self):
        self.ratings = {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1
        }
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        convert_rating = lambda x : self.ratings.get(x, 0) 
        exterior_qual = X['ExterQual'].apply(convert_rating)
        exterior_condition = X['ExterCond'].apply(convert_rating)
        exterior_rating = (exterior_qual + exterior_condition) / 2
        X['ExterRating'] = exterior_rating
        X.drop(['ExterQual', 'ExterCond'], axis=1)
        return X

In [20]:
numeric_cols = [
     'MSSubClass',
     'LotFrontage',
     'LotArea',
     'YearRemodAdd',
     'MasVnrArea',
     'BsmtFinSF1',
     'BsmtFinSF2',
     'BsmtUnfSF',
     'TotalBsmtSF',
     'LowQualFinSF',
     'GrLivArea',
     'BsmtFullBath',
     'BsmtHalfBath',
     'FullBath',
     'HalfBath',
     'BedroomAbvGr',
     'KitchenAbvGr',
     'TotRmsAbvGrd',
     'Fireplaces',
     'GarageCars',
     'GarageArea',
     'WoodDeckSF',
     'OpenPorchSF',
     'EnclosedPorch',
     '3SsnPorch',
     'ScreenPorch',
     'PoolArea',
     'MiscVal',
     'MoSold',
     'YrSold',
     'OverallScore',
     'UpperFlrSF',
     'GarageRating',
     'ExterRating',
     'BuildIn20Centuary'
]

categorical_features = [
    'LotShape',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu'
]

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log1p))
])

ordinal_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])


processing_pipeline = Pipeline([
    ('numeric_features', CreateNumericFeatures()),
    ('garage', CombineGarageFeatures()),
    ('extorior', ExteriorCondition()),
    ('processing', ColumnTransformer([
        ('numeric', numeric_pipeline, numeric_cols),
        ('ordinal', ordinal_pipeline, ordinal_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]))
])

In [22]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def root_mean_log_error(y_true, y_pred):
    try:
        return np.sqrt(mean_squared_log_error(y_true, y_pred))
    except ValueError:
        return -1000

## Creating a baseline before Feature Engineering

In [23]:
X_train_processed = processing_pipeline.fit_transform(X_train_df)

In [24]:
X_train_processed.shape

(981, 221)

In [25]:
from sklearn.ensemble import RandomForestRegressor

rand_forest_baseline = RandomForestRegressor(random_state=RANDOM_STATE)
scores = cross_val_regression(rand_forest_baseline, X_train_processed, y_train)

In [26]:
print('Baseline Random Forest RMLSE: {:.3f}'.format(scores['rmlse']))
print('Baseline Random Forest RMSE: {:.3f}'.format(scores['rmse']))

Baseline Random Forest RMLSE: 0.146
Baseline Random Forest RMSE: 22849.456


Our baseline from using ordinal and categorical features (compared to just categorical features) is already matching the best model from the previous notebook.

### K Best Features

In [27]:
X_train_processed.shape

(981, 221)