# House Prices - Feature Engineering

Trying to improve the Root Mean Squared Log Error (RMSLE) score by engineering the features.

In [1]:
import os
import platform
import pandas as pd
import numpy as np

# The project root is the parent of the notebook's working directory.
# os.path.dirname handles the platform's path separator itself, so no
# per-platform branch is needed. It also still returns the filesystem root
# when the working directory sits directly below it, whereas a manual
# split/join yields an empty (or drive-relative) path in that case.
PROJECT_PATH = os.path.dirname(os.getcwd())

DATA_PATH = os.path.join(PROJECT_PATH, 'data')
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'train.csv')

# Load the training dataset and drop the 'Id' column
house_prices_train = pd.read_csv(TRAIN_DATA_PATH).drop('Id', axis=1)
house_prices_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Remove Missing values

Remove features that are missing more than 70% of their records.

In [15]:
data_features = house_prices_train.columns
missing_record_threshold = 0.7

# Fraction of missing records per feature, computed in one vectorized pass:
# the mean of a boolean mask is the share of True values in it. This avoids
# building a row-filtered copy of the whole frame for every single column.
missing_record_ratios = house_prices_train.isna().mean()

# Isolate features that have more than 70% of their records missing
high_missing_ratios = missing_record_ratios[
    missing_record_ratios > missing_record_threshold
]
features_to_remove = high_missing_ratios.index.tolist()

# Report each feature to be removed with its missing percentage
for feature, missing_record_ratio in high_missing_ratios.items():
    print('{}: {:.2f}%'.format(feature, missing_record_ratio * 100))

Alley: 93.77%
PoolQC: 99.52%
Fence: 80.75%
MiscFeature: 96.30%


In [12]:
# Build the working frame: the original data without the mostly-missing features
training_data_df = house_prices_train.drop(columns=features_to_remove)

In [13]:
# Summarize the working frame: per-column non-null counts and dtypes
training_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 no

## Removing Correlated Features

## Numeric Features

Working on the numeric features to understand where the data can be improved

In [16]:
# Keep only the 64-bit integer and floating point columns
numeric_dtypes = ['int64', 'float64']
numeric_data_df = training_data_df.select_dtypes(include=numeric_dtypes)
numeric_data_df.shape

(1460, 37)

In [19]:
numeric_cols = numeric_data_df.columns