# Ames Housing Dataset -  Feature Selection

> Gianmaria Pizzo - 872966@stud.unive.it

These notebooks represent the project submission for the course [Data and Web Mining](https://www.unive.it/data/course/337525) by Professor [Claudio Lucchese](https://www.unive.it/data/people/5590426) at [Ca' Foscari University of Venice](https://www.unive.it).

---

## Structure of this notebook

This notebook covers the following points
* Feature Selection (from domain knowledge and previous evidence)
* Feature Importance (assessed through XGBoost)
* Recursive Feature Elimination (assessed through XGBoost and Random Forest)
* Final Dataset Overview

---

### Before running this notebook

To avoid issues, before running the following notebook it is best to
* Clean previous cell outputs
* Restart the kernel

---

###  Environment, Imports and Global Variables

In [1]:
# Interactive
%matplotlib notebook
# Static
# %matplotlib inline

# Environment for this notebook
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import warnings
import IPython

# Set the style for the plots (the three calls are applied in this order)
sns.set()
plt.style.use('ggplot')
sns.set_style("darkgrid")
# Ignore warnings: this silences every warning category for the rest of the
# session, which keeps the cell outputs compact but also hides deprecation notices
warnings.filterwarnings('ignore')

# Folder the notebook is executed from
WORKING_DIR = os.getcwd()
# Folder with the CSV files, located directly under the working folder
RESOURCES_DIR = os.path.join(WORKING_DIR, 'resources')
# Names of the two input files read from the resources folder
IN_LABEL = 'ames_housing_out_1.csv'
ORIG_LABEL = 'ames_housing_out_1_orig.csv'

In [2]:
# Utils Module

def sort_alphabetically(dataset, last_label = None):
    """
    Returns the dataset with its columns in sorted order, optionally
    with one column moved to the last position

    :param dataset: a pd.DataFrame
    :param last_label: a str containing an existing column label in the dataset;
                       when given, that column ends up last
    :returns: pd.DataFrame
    """
    # Re-order the columns on a re-indexed frame
    ordered = dataset.reindex(columns=sorted(dataset.columns))
    if last_label is None:
        return ordered
    # Detach the requested column, then append it back as the final column
    trailing = ordered.pop(last_label)
    ordered[last_label] = trailing
    return ordered

In [3]:
# Module for correlation analysis

def display_corr_matrix(dataset: pd.core.frame.DataFrame, label: str, exclude = None):
    """
    Displays the correlation matrix as a heatmap and returns it

    The frame passed in is left untouched: excluding columns and moving
    `label` to the last position are both done on a new frame.

    : param dataset: pd.core.frame.DataFrame, must not be empty
    : param label: str representing an existing, non-object column; it is
                   placed last so that it closes the matrix
    : param exclude: list of str of features to exclude from the matrix
    : returns: correlation matrix, as computed by DataFrame.corr()
    : raises TypeError: dataset is not a DataFrame, label is not a str,
                        or exclude is not a list of str
    : raises ValueError: dataset is empty, label/exclude entries are not
                         columns, label has object dtype, or label is excluded
    """
    # The type must be checked before `.empty`, which only exists on pandas objects
    if not isinstance(dataset, pd.core.frame.DataFrame):
        raise TypeError("Dataset is either empty or not a valid pandas DataFrame")
    if dataset.empty:
        raise ValueError("Dataset is either empty or not a valid pandas DataFrame")
    if not isinstance(label, str):
        raise TypeError("label is not a string")
    if label not in dataset.columns:
        raise ValueError("label not in dataset")
    if dataset[label].dtype == object:
        raise ValueError("Categorical features are not allowed")

    excluded = set()
    if exclude is not None:
        if not isinstance(exclude, list):
            raise TypeError("exclude must be a list")
        for i in exclude:
            if not isinstance(i, str):
                raise TypeError("exclude must be a list of string")
            if i not in dataset.columns:
                raise ValueError(i + " is not a valid column label")
        if label in exclude:
            raise ValueError("label cannot be excluded")
        excluded = set(exclude)

    # Keep the non-excluded columns in their current order, target column last.
    # Selecting by a list of labels builds a new frame, so `dataset` is not mutated.
    kept = [c for c in dataset.columns if c != label and c not in excluded]
    frame = dataset[kept + [label]]

    plt.figure(figsize=(20,20))

    matrix = frame.corr()

    cmap = sns.diverging_palette(250, 15, s=75, l=40,
                                 n=9, center="light", as_cmap=True)
    # Hide the upper triangle (diagonal included): the matrix is symmetric
    mask = np.triu(np.ones_like(matrix, dtype=bool))

    sns.heatmap(matrix, mask=mask, center=0, annot=True,
                 fmt='.2f', square=True, cmap=cmap)
    return matrix


def top_correlated_features(corr_matrix, label, threshold=0.45):
    """
    Returns a dictionary of the features most correlated with label

    The column `label` of `corr_matrix` is read, sorted in descending order,
    and every entry whose absolute value reaches `threshold` is kept.
    Entries whose name contains `label` as a substring are skipped: this
    removes the label itself and also derived columns that embed its name
    (e.g. 'Log1p_Sale_Price' when label is 'Sale_Price').

    :param corr_matrix: a pd.DataFrame correlation matrix having `label` among its columns
    :param label: a str containing the column to compare the other features with
    :param threshold: a float, minimum absolute correlation to be reported
    :returns: dict mapping feature name to correlation, ordered from the
              highest to the lowest correlation value
    """
    # Use the matrix that was passed in, not a notebook-level variable
    ranked = corr_matrix[label].sort_values(ascending=False).to_dict()
    return {
        name: value
        for name, value in ranked.items()
        if label not in name and (value >= threshold or value <= -threshold)
    }


def display_correlation(sample_dict, exclude=None, corr_matrix=None, threshold=.60):
    """
    Prints, for every feature in sample_dict, the features most correlated with it

    :param sample_dict: an iterable of feature names (e.g. the dict returned by
                        top_correlated_features); only its keys are used
    :param exclude: optional collection of feature names to leave out of the printout
    :param corr_matrix: correlation matrix to look the features up in; when omitted
                        the notebook-level variable `corr` is used, as before
    :param threshold: minimum absolute correlation to be printed
    :returns: None
    """
    if corr_matrix is None:
        # Backward-compatible fallback on the global computed earlier in the notebook
        corr_matrix = corr
    for feature in sample_dict:
        print("Top correlated features with \"{0}\"".format(feature))
        related = top_correlated_features(corr_matrix, feature, threshold)
        # Distinct loop names: the inner loop no longer rebinds the outer variable
        for name, value in related.items():
            if exclude is not None and name in exclude:
                continue
            print("\t{0} : {1}".format(name, value))
        print("\n")

In [4]:
from sklearn.model_selection import train_test_split

# Module for train test split

def get_X_y(dataset, label, ignore=None):
    """
    Returns X and y and ignores labels in ignore

    :param dataset: a pd.DataFrame
    :param label: a str containing an existing target column label in the dataset
    :param ignore: a str or a list of str with column labels to leave out of X;
                   labels that are not columns of the dataset are skipped
    :returns: tuple of pd.DataFrame (X without label and ignored columns,
              y as a one-column frame holding label)
    """
    # A bare string is one label, not a container of characters: wrapping it
    # avoids substring matching (e.g. 'Sale_Price' in 'Log1p_Sale_Price')
    if ignore is None:
        ignored = set()
    elif isinstance(ignore, str):
        ignored = {ignore}
    else:
        ignored = set(ignore)
    # The target is always dropped below; do not list it twice
    ignored.discard(label)
    # Include only columns that are existing
    to_drop = [c for c in dataset.columns if c in ignored] + [label]
    return dataset.drop(columns=to_drop), dataset[[label]]

def get_train_test(X, y, size = 0.2, state = 33):
    """
    Splits X and y into a train part and a test part with train_test_split

    :param X: a pd.DataFrame without the target column
    :param y: a pd.DataFrame with one column, the target
    :param size: a float representing the fraction for the test size
    :param state: an integer representing the random state for the split
    :returns: the 4 objects returned by train_test_split, usually called
              "X_train, X_test, y_train, y_test"
    """
    split_options = dict(test_size=size, random_state=state)
    return train_test_split(X, y, **split_options)

def get_train_val_test(X, y, size_t=0.2, size_v=0.25, state_v = 42):
    """
    Splits X and y into train, validation and test parts

    The test part is cut first; the validation part is then cut from what
    remains, so size_v is a fraction of the non-test rows.

    :param X: a pd.DataFrame without the target column
    :param y: a pd.DataFrame with one column, the target
    :param size_t: a float representing the fraction for the test size
    :param size_v: a float representing the fraction for the validation
    :param state_v: an integer representing the random state for the validation
    :returns: 6 objects usually called X_train, X_valid, X_test, y_train, y_valid, y_test
    """
    # (X_rest, X_test, y_rest, y_test), with get_train_test's default random state
    outer = get_train_test(X, y, size=size_t)
    # (X_train, X_valid, y_train, y_valid), cut from the non-test rows
    inner = get_train_test(outer[0], outer[2], size=size_v, state=state_v)
    return inner[0], inner[1], outer[1], inner[2], inner[3], outer[3]

In [5]:
from xgboost import XGBRegressor
from xgboost import plot_importance
# Module for feature importances

def plot_feature_importances(model, dataset):
    """
    Draws a horizontal bar chart of model.feature_importances_

    :param model: a fitted estimator exposing `feature_importances_`
    :param dataset: a pd.DataFrame whose columns label the bars
                    (assumes these are the features the model was fitted on,
                    in the same order - TODO confirm at the call site)
    :returns: None
    """
    n_features = dataset.shape[1]
    positions = np.arange(n_features)
    plt.barh(positions, model.feature_importances_, align="center")
    # One tick per bar, labelled with the matching column name
    plt.yticks(positions, dataset.columns)
    plt.xlabel("importance")
    plt.ylabel("features")
    # show must be called, a bare attribute access renders nothing
    plt.show()

def plot_xgb_importance(dataset, target, ignore = None,  subset= None):
    """
    Fits an XGBRegressor on a train split of the dataset and plots its feature importance

    :param dataset: a pd.DataFrame containing the features and the target
    :param target: a str containing the target column label
    :param ignore: column labels forwarded to get_X_y to be left out of the features
    :param subset: not used by this function
    :returns: list with every feature name seen by the model during fit
              (model.feature_names_in_), not only the plotted ones
    """
    # Data: only the train and validation parts are used here
    features, labels = get_X_y(dataset=dataset, label=target, ignore=ignore)
    splits = get_train_val_test(features, labels, size_t=0.2, size_v=0.25, state_v = 42)
    X_train, X_valid, _, y_train, y_valid, _ = splits
    # Model
    hyper_params = dict(n_estimators=10,
                        max_depth=12,
                        max_leaves=12,
                        learning_rate=0.5,
                        importance_type='weight',
                        sampling_method='gradient_based',
                        random_state=292359329)
    booster = XGBRegressor(**hyper_params)
    # Fit, evaluating on both the train and the validation part
    monitored = [(X_train, y_train), (X_valid, y_valid)]
    booster.fit(X_train, y_train, eval_set=monitored, verbose=False)
    # Importance: at most 30 features are drawn
    plot_importance(booster, max_num_features = 30)
    plt.show()
    return list(booster.feature_names_in_)

In [6]:
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor

# Module for RFECV

def rfe_estim(dataset, target, estimator, cv, ignore=None):
    """
    Runs recursive feature elimination with cross-validation (RFECV),
    prints the outcome and returns the selected features

    :param dataset: a pd.DataFrame containing the features and the target
    :param target: a str containing the target column label
    :param estimator: the estimator handed to RFECV
    :param cv: the cross-validation argument handed to RFECV
    :param ignore: column labels forwarded to get_X_y to be left out of the features
    :returns: list of the column labels flagged in selector.support_
    """
    X, y = get_X_y(dataset=dataset, label=target, ignore=ignore)
    # One feature is removed per step and at least 30 are always kept
    rfecv_options = dict(step=1,
                         min_features_to_select=30,
                         scoring='neg_mean_squared_error',
                         n_jobs=-1,
                         importance_getter='auto')
    selector = RFECV(estimator=estimator, cv=cv, **rfecv_options).fit(X, y)

    print('The optimal number of features is {}'.format(selector.n_features_))
    # Keep the column names whose support flag is set
    features = [name for name, kept in zip(X.columns, selector.support_) if kept]
    print('The selected features are:')
    print('{}'.format(features))
    return features

In [7]:
# Read each CSV file, discard its 'Unnamed: 0' column and sort the columns
# with the target 'Sale_Price' placed last
df = sort_alphabetically(
    pd.read_csv(os.path.join(RESOURCES_DIR, IN_LABEL)).drop(columns='Unnamed: 0'),
    'Sale_Price',
)
df_orig = sort_alphabetically(
    pd.read_csv(os.path.join(RESOURCES_DIR, ORIG_LABEL)).drop(columns='Unnamed: 0'),
    'Sale_Price',
)

---

## Removing obvious misleading and irrelevant attributes

From our previous analysis we have some level of insight about what features we want to consider.
* `Latitude` and `Longitude` would be obvious candidates for removal, as these features are most likely to lead to overfitting; however, they are still needed for the GNN, so we leave them in for now.
* Other features are not really useful according to what we have learnt, and we are going to show this with feature importance.
* Others are the result of combinations or transformations of the initial features.

We are going to take this information into account to avoid redundancy in the dataset.

---

## Feature Importance

Inferring feature importance through correlation, XGBoost importance estimation, and RFE.

### Correlation

In [8]:
corr = display_corr_matrix(sort_alphabetically(df, 'Sale_Price'), label='Sale_Price')

<IPython.core.display.Javascript object>

In [9]:
top_corr_dict = top_correlated_features(corr, 'Sale_Price', 0.40)
top_corr_dict

{'Total_SF': 0.831830413595234,
 'Overall_Qual': 0.8117625341990397,
 'Bsmt_Eval': 0.77765820613712,
 'Gr_Liv_Area': 0.7257798387940613,
 'Exter_Qual': 0.7148047608934632,
 'Kitchen_Qual': 0.6887093084037993,
 'Garage_Area': 0.6681346194103982,
 'Garage_Eval': 0.6680762754078051,
 'Garage_Cars': 0.6669296938018384,
 'Total_Bsmt_SF': 0.6645126924806144,
 'First_Flr_SF': 0.6494304150165117,
 'Baths': 0.6330306213900416,
 'Fireplace_Eval': 0.6277197319771475,
 'Fireplace_Gr_Area_Ratio': 0.6025121788984277,
 'Year_Built': 0.5834995693412479,
 'Total_Bath': 0.5751719494861582,
 'Bedroom_Liv_Area_Ratio': 0.5496091971693476,
 'Gr_Area_Rms_Ratio': 0.5455744720388533,
 'External_Eval': 0.5412391075150473,
 'Full_Bath': 0.5405522072683838,
 'Fireplace_Qu': 0.5341356572440731,
 'Year_Remod_Add': 0.5338731340714287,
 'Mas_Vnr_Area': 0.5143581269042917,
 'TotRms_AbvGrd': 0.5102272169941826,
 'garage_type_1': 0.47413465459280585,
 'Fireplaces': 0.4706721579135879,
 'Mas_Vnr_3': -0.41422570644457873,

The correlation between these variables and the target feature is easy to interpret, as nearly every dataset shows nearly the same estimated correlations. However, since these pairwise estimates might be biased by the presence of many (possibly redundant) variables, we want to try another approach.

### XGBoost Regressor - Feature Importance

We can use the XGBoost regressor to infer the importance of the features, as boosted trees provide importance estimates based on metrics such as coverage or weight.

In [10]:
plot_xgb_importance(dataset = df, target = 'Sale_Price', ignore = 'Log1p_Sale_Price')

<IPython.core.display.Javascript object>

['Age',
 'Alley_Access',
 'BC_Bsmt_Unf_SF',
 'BC_External_SF',
 'Baths',
 'Bedroom_AbvGr',
 'Bedroom_Liv_Area_Ratio',
 'Bsmt',
 'Bsmt_Eval',
 'Bsmt_Full_Bath',
 'Bsmt_Half_Bath',
 'Bsmt_Total_Bath',
 'Bsmt_Unf_SF',
 'Central_Air',
 'Electrical_SBrkr',
 'Exter_Cond',
 'Exter_Qual',
 'External_Eval',
 'External_SF',
 'Fireplace_Eval',
 'Fireplace_Gr_Area_Ratio',
 'Fireplace_Qu',
 'Fireplaces',
 'First_Flr_SF',
 'Floating_Village_Residential',
 'Full_Bath',
 'Garage_Area',
 'Garage_Cars',
 'Garage_Cond',
 'Garage_Eval',
 'Garage_Qual',
 'Gr_Area_Rms_Ratio',
 'Gr_Liv_Area',
 'Half_Bath',
 'Kitchen_AbvGr',
 'Kitchen_Qual',
 'Latitude',
 'Longitude',
 'Lot_Area',
 'Mas_Vnr_1',
 'Mas_Vnr_3',
 'Mas_Vnr_Area',
 'Mo_Sold',
 'Overall_Cond',
 'Overall_Qual',
 'Paved_Drive',
 'Remodeled',
 'Residential_Low_Density',
 'Residential_Medium_Density',
 'Second_Flr_SF',
 'TotBath_LivArea_Ratio',
 'TotRms_AbvGrd',
 'Total_Bath',
 'Total_Bsmt_SF',
 'Total_SF',
 'Two_Flr',
 'Year_Built',
 'Year_Remod_Add',


In [11]:
plot_xgb_importance(dataset = df_orig, target = 'Sale_Price', ignore = 'Log1p_Sale_Price')

<IPython.core.display.Javascript object>

['A_agr',
 'Age',
 'AllPub',
 'Alley_Access',
 'Artery',
 'AsbShng',
 'AsphShn',
 'BC_Bsmt_Unf_SF',
 'BC_External_SF',
 'Baths',
 'Bedroom_AbvGr',
 'Bedroom_Liv_Area_Ratio',
 'Brk Cmn',
 'BrkComm',
 'BrkFace',
 'BrkTil',
 'Bsmt',
 'BsmtFin_SF_1',
 'BsmtFin_SF_2',
 'BsmtFin_Type_1',
 'BsmtFin_Type_2',
 'Bsmt_Cond',
 'Bsmt_Eval',
 'Bsmt_Exposure',
 'Bsmt_Full_Bath',
 'Bsmt_Half_Bath',
 'Bsmt_Qual',
 'Bsmt_Total_Bath',
 'Bsmt_Unf_SF',
 'CBlock',
 'COD',
 'CWD',
 'C_all',
 'CemntBd',
 'Central_Air',
 'ClyTile',
 'CmentBd',
 'CompShg',
 'Con',
 'ConLD',
 'ConLI',
 'ConLw',
 'Corner',
 'CulDSac',
 'Duplex_All_Styles_and_Ages',
 'Electrical_SBrkr',
 'Elev',
 'Enclosed_Porch',
 'Exter_Cond',
 'Exter_Qual',
 'External_Eval',
 'External_SF',
 'FR2',
 'FR3',
 'Feedr',
 'Fence',
 'Fireplace_Eval',
 'Fireplace_Gr_Area_Ratio',
 'Fireplace_Qu',
 'Fireplaces',
 'First_Flr_SF',
 'Flat',
 'Floating_Village_Residential',
 'Floor',
 'Full_Bath',
 'Functional',
 'Gable',
 'Gambrel',
 'Gar2',
 'Garage_Area'

As we predicted, most of the features we highlighted during the EDA are relevant!

## Recursive Feature Elimination

Since we have seen that we really want to use the sale price as the target, we prefer to ignore its logarithmic transformation, to avoid the variance it might cause.

In [12]:
# Base XGBoost estimator handed to the RFE runs
xgb = XGBRegressor(
    n_estimators=8,
    max_depth=7,
    max_leaves=11,
    learning_rate=0.5,
    importance_type='weight',
    random_state=292359329,
)
# Repeated K-fold scheme: 5 splits, repeated 3 times, with a fixed seed
cv_rkf = RepeatedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=2652124,
)

In [13]:
f1 = rfe_estim(df, target = 'Sale_Price', estimator = xgb, cv=cv_rkf, ignore = 'Log1p_Sale_Price')

The optimal number of features is 46
The selected features are:
['Age', 'BC_Bsmt_Unf_SF', 'BC_External_SF', 'Baths', 'Bedroom_AbvGr', 'Bedroom_Liv_Area_Ratio', 'Bsmt_Eval', 'Bsmt_Full_Bath', 'Bsmt_Total_Bath', 'Central_Air', 'Exter_Qual', 'External_Eval', 'Fireplace_Eval', 'Fireplace_Gr_Area_Ratio', 'Fireplace_Qu', 'Fireplaces', 'First_Flr_SF', 'Garage_Area', 'Garage_Cars', 'Garage_Eval', 'Gr_Area_Rms_Ratio', 'Gr_Liv_Area', 'Kitchen_Qual', 'Latitude', 'Longitude', 'Lot_Area', 'Mas_Vnr_Area', 'Mo_Sold', 'Overall_Cond', 'Overall_Qual', 'Paved_Drive', 'Residential_Medium_Density', 'Second_Flr_SF', 'TotBath_LivArea_Ratio', 'TotRms_AbvGrd', 'Total_Bsmt_SF', 'Total_SF', 'Year_Built', 'Year_Remod_Add', 'Year_Sold', 'bldg_type_1', 'garage_type_1', 'hs_style_1', 'neighborhoods_1', 'neighborhoods_3', 'sale_cond_1']


In [14]:
f2 = rfe_estim(df_orig, target = 'Sale_Price', estimator = xgb, cv=cv_rkf, ignore = 'Log1p_Sale_Price')

The optimal number of features is 81
The selected features are:
['Age', 'Alley_Access', 'AsbShng', 'BC_Bsmt_Unf_SF', 'BC_External_SF', 'Baths', 'Bedroom_AbvGr', 'Bedroom_Liv_Area_Ratio', 'Brk Cmn', 'BrkFace', 'BrkTil', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'BsmtFin_Type_1', 'Bsmt_Cond', 'Bsmt_Eval', 'Bsmt_Exposure', 'Bsmt_Qual', 'COD', 'C_all', 'Central_Air', 'CompShg', 'ConLw', 'Enclosed_Porch', 'Exter_Cond', 'External_Eval', 'FR2', 'Fence', 'Fireplace_Eval', 'Fireplace_Gr_Area_Ratio', 'Fireplace_Qu', 'Fireplaces', 'First_Flr_SF', 'Functional', 'Garage_Area', 'Garage_Cars', 'Garage_Cond', 'Garage_Eval', 'Garage_Finish', 'Gr_Area_Rms_Ratio', 'Gr_Liv_Area', 'Half_Bath', 'Heating_QC', 'Kitchen_Qual', 'Land_Contour', 'Latitude', 'Longitude', 'Lot_Area', 'Lot_Frontage', 'Mas_Vnr_Area', 'Mo_Sold', 'One_Story_1945_and_Older', 'Open_Porch_SF', 'Oth', 'Overall_Cond', 'Overall_Qual', 'PConc', 'Remodeled', 'Residential_Medium_Density', 'Screen_Porch', 'Second_Flr_SF', 'Shed', 'TotBath_LivArea_Ratio', 

In [15]:
# The features selected by the xgb combined:
list(set(f1) | set(f2))

['Oth',
 'Functional',
 'Open_Porch_SF',
 'Alley_Access',
 'neighborhoods_1',
 'BC_Bsmt_Unf_SF',
 'Bsmt_Eval',
 'Bsmt_Cond',
 'External_Eval',
 'Mas_Vnr_Area',
 'Lot_Area',
 'Baths',
 'Fireplaces',
 'Garage_Cars',
 'Total_SF',
 'Fireplace_Gr_Area_Ratio',
 'Second_Flr_SF',
 'Exter_Qual',
 'sale_cond_3',
 'Garage_Eval',
 'Paved_Drive',
 'sale_cond_1',
 'Central_Air',
 'Lot_Frontage',
 'Bsmt_Qual',
 'Two_Family_conversion_All_Styles_and_Ages',
 'AsbShng',
 'Half_Bath',
 'Remodeled',
 'BrkFace',
 'Year_Remod_Add',
 'First_Flr_SF',
 'Total_Bath',
 'Gr_Liv_Area',
 'One_Story_1945_and_Older',
 'Brk Cmn',
 'BsmtFin_SF_2',
 'Heating_QC',
 'WD ',
 'Fireplace_Eval',
 'Latitude',
 'Year_Sold',
 'ConLw',
 'FR2',
 'Screen_Porch',
 'Garage_Area',
 'bldg_type_1',
 'Mo_Sold',
 'TotRms_AbvGrd',
 'BsmtFin_Type_1',
 'Overall_Cond',
 'BC_External_SF',
 'Longitude',
 'Exter_Cond',
 'Shed',
 'CompShg',
 'Bsmt_Full_Bath',
 'Residential_Medium_Density',
 'Age',
 'hs_style_1',
 'Year_Built',
 'BsmtFin_SF_1',
 '

As we can see the RFE on the original datasets almost always leads to a higher number of features. Now, for the sake of choosing a good subset of features, we can try with a naive random forest approach:

In [16]:
# Small random forest (10 trees, depth capped at 7) used as the second RFE estimator
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=7,
    bootstrap=True,
    n_jobs=-1,
    random_state=2546243645,
    verbose=0,
)

In [17]:
f3 = rfe_estim(df, target = 'Sale_Price', estimator = rf, cv=cv_rkf, ignore = 'Log1p_Sale_Price')

The optimal number of features is 53
The selected features are:
['Age', 'BC_Bsmt_Unf_SF', 'BC_External_SF', 'Baths', 'Bedroom_AbvGr', 'Bedroom_Liv_Area_Ratio', 'Bsmt_Eval', 'Bsmt_Full_Bath', 'Bsmt_Total_Bath', 'Bsmt_Unf_SF', 'Central_Air', 'Exter_Cond', 'Exter_Qual', 'External_Eval', 'External_SF', 'Fireplace_Eval', 'Fireplace_Gr_Area_Ratio', 'Fireplaces', 'First_Flr_SF', 'Garage_Area', 'Garage_Cars', 'Garage_Eval', 'Gr_Area_Rms_Ratio', 'Gr_Liv_Area', 'Half_Bath', 'Kitchen_Qual', 'Latitude', 'Longitude', 'Lot_Area', 'Mas_Vnr_1', 'Mas_Vnr_Area', 'Mo_Sold', 'Overall_Cond', 'Overall_Qual', 'Paved_Drive', 'Residential_Low_Density', 'Residential_Medium_Density', 'Second_Flr_SF', 'TotBath_LivArea_Ratio', 'TotRms_AbvGrd', 'Total_Bath', 'Total_Bsmt_SF', 'Total_SF', 'Year_Built', 'Year_Remod_Add', 'Year_Sold', 'bldg_type_1', 'bldg_type_3', 'garage_type_1', 'neighborhoods_1', 'neighborhoods_3', 'neighborhoods_4', 'sale_cond_1']


In [18]:
f4 = rfe_estim(df_orig, target = 'Sale_Price', estimator = rf, cv=cv_rkf, ignore = 'Log1p_Sale_Price')

The optimal number of features is 134
The selected features are:
['Age', 'AllPub', 'Alley_Access', 'Artery', 'AsbShng', 'AsphShn', 'BC_Bsmt_Unf_SF', 'BC_External_SF', 'Baths', 'Bedroom_AbvGr', 'Bedroom_Liv_Area_Ratio', 'Brk Cmn', 'BrkComm', 'BrkFace', 'BrkTil', 'Bsmt', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Bsmt_Cond', 'Bsmt_Eval', 'Bsmt_Exposure', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Bsmt_Qual', 'Bsmt_Total_Bath', 'Bsmt_Unf_SF', 'CBlock', 'COD', 'CWD', 'C_all', 'CemntBd', 'Central_Air', 'CompShg', 'ConLw', 'Corner', 'Duplex_All_Styles_and_Ages', 'Enclosed_Porch', 'Exter_Cond', 'Exter_Qual', 'External_Eval', 'External_SF', 'Fireplace_Eval', 'Fireplace_Gr_Area_Ratio', 'Fireplace_Qu', 'Fireplaces', 'First_Flr_SF', 'Floating_Village_Residential', 'Floor', 'Full_Bath', 'Functional', 'Gable', 'Garage_Area', 'Garage_Cars', 'Garage_Cond', 'Garage_Eval', 'Garage_Finish', 'Garage_Qual', 'GasW', 'Gr_Area_Rms_Ratio', 'Gr_Liv_Area', 'Half_Bath', 'HdBoard', 'Heating_QC

In [19]:
list(set(f3) & set(f4))

['neighborhoods_4',
 'Mo_Sold',
 'TotRms_AbvGrd',
 'Overall_Cond',
 'BC_Bsmt_Unf_SF',
 'Bsmt_Eval',
 'neighborhoods_1',
 'External_Eval',
 'Mas_Vnr_Area',
 'BC_External_SF',
 'Lot_Area',
 'Baths',
 'Longitude',
 'Fireplaces',
 'Exter_Cond',
 'External_SF',
 'Garage_Cars',
 'Total_SF',
 'Bsmt_Full_Bath',
 'Fireplace_Gr_Area_Ratio',
 'Residential_Medium_Density',
 'Age',
 'Second_Flr_SF',
 'Exter_Qual',
 'Garage_Eval',
 'Paved_Drive',
 'Year_Built',
 'Central_Air',
 'Mas_Vnr_1',
 'sale_cond_1',
 'Bsmt_Unf_SF',
 'neighborhoods_3',
 'Half_Bath',
 'Total_Bsmt_SF',
 'Bedroom_AbvGr',
 'Kitchen_Qual',
 'Overall_Qual',
 'TotBath_LivArea_Ratio',
 'Year_Remod_Add',
 'First_Flr_SF',
 'Total_Bath',
 'Gr_Liv_Area',
 'Residential_Low_Density',
 'Bedroom_Liv_Area_Ratio',
 'Fireplace_Eval',
 'Bsmt_Total_Bath',
 'Latitude',
 'Year_Sold',
 'garage_type_1',
 'Gr_Area_Rms_Ratio',
 'Garage_Area',
 'bldg_type_1']

As we predicted the OHE has affected the base estimator.

In [20]:
# NOTE(review): both operands are the same expression, so this evaluates to the
# same set as set(f3) & set(f4). f1/f2 (the selections obtained with the xgb
# estimator) were possibly meant to appear in one of the operands - TODO confirm.
list((set(f3) & set(f4)) & (set(f3) & set(f4)))

['neighborhoods_4',
 'Mo_Sold',
 'TotRms_AbvGrd',
 'Overall_Cond',
 'BC_Bsmt_Unf_SF',
 'Bsmt_Eval',
 'neighborhoods_1',
 'External_Eval',
 'Mas_Vnr_Area',
 'BC_External_SF',
 'Lot_Area',
 'Baths',
 'Longitude',
 'Fireplaces',
 'Exter_Cond',
 'External_SF',
 'Garage_Cars',
 'Total_SF',
 'Bsmt_Full_Bath',
 'Fireplace_Gr_Area_Ratio',
 'Residential_Medium_Density',
 'Age',
 'Second_Flr_SF',
 'Exter_Qual',
 'Garage_Eval',
 'Paved_Drive',
 'Year_Built',
 'Central_Air',
 'Mas_Vnr_1',
 'sale_cond_1',
 'Bsmt_Unf_SF',
 'neighborhoods_3',
 'Half_Bath',
 'Total_Bsmt_SF',
 'Bedroom_AbvGr',
 'Kitchen_Qual',
 'Overall_Qual',
 'TotBath_LivArea_Ratio',
 'Year_Remod_Add',
 'First_Flr_SF',
 'Total_Bath',
 'Gr_Liv_Area',
 'Residential_Low_Density',
 'Bedroom_Liv_Area_Ratio',
 'Fireplace_Eval',
 'Bsmt_Total_Bath',
 'Latitude',
 'Year_Sold',
 'garage_type_1',
 'Gr_Area_Rms_Ratio',
 'Garage_Area',
 'bldg_type_1']

---

## Feature Selection

Now we shall find a compromise between all the features in order to save just the important ones for the boosted random forest.
For what concerns the GCN, we are going to add some more features since some of them are just the encoding of previous categorical features.

From all the analysis done until now, I personally believe this is the best subset to use, but we need to avoid redundancy.

In [21]:
# First candidate subset; the target 'Sale_Price' is kept as the last entry
subset_1 = [
    'BC_Bsmt_Unf_SF',
    'Bedroom_AbvGr',
    'Bsmt_Full_Bath',
    'Bsmt_Unf_SF',
    'Central_Air',
    'External_Eval',
    'Fireplace_Gr_Area_Ratio',
    'Fireplace_Qu',
    'Garage_Area',
    'Garage_Qual',
    'Garage_Cars',
    'Gr_Liv_Area',
    'Kitchen_Qual',
    'Latitude',
    'Longitude',
    'Lot_Area',
    'Mas_Vnr_1',
    'Mas_Vnr_Area',
    'Mo_Sold',
    'Overall_Qual',
    'Paved_Drive',
    'Residential_Low_Density',
    'Residential_Medium_Density',
    'TotRms_AbvGrd',
    'Total_Bath',
    'Total_Bsmt_SF',
    'Year_Built',
    'Year_Remod_Add',
    'Year_Sold',
    'bldg_type_1',
    'bldg_type_3',
    'garage_type_1',
    'hs_style_1',
    'neighborhoods_1',
    'neighborhoods_2',
    'neighborhoods_3',
    'neighborhoods_4',
    'sale_cond_1',
    'Sale_Price',
]

# Second candidate subset; the target 'Sale_Price' is kept as the last entry
subset_2 = [
    'Age',
    'BC_Bsmt_Unf_SF',
    'BC_External_SF',
    'Baths',
    'Bedroom_Liv_Area_Ratio',
    'Bsmt_Eval',
    'Bsmt_Unf_SF',
    'Central_Air',
    'Exter_Qual',
    'Fireplace_Eval',
    'Fireplaces',
    'Garage_Eval',
    'Gr_Area_Rms_Ratio',
    'Kitchen_Qual',
    'Latitude',
    'Longitude',
    'Lot_Area',
    'Mas_Vnr_1',
    'Mas_Vnr_Area',
    'Mo_Sold',
    'Overall_Qual',
    'Paved_Drive',
    'Residential_Low_Density',
    'Residential_Medium_Density',
    'TotBath_LivArea_Ratio',
    'TotRms_AbvGrd',
    'Total_SF',
    'Year_Sold',
    'bldg_type_1',
    'bldg_type_3',
    'garage_type_1',
    'hs_style_1',
    'neighborhoods_1',
    'neighborhoods_3',
    'neighborhoods_4',
    'sale_cond_1',
    'Sale_Price',
]

I tried to make the two subsets use different combinations of the same features.

### One last time - Correlation

In [26]:
display_corr_matrix(sort_alphabetically(df[subset_1], 'Sale_Price'), label='Sale_Price')

<IPython.core.display.Javascript object>

Unnamed: 0,BC_Bsmt_Unf_SF,Bedroom_AbvGr,Bsmt_Full_Bath,Bsmt_Unf_SF,Central_Air,External_Eval,Fireplace_Gr_Area_Ratio,Fireplace_Qu,Garage_Area,Garage_Cars,...,bldg_type_1,bldg_type_3,garage_type_1,hs_style_1,neighborhoods_1,neighborhoods_2,neighborhoods_3,neighborhoods_4,sale_cond_1,Sale_Price
BC_Bsmt_Unf_SF,1.0,0.134923,-0.209821,0.654828,-0.108313,0.069221,0.127849,0.131604,0.113086,0.136743,...,0.145066,-0.175838,0.072912,0.149648,-0.157497,-0.053088,0.103629,0.018686,-0.073918,0.178355
Bedroom_AbvGr,0.134923,1.0,-0.159611,0.177789,-0.034916,0.046173,0.185904,0.063816,0.088519,0.103566,...,0.177683,0.066291,0.030365,-0.039124,0.034117,-0.054505,0.006217,-0.031099,0.02293,0.134824
Bsmt_Full_Bath,-0.209821,-0.159611,1.0,-0.405607,-0.088172,0.192532,0.080138,0.102543,0.17139,0.147713,...,-0.078911,0.005859,0.168383,0.036237,-0.101925,-0.037325,-0.049978,0.0752,-0.01674,0.273858
Bsmt_Unf_SF,0.654828,0.177789,-0.405607,1.0,-0.035141,0.063412,0.121253,0.111745,0.18551,0.201965,...,0.05379,-0.048083,0.077461,0.136558,-0.164546,-0.108754,0.110014,0.002705,-0.216396,0.200255
Central_Air,-0.108313,-0.034916,-0.088172,-0.035141,1.0,-0.126488,-0.16124,-0.1584,-0.254607,-0.252972,...,-0.076708,0.071064,-0.312253,-0.077837,0.230894,-0.037359,-0.116847,-0.093752,0.040201,-0.251287
External_Eval,0.069221,0.046173,0.192532,0.063412,-0.126488,1.0,0.396041,0.344991,0.34773,0.323742,...,0.100092,-0.159699,0.28577,0.108416,-0.266403,-0.075191,0.056326,0.074607,-0.10637,0.541239
Fireplace_Gr_Area_Ratio,0.127849,0.185904,0.080138,0.121253,-0.16124,0.396041,1.0,0.889098,0.396236,0.438463,...,0.132036,-0.175828,0.378247,0.08935,-0.354504,-0.096637,0.191547,0.005209,-0.101923,0.602512
Fireplace_Qu,0.131604,0.063816,0.102543,0.111745,-0.1584,0.344991,0.889098,1.0,0.331981,0.37439,...,0.104056,-0.176736,0.347449,0.077889,-0.315912,-0.110057,0.173276,0.02465,-0.113538,0.534136
Garage_Area,0.113086,0.088519,0.17139,0.18551,-0.254607,0.34773,0.396236,0.331981,1.0,0.890329,...,0.092516,-0.09378,0.389098,0.197814,-0.4118,-0.158335,0.139528,0.131553,-0.233858,0.668135
Garage_Cars,0.136743,0.103566,0.147713,0.201965,-0.252972,0.323742,0.438463,0.37439,0.890329,1.0,...,0.041373,-0.064942,0.41817,0.214302,-0.46487,-0.170317,0.216577,0.132213,-0.237169,0.66693


In [27]:
display_corr_matrix(sort_alphabetically(df[subset_2], 'Sale_Price'), label='Sale_Price')

<IPython.core.display.Javascript object>

Unnamed: 0,Age,BC_Bsmt_Unf_SF,BC_External_SF,Baths,Bedroom_Liv_Area_Ratio,Bsmt_Eval,Bsmt_Unf_SF,Central_Air,Exter_Qual,Fireplace_Eval,...,Year_Sold,bldg_type_1,bldg_type_3,garage_type_1,hs_style_1,neighborhoods_1,neighborhoods_3,neighborhoods_4,sale_cond_1,Sale_Price
Age,1.0,-0.086771,-0.1446,-0.545726,-0.29737,-0.536884,-0.153225,0.359021,-0.611551,-0.307112,...,0.058521,0.125249,-0.031538,-0.589224,-0.357392,0.652524,-0.344409,-0.21891,0.308439,-0.583683
BC_Bsmt_Unf_SF,-0.086771,1.0,0.079418,-0.002123,0.024367,0.27825,0.654828,-0.108313,0.202411,0.14291,...,-0.017541,0.145066,-0.175838,0.072912,0.149648,-0.157497,0.103629,0.018686,-0.073918,0.178355
BC_External_SF,-0.1446,0.079418,1.0,0.302741,0.26806,0.315711,0.041464,-0.099373,0.268662,0.337574,...,0.000509,0.115915,-0.207157,0.255324,0.065961,-0.209164,0.083835,0.066174,-0.044475,0.385545
Baths,-0.545726,-0.002123,0.302741,1.0,0.335688,0.478609,-0.102012,-0.18046,0.463121,0.410843,...,0.02892,-0.107434,0.075532,0.409251,0.175529,-0.448736,0.237305,0.118419,-0.176361,0.633031
Bedroom_Liv_Area_Ratio,-0.29737,0.024367,0.26806,0.335688,1.0,0.463226,0.035949,-0.078621,0.459954,0.39215,...,-0.003901,-0.169049,-0.031959,0.270189,0.103359,-0.334792,0.1266,0.050055,-0.180354,0.549609
Bsmt_Eval,-0.536884,0.27825,0.315711,0.478609,0.463226,1.0,0.360213,-0.226752,0.594268,0.409352,...,-0.012301,0.020873,-0.109525,0.421244,0.226348,-0.436703,0.070929,0.161674,-0.272417,0.777658
Bsmt_Unf_SF,-0.153225,0.654828,0.041464,-0.102012,0.035949,0.360213,1.0,-0.035141,0.261402,0.14867,...,-0.039934,0.05379,-0.048083,0.077461,0.136558,-0.164546,0.110014,0.002705,-0.216396,0.200255
Central_Air,0.359021,-0.108313,-0.099373,-0.18046,-0.078621,-0.226752,-0.035141,1.0,-0.212487,-0.155663,...,0.004599,-0.076708,0.071064,-0.312253,-0.077837,0.230894,-0.116847,-0.093752,0.040201,-0.251287
Exter_Qual,-0.611551,0.202411,0.268662,0.463121,0.459954,0.594268,0.261402,-0.212487,1.0,0.406381,...,-0.005189,-0.03729,-0.112747,0.392884,0.262922,-0.510862,0.184947,0.132228,-0.335487,0.714805
Fireplace_Eval,-0.307112,0.14291,0.337574,0.410843,0.39215,0.409352,0.14867,-0.155663,0.406381,1.0,...,-0.043512,0.131287,-0.176797,0.369507,0.094621,-0.361404,0.183641,0.004041,-0.133048,0.62772


In [28]:
display_corr_matrix(sort_alphabetically(df_orig[subset_1], 'Sale_Price'), label='Sale_Price')

<IPython.core.display.Javascript object>

Unnamed: 0,BC_Bsmt_Unf_SF,Bedroom_AbvGr,Bsmt_Full_Bath,Bsmt_Unf_SF,Central_Air,External_Eval,Fireplace_Gr_Area_Ratio,Fireplace_Qu,Garage_Area,Garage_Cars,...,bldg_type_1,bldg_type_3,garage_type_1,hs_style_1,neighborhoods_1,neighborhoods_2,neighborhoods_3,neighborhoods_4,sale_cond_1,Sale_Price
BC_Bsmt_Unf_SF,1.0,0.141282,-0.209373,0.656611,-0.094314,0.058795,0.121848,0.122529,0.097501,0.119759,...,0.143899,-0.162531,0.057262,0.13492,-0.136714,-0.053477,0.096655,0.005947,-0.054631,0.162569
Bedroom_AbvGr,0.141282,1.0,-0.161084,0.188508,-0.032083,0.050366,0.188608,0.070137,0.073417,0.091135,...,0.143481,0.079306,0.018036,-0.042486,0.026957,-0.05307,0.003486,-0.028476,0.000709,0.143913
Bsmt_Full_Bath,-0.209373,-0.161084,1.0,-0.398629,-0.113226,0.20346,0.091116,0.119347,0.184903,0.162644,...,-0.086659,0.018858,0.175432,0.043031,-0.109659,-0.042999,-0.045609,0.085561,-0.00195,0.275823
Bsmt_Unf_SF,0.656611,0.188508,-0.398629,1.0,-0.026795,0.054194,0.12069,0.108233,0.164837,0.18008,...,0.052899,-0.040101,0.061908,0.119816,-0.148801,-0.107364,0.104836,-0.003453,-0.150107,0.183308
Central_Air,-0.094314,-0.032083,-0.113226,-0.026795,1.0,-0.126144,-0.171926,-0.172574,-0.268515,-0.274966,...,-0.069512,0.058003,-0.317708,-0.077234,0.245098,-0.04096,-0.122906,-0.104111,-0.030158,-0.264506
External_Eval,0.058795,0.050366,0.20346,0.054194,-0.126144,1.0,0.411627,0.347101,0.373576,0.338247,...,0.105756,-0.155519,0.269323,0.102399,-0.245967,-0.080263,0.046443,0.070103,-0.057294,0.541684
Fireplace_Gr_Area_Ratio,0.121848,0.188608,0.091116,0.12069,-0.171926,0.411627,1.0,0.88499,0.404602,0.445729,...,0.137247,-0.175005,0.375703,0.079115,-0.356267,-0.099945,0.186927,0.017978,-0.026016,0.601858
Fireplace_Qu,0.122529,0.070137,0.119347,0.108233,-0.172574,0.347101,0.88499,1.0,0.342602,0.382874,...,0.110664,-0.175791,0.350841,0.066954,-0.322641,-0.113375,0.172511,0.04238,-0.03262,0.533901
Garage_Area,0.097501,0.073417,0.184903,0.164837,-0.268515,0.373576,0.404602,0.342602,1.0,0.889866,...,0.092979,-0.097255,0.369489,0.186447,-0.394706,-0.154219,0.1326,0.138137,-0.114515,0.640138
Garage_Cars,0.119759,0.091135,0.162644,0.18008,-0.274966,0.338247,0.445729,0.382874,0.889866,1.0,...,0.04386,-0.069381,0.407712,0.206895,-0.457488,-0.166025,0.21525,0.140067,-0.097494,0.647562


In [29]:
display_corr_matrix(sort_alphabetically(df_orig[subset_2], 'Sale_Price'), label='Sale_Price')

<IPython.core.display.Javascript object>

Unnamed: 0,Age,BC_Bsmt_Unf_SF,BC_External_SF,Baths,Bedroom_Liv_Area_Ratio,Bsmt_Eval,Bsmt_Unf_SF,Central_Air,Exter_Qual,Fireplace_Eval,...,Year_Sold,bldg_type_1,bldg_type_3,garage_type_1,hs_style_1,neighborhoods_1,neighborhoods_3,neighborhoods_4,sale_cond_1,Sale_Price
Age,1.0,-0.073015,-0.140445,-0.536645,-0.276171,-0.5175,-0.13071,0.371259,-0.604606,-0.302604,...,0.056653,0.124583,-0.036578,-0.59129,-0.357988,0.646797,-0.340742,-0.222511,0.142508,-0.55891
BC_Bsmt_Unf_SF,-0.073015,1.0,0.068291,-0.01075,0.04067,0.257757,0.656611,-0.094314,0.194429,0.134858,...,-0.010492,0.143899,-0.162531,0.057262,0.13492,-0.136714,0.096655,0.005947,-0.054631,0.162569
BC_External_SF,-0.140445,0.068291,1.0,0.31378,0.274534,0.322493,0.034239,-0.101956,0.275057,0.343987,...,-0.008568,0.122145,-0.2068,0.247151,0.058236,-0.20243,0.08535,0.065263,-0.010134,0.390039
Baths,-0.536645,-0.01075,0.31378,1.0,0.348439,0.491904,-0.097314,-0.213567,0.471762,0.429533,...,0.02338,-0.119155,0.088892,0.406584,0.165438,-0.445783,0.235687,0.123179,-0.075232,0.636175
Bedroom_Liv_Area_Ratio,-0.276171,0.04067,0.274534,0.348439,1.0,0.482323,0.043578,-0.087163,0.454429,0.40741,...,0.000526,-0.134573,-0.044272,0.259575,0.09499,-0.311361,0.110952,0.057968,-0.111472,0.52294
Bsmt_Eval,-0.5175,0.257757,0.322493,0.491904,0.482323,1.0,0.331419,-0.226163,0.581395,0.417893,...,-0.019188,0.019089,-0.092617,0.407795,0.216938,-0.407875,0.070333,0.151872,-0.16939,0.732459
Bsmt_Unf_SF,-0.13071,0.656611,0.034239,-0.097314,0.043578,0.331419,1.0,-0.026795,0.247466,0.146175,...,-0.036443,0.052899,-0.040101,0.061908,0.119816,-0.148801,0.104836,-0.003453,-0.150107,0.183308
Central_Air,0.371259,-0.094314,-0.101956,-0.213567,-0.087163,-0.226163,-0.026795,1.0,-0.219271,-0.165186,...,0.003189,-0.069512,0.058003,-0.317708,-0.077234,0.245098,-0.122906,-0.104111,-0.030158,-0.264506
Exter_Qual,-0.604606,0.194429,0.275057,0.471762,0.454429,0.581395,0.247466,-0.219271,1.0,0.421185,...,-0.001894,-0.030665,-0.108806,0.389864,0.258447,-0.497885,0.177615,0.130393,-0.171135,0.69797
Fireplace_Eval,-0.302604,0.134858,0.343987,0.429533,0.40741,0.417893,0.146175,-0.165186,0.421185,1.0,...,-0.040997,0.135716,-0.175067,0.366132,0.082012,-0.360492,0.176445,0.017757,-0.048646,0.624509


The results obtained are very satisfying!

---

### Save the dataset

I decided to keep only three datasets to test my hypothesis.

In [30]:
# Write the three selected column subsets to the resources folder
for _frame, _columns, _filename in (
    (df, subset_1, "ames_housing_out_21.csv"),
    (df, subset_2, "ames_housing_out_22.csv"),
    (df_orig, subset_2, "ames_housing_out_22_orig.csv"),
):
    _frame[_columns].to_csv(os.path.join(RESOURCES_DIR, _filename))