In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns # heatmaps yay

from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def drop_high_missing_features(df, threshold=0.15):
    """Drop every column whose fraction of null values exceeds ``threshold``.

    The frame is modified in place and the very same object is returned, so
    ``df = drop_high_missing_features(df)`` and a bare call both work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune.
    threshold : float, default 0.15
        Largest tolerated share of null rows per column (0-1). Only columns
        strictly above this share are removed.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, without the dropped columns.
    """
    # The mean of the boolean null mask is the per-column null fraction. For a
    # frame with no rows it is NaN, which never compares greater than the
    # threshold, so nothing is dropped.
    null_fraction = df.isnull().mean()
    sparse_cols = null_fraction[null_fraction > threshold].index
    df.drop(sparse_cols, axis=1, inplace=True)
    return df

def process_df(df, ohe_list):
    """Apply the notebook's feature clean-up steps to ``df`` and return it.

    Steps, in order: encode ``CentralAir`` ('Y'/'N') as integers 1/0, drop
    the columns with too many nulls via ``drop_high_missing_features``, remove
    ``TotRmsAbvGrd`` and ``GarageCars``, and log-transform ``SalePrice`` when
    that column is present.

    The frame passed in is modified. ``ohe_list`` is accepted but not used by
    this function.
    """
    central_air_codes = {'Y': 1, 'N': 0}
    df['CentralAir'] = df['CentralAir'].map(central_air_codes).astype(int)

    df = drop_high_missing_features(df)

    # TotRmsAbvGrd carries the same type of information as GrLivArea, and
    # GarageCars the same as GarageArea, so one of each pair is removed.
    for redundant_col in ('TotRmsAbvGrd', 'GarageCars'):
        del df[redundant_col]

    # Frames without the target column are returned without a log transform.
    if 'SalePrice' not in df.columns.values:
        return df

    df['SalePrice'] = np.log(df['SalePrice'])
    return df

#### Import the train csv file to take a look at the data

In [None]:
# Read the training data; the path is relative to the current working directory.
df = pd.read_csv(
    os.path.join('data', 'train.csv'),
)

In [None]:
# Raise pandas' display limits to 1000 columns / 1000 rows, then preview the first rows.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
df.head()

How many records are we dealing with?

In [None]:
# Number of rows in the frame.
df.shape[0]

#### Determine which features are important

How much of each feature is null?

In [None]:
# Per-column null counts. isnull() already returns a new frame, so the source
# frame does not need to be copied first.
df_nulls = df.isnull().sum().to_frame(name='num_nulls')
df_nulls['total_records'] = len(df.index)
df_nulls['pct_null'] = df_nulls['num_nulls'] / df_nulls['total_records'] * 100
# Show only the columns that have at least one null, highest percentage first.
df_nulls[df_nulls['num_nulls'] > 0].sort_values(by='pct_null', ascending=False)

There are several features with a high null rate. If we use 15% as the cutoff above which a feature is not worth keeping, we can simply drop PoolQC, MiscFeature, Alley, Fence, FireplaceQu and LotFrontage. This is taken care of by the `drop_high_missing_features` function written above.

In [None]:
# Prune the sparse columns; the function edits the frame in place and returns it,
# so rebinding `df` keeps the name pointing at the same object.
df = drop_high_missing_features(df)

Another interesting note is that the GarageX features all have the same number of null values. This is likely because the nulls occur in the same records. We can verify this by counting how many records have all of those features set to null.

In [None]:
# Count the rows in which every one of the five garage features is null.
int(
    df[['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']]
    .isnull()
    .all(axis=1)
    .sum()
)

It's confirmed: the 81 null values of each GarageX feature all fall in the same 81 records.



Next let's take a look at the correlations between each feature.

In [None]:
# Correlate only numeric/boolean columns. The frame still contains string-valued
# columns (e.g. CentralAir's 'Y'/'N'), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
f, ax = plt.subplots(figsize=(12,9))
sns.heatmap(
    df.select_dtypes(include=['number', 'bool']).corr(),
    vmax=0.8,
    square=True,
    ax=ax,
)
ax.set_title('Pairwise feature correlations');

Looking at the correlation heat map, we can quickly identify features which are highly related to other features. For example, 1stFlrSF and TotalBsmtSF are highly correlated, which makes sense because, generally speaking, most basements are full basements (the basement is wearing the first floor like a hat). The same goes for 1stFlrSF and 2ndFlrSF (this house is a mad hatter).

Another instance is GarageCars and GarageArea. This makes sense because you need more space to store more cars. The same rationale applies to TotRmsAbvGrd (total rooms above ground) and GrLivArea (above-ground living area).

YearBuilt and GarageYrBlt also appear to be highly correlated. This makes sense because typically the house and the garage are built at the same time.