In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns # heatmaps yay

from datetime import datetime

from sklearn.preprocessing import StandardScaler
from scipy.stats import boxcox

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def drop_high_missing_features(df, threshold=0.15):
    """Drop, in place, every column whose share of null values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is mutated in place; nothing is returned.
    threshold : float, default 0.15
        Maximum tolerated fraction of nulls (0-1). Columns strictly above
        this fraction are removed.
    """
    # The mean of the boolean null mask is the per-column fraction of nulls.
    null_fraction = df.isnull().mean()
    too_sparse = null_fraction[null_fraction > threshold].index
    df.drop(columns=too_sparse, inplace=True)

def process_df(df):
    """Clean and transform a housing DataFrame in place.

    Steps, in order:

    1. Encode ``CentralAir`` ('Y'/'N') as 1/0.
    2. Drop columns with a high null rate (``drop_high_missing_features``).
    3. Drop columns considered redundant with ones that are kept.
    4. Drop rows: records with a null ``Electrical`` and, when ``SalePrice``
       is present (training data), the large-area / low-price outliers.
    5. Box-Cox transform the area features.
    6. Log-transform ``SalePrice`` when present.

    The frame is mutated in place (columns *and* rows) and nothing is
    returned, so callers that invoke ``process_df(df)`` without reassigning
    see every change.
    """
    df['CentralAir'] = df['CentralAir'].map({'Y': 1, 'N': 0}).astype(int)

    drop_high_missing_features(df)

    # Drop TotRmsAbvGrd - same type of information as GrLivArea
    del df['TotRmsAbvGrd']

    # Same for GarageCars (GarageArea) and the remaining garage / basement /
    # masonry detail columns; each is removed only if it is still present.
    redundant_cols = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
                      'GarageQual', 'GarageCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtUnfSF',
                      'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'BsmtFinSF1', 'BsmtFinSF2',
                      'MasVnrArea', 'MasVnrType']
    df.drop(columns=[c for c in redundant_cols if c in df.columns], inplace=True)

    # Row filters are evaluated on the RAW values, before any transform:
    # the 4000 / 200000 thresholds are meaningless once GrLivArea has been
    # Box-Cox transformed and SalePrice has been logged.
    rows_to_drop = df['Electrical'].isna()
    # Sale Price is in our training data, but not testing data
    if 'SalePrice' in df.columns:
        rows_to_drop |= (df['GrLivArea'] > 4000) & (df['SalePrice'] < 200000)
    # Drop by index label with inplace=True so the caller's frame is changed;
    # rebinding the local name `df` would silently discard the filtering.
    df.drop(index=df.index[rows_to_drop.values], inplace=True)

    log_list = ['LotArea', '1stFlrSF', '2ndFlrSF', 'GrLivArea']
    square_root_list = ['TotalBsmtSF', 'GarageArea']
    bct_list = log_list + square_root_list

    # scipy's boxcox rejects non-positive input, so shift by the same small
    # constant the exploratory Box-Cox cell of this notebook uses.
    boxcox_shift = 0.000001
    for f in bct_list:
        df[f], _ = boxcox(df[f] + boxcox_shift)

    if 'SalePrice' in df.columns:
        df['SalePrice'] = np.log(df['SalePrice'])

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title = 'Confusion Matrix',
                          cmap = plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated image.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows are indexed as true labels, columns as
        predicted labels, matching the axis labels set below).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, divide each row by its sum before printing and plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized Confusion Matrix')
    else:
        print('Confusion Matrix Without Normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum value get white text, the rest black,
    # so the annotation contrasts with the cell colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_text_index(df, name):
    """Label-encode column ``name`` of ``df`` in place.

    The column is overwritten with the output of ``LabelEncoder.fit_transform``
    and the fitted encoder's ``classes_`` attribute is returned.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[name])
    df[name] = encoded_values
    return encoder.classes_

In [None]:
from sklearn.metrics import roc_curve, auc, confusion_matrix

def show_results(clf, X, y):
    """Print accuracy and plot a normalized confusion matrix and ROC curve.

    Parameters
    ----------
    clf : fitted classifier
        Must expose ``score``, ``predict`` and ``predict_proba``.
    X : array-like
        Features passed to the classifier.
    y : pandas.DataFrame
        True labels. ``y.columns[0]`` is read, so a single-column DataFrame
        is expected rather than a Series or array.

    Notes
    -----
    The ROC curve uses column 1 of ``predict_proba`` — presumably the
    positive class of a binary problem; confirm against the caller.
    """
    acc_score = clf.score(X, y)
    print('Accuracy: {}'.format(acc_score*100))

    y_pred = clf.predict(X)
    cm_matrix = confusion_matrix(y, y_pred)
    np.set_printoptions(precision=2)

    # Class names used as tick labels on the confusion matrix.
    Class = encode_text_index(pd.DataFrame(y), y.columns[0])

    plt.figure()
    plot_confusion_matrix(cm_matrix, classes=Class, normalize=True)
    plt.show()

    y_proba = clf.predict_proba(X)
    fpr, tpr, _ = roc_curve(y, y_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.figure()
    lw = 2

    # '{:0.2f}' is the str.format spelling of the printf-style '%0.2f';
    # mixing the two ('{:%0.2f}') is an invalid format specifier.
    plt.plot(fpr, tpr, color='red', lw=lw, label='ROC Curve (area = {:0.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right', frameon=False)

    plt.show()
    

#### Import the train csv file to take a look at the data

In [None]:
# Read the training set from data/train.csv, relative to the notebook's working directory.
train_csv_path = os.path.join('data', 'train.csv')
df = pd.read_csv(train_csv_path)

In [None]:
# Raise pandas' display limits to 1000 columns / 1000 rows before previewing the frame.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
df.head()

How many records are we dealing with?

In [None]:
# Number of rows (records) in the frame.
df.shape[0]

#### Determine which features are important

How much of each feature is null?

In [None]:
# Per-column null summary: absolute count, total record count and percentage.
null_counts = df.isnull().sum()
df_nulls = null_counts.to_frame(name='num_nulls')
df_nulls['total_records'] = len(df.index)
df_nulls['pct_null'] = df_nulls['num_nulls'] / df_nulls['total_records'] * 100

# Show only the columns that have at least one null, worst first.
has_nulls = df_nulls['num_nulls'] > 0
df_nulls.loc[has_nulls].sort_values(by='pct_null', ascending=False)

In [None]:
# Heatmap of the boolean null mask (rows are records, columns are features).
fig, ax = plt.subplots(figsize=(10, 10))
null_mask = df.isnull()
sns.heatmap(data=null_mask, ax=ax, cbar=False)

There are several features which have a high null rate. If we use 15% as a cutoff to say we don't want to use these features we can just drop PoolQC, MiscFeature, Alley, Fence, FireplaceQu and LotFrontage. This is taken care of in the `drop_high_missing_features` function written above.

In [None]:
# Mutates `df` in place: columns whose null rate exceeds the function's cutoff are deleted.
drop_high_missing_features(df)

Let's also drop the one record where Electrical is null. This won't adversely affect the overall model since it's a single record. This will be done in the `process_df` function.

Another interesting note is that the GarageX type features all have the same amount of null values. This is likely because they're a part of the same records. We can verify this to see how many records have all those features set to null.

In [None]:
# Count the records in which every one of the garage descriptor columns is null.
garage_cols = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']
all_garage_null = df[garage_cols].isnull().all(axis=1)
len(df[all_garage_null])

It's confirmed. All 81 nulls belong to the same records. We won't need to worry about these though, as they are being dropped since they are redundant (for example GarageYrBlt is nearly the same as YearBuilt, and GarageQual is a similar feature to OverallQual). A case could be made to keep these features but for now they will be dropped.

Next let's take a look at the correlations between each feature.

In [None]:
# Correlate the numeric columns only: text columns cannot be correlated, and
# recent pandas versions raise instead of silently skipping them.
numeric_df = df.select_dtypes(include=[np.number])

fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(numeric_df.corr(), cmap="YlGnBu", annot=True, ax=ax)
ax.set_title("Heatmap of Feature Correlation", fontsize=30);

Looking at the correlation heat map we can quickly identify features which are highly related to other features. For example 1stFlrSF and TotalBsmtSF are highly correlated, which makes sense because generally speaking most basements are full basements (the basement is wearing the first floor like a hat). The same goes for 1stFlrSF and 2ndFlrSF (This house is a mad hatter).

Another instance is GarageCars and GarageArea. This makes sense because you need more space to store more cars. This is the same rationale for TotRmsAbvGrd (total rooms above ground) and GrLivArea (above-ground living area).

YearBuilt and GarageYrBlt also appear to be highly correlated. This makes sense because typically the house and the garage are built at the same time.

Let's take some time to graph some of these relationships and see what comes up.

In [None]:
# Living area against sale price, drawn on a large figure so outliers are visible.
plt.subplots(figsize=(20, 12))
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice');

In [None]:
# Boolean mask of large-area / low-price records.
# NOTE: the name shadows the built-in `filter`; it is kept because the cell
# that drops these rows reads it.
filter = df['GrLivArea'].gt(4000) & df['SalePrice'].lt(200000)
df.loc[filter, ['GrLivArea', 'SalePrice']]

As we can see from the graph there are four possible outliers, two with a sale price over $700,000 and two with over 4500 sq. ft. but a much lower price. If we follow the trend of the graph the higher two outliers seem to fit the pattern and we can likely keep these two in however we should probably remove the other two outliers. As this is unique to the training dataset we'll do that below.

In [None]:
# Keep every record that is NOT flagged by the outlier mask.
df = df.loc[~filter]

In [None]:
# Keep only the records whose Electrical value is present.
mask = ~df['Electrical'].isna()
df = df[mask]

In [None]:
# x/y are passed by keyword (as in the earlier scatter plot): recent seaborn
# releases no longer accept them positionally.
sns.scatterplot(x=np.log(df['GrLivArea']), y=np.log(df['SalePrice']));

The Box-Cox test quickly and easily tells us if we need to perform a transform on our data or not by telling us a lambda value:

* -1. is a reciprocal
* -.5 is a reciprocal square root
* 0.0 is a log transformation
* .5 is a square root transform and
* 1.0 is no transform.

In [None]:
bc_list = ['LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'SalePrice']

for c in bc_list:
    # boxcox requires strictly positive input; the tiny shift keeps
    # zero-valued records usable.
    xt, maxlog = boxcox(df[c] + 0.000001)

    fig, ax = plt.subplots()
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally. Plain arrays are used so both vectors are matched
    # by position.
    sns.scatterplot(x=df[c].values, y=xt, alpha=0.5, ax=ax)
    ax.set(xlabel=c, ylabel='Box-Cox transformed {}'.format(c))

    print("{} lambda = {:g}".format(c, maxlog))

Based on the Box-Cox test we ran on the columns from our dataset, we need to perform the following transformations in our `process_df` function:

- Log
  - LotArea
  - 1stFlrSF
  - 2ndFlrSF
  - SalePrice
  - GrLivArea

- Square root
  - TotalBsmtSF
  - GarageArea
  

Alternatively we can just use the transformed data (`xt` above) returned by the boxcox function.

In [None]:
# Number of missing CentralAir values.
df['CentralAir'].isna().sum()

In [None]:
# Start again from the raw CSV, then apply the full preprocessing to `df`.
train_csv_path = os.path.join('data', 'train.csv')
df = pd.read_csv(train_csv_path)
process_df(df)

In [None]:
# Preview the first rows of `df` after the process_df call in the previous cell.
df.head()

# TODO
Apply the lambda values obtained from the boxcox function on the training dataset to the testing dataset:
```
    eps = 1.0  # shift features away from zero
    for i in range(n_feats):
        x_train_boxcox[:, i], lmbda_opt[i] = boxcox(x_train[:, i] + eps)

    x_test_boxcox = np.zeros(x_test.shape)
    for i in range(n_feats):
        x_test_boxcox[:, i] = boxcox(x_test[:, i] + eps, lmbda=lmbda_opt[i])
```