In [2]:
# Read in the raw Kaggle training and testing data.
train_raw = pd.read_csv("data/train.csv")
test_raw = pd.read_csv("data/test.csv")
# Work on an independent copy: a plain `train = train_raw` only aliases the
# frame, so any in-place edit of `train` would also alter `train_raw`, which
# later cells re-use as the pristine input of every cleaning recipe.
train = train_raw.copy()
# Display the available column names.
train_raw.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Generic Data Cleaning

In [1]:
# Libraries
import pandas as pd 
import numpy as np 

# sklearn transformers
from sklearn.preprocessing \
    import StandardScaler, SplineTransformer, PowerTransformer, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import ElasticNetCV

In [7]:
# Recodes every object (string) column of a data frame as categorical.
# The frame is modified in place and also returned.
# @param: train - data frame whose object columns should be recoded
# @return: the same pandas data frame, with its object columns stored
#          using the 'category' dtype
def to_cat(train):
    object_frame = train.select_dtypes(['object'])
    recoded = object_frame.apply(lambda column: column.astype('category'))
    train[object_frame.columns] = recoded
    return train

# Recodes a hand-picked set of columns as categorical even though they are
# stored as numbers in the raw data (building class, years and sale month).
# The frame is modified in place and also returned.
# @param: train - data in its raw form
# @return: the same pandas data frame with those columns marked as categorical.
def some_num_to_cat(train):
    numeric_looking = ('MSSubClass', 'YearBuilt', 'YearRemodAdd',
                       'GarageYrBlt', 'MoSold', 'YrSold')
    for column in numeric_looking:
        train[column] = train[column].astype('category')

    return train

# Adds the hand-engineered features used by the regression and tree recipes.
# The new columns are written onto `train` itself, which is also returned.
# Must run while the year columns are still numeric (i.e. before
# some_num_to_cat), because several features compare or subtract years.
# @param: train - data in its raw form; needs the garage/build/remodel/sale
#                 year columns, the five porch/deck area columns, the basement
#                 and living-area columns and 'Neighborhood'
# @return: the same pandas data frame with the added features
def feat_eng(train): 
    # NewGarage: 1 when the garage was built after the house; 0 otherwise,
    # including houses with no garage year recorded.
    train['NewGarage'] = (
        np.where(train['GarageYrBlt'].isnull(), 0, 
            np.where(train['GarageYrBlt'] > train['YearBuilt'], 1, 0))
    )
    train['NewGarage'] = train['NewGarage'].astype('category')

    # YearSinceRmdl: years between the remodel and the fixed reference year 2016
    train['YearSinceRmdl'] = 2016 - train['YearRemodAdd']

    # Rmdl: 1 when a remodel year later than the build year is recorded
    train['Rmdl'] = np.where(
            train['YearBuilt'] < train['YearRemodAdd'], 1, 0)
    train['Rmdl'] = train['Rmdl'].astype('category')

    # TotalPorchArea: deck plus every porch type
    train['TotalPorchArea'] = (
        train['WoodDeckSF'] + train['OpenPorchSF'] + 
        train['EnclosedPorch'] + train['3SsnPorch'] + 
        train['ScreenPorch']
    )

    # PorchYes: 1 when the house has any deck or porch area
    train['PorchYes'] = np.where(train['TotalPorchArea'] > 0, 1, 0)
    train['PorchYes'] = train['PorchYes'].astype('category')

    # TotalFinishedBsmt
    train['TotalFinishedBsmt'] = train['BsmtFinSF1'] + train['BsmtFinSF2']

    # PercentFinishedBsmt: 0 for houses without a basement (avoids 0 / 0)
    train['PercentFinishedBsmt'] = np.where(train['TotalBsmtSF'] > 0, 
        train['TotalFinishedBsmt'] / train['TotalBsmtSF'] * 100, 0)

    # TotalSqFt: above-ground living area plus finished basement
    train['TotalSqFt'] = train['GrLivArea'] + train['TotalFinishedBsmt']

    # PercentLowQual
    train['PercentLowQual'] = train['LowQualFinSF'] * 100 / train['TotalSqFt']

    # IsNew: 1 when the house was sold in its remodel year
    train['IsNew'] = np.where(
        train['YrSold'] == train['YearRemodAdd'], 1, 0)
    train['IsNew'] = train['IsNew'].astype('category')

    # House_age: years between the remodel year and the sale
    train['House_age'] = train['YrSold'] - train['YearRemodAdd']

    # NeighRich: 2 = rich, 0 = poor, 1 = everything else.
    # `isin` is required here: `x == ('A' or 'B' or 'C')` evaluates the
    # parenthesised expression first and therefore only ever compares
    # against the first name.
    rich_neighborhoods = ['StoneBr', 'NridgHt', 'NoRidge']
    poor_neighborhoods = ['MeadowV', 'IDOTRR', 'BrDale']
    train['NeighRich'] = np.select(
        condlist = [
            train['Neighborhood'].isin(rich_neighborhoods), 
            train['Neighborhood'].isin(poor_neighborhoods)
        ], 
        choicelist = [2, 0],
        default = 1
    )
    train['NeighRich'] = train['NeighRich'].astype('category')
    
    return train

# Helper that turns a quality column graded Ex / Gd / TA / Fa / Po into
# integer scores 5 .. 1. Anything else, including missing values, scores 0.
# The scale was determined ad-hoc.
# @param: train - data in its raw form 
# @param: col_name - a string name of the column to be converted
# @return: a numpy array of integer scores, one per row of train
def ord_scale_1(train, col_name):
    grade_scores = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1}
    grades = train[col_name]
    return np.select(
        condlist = [grades == label for label in grade_scores],
        choicelist = list(grade_scores.values()),
        default = 0
    )

# Helper that turns a basement-finish column graded
# GLQ / ALQ / BLQ / REC / LwQ / Unf into integer scores 6 .. 1.
# Anything else, including missing values, scores 0.
# @param: train - data in its raw form 
# @param: col_name - a string name of the column to be converted
# @return: a numpy array of integer scores, one per row of train
def ord_scale_2(train, col_name):
    finish_scores = {"GLQ": 6, "ALQ": 5, "BLQ": 4, "REC": 3, "LwQ": 2, "Unf": 1}
    finishes = train[col_name]
    return np.select(
        condlist = [finishes == label for label in finish_scores],
        choicelist = list(finish_scores.values()),
        default = 0
    )

# Replaces the ordered categorical columns with integer ordinal codes.
# The frame is modified in place and also returned. Levels that are not
# listed for a column (including missing values) are coded 0.
# @param: train - data frame holding the columns named below
# @return: the same pandas data frame with those columns recoded as integers
def ord_encode(train): 
    # Local helper: score one column from a {level: score} table.
    def recode(column, level_scores):
        values = train[column]
        return np.select(
            condlist = [values == level for level in level_scores],
            choicelist = list(level_scores.values()),
            default = 0
        )

    # Ordinal Scale 1: Ex / Gd / TA / Fa / Po quality grades
    for quality_col in ('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual',
                        'BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual',
                        'GarageCond', 'PoolQC'):
        train[quality_col] = ord_scale_1(train, quality_col)

    # Ordinal Scale 2: basement finish types
    for finish_col in ('BsmtFinType1', 'BsmtFinType2'):
        train[finish_col] = ord_scale_2(train, finish_col)

    # Ad-hoc ordinal scales, applied in the order listed.
    # NOTE: for LotShape, LandSlope and Functional the lowest listed level
    # already scores 0, so unlisted or missing values share that score.
    adhoc_scales = {
        'LotShape': {"Reg": 3, "IR1": 2, "IR2": 1, "IR3": 0},
        'LandSlope': {"Gtl": 2, "Mod": 1, "Sev": 0},
        'BsmtExposure': {"Gd": 4, "Av": 3, "Mn": 2, "No": 1},
        'GarageFinish': {"Fin": 3, "RFn": 2, "Unf": 1},
        'Functional': {"Typ": 7, "Min1": 6, "Min2": 5, "Mod": 4,
                       "Maj1": 3, "Maj2": 2, "Sev": 1, "Sal": 0},
    }
    for column, level_scores in adhoc_scales.items():
        train[column] = recode(column, level_scores)

    return train

# Fills missing values with a k-nearest-neighbours imputer.
# Numeric columns are standardised and categorical columns one-hot encoded
# (first level dropped) so that all of them can enter the distance
# computation; both encodings can be undone afterwards.
# The row index of `train` is carried through every rebuilt frame. Rebuilding
# with a default RangeIndex would make `join` misalign (and fill with NaN)
# whenever the input index is not 0..n-1.
# NOTE(review): missing categorical values appear to pass through the encoder
# as a level of their own rather than being imputed - confirm against the
# installed scikit-learn version.
# @param: train - data frame containing numeric_cols and cat_cols
# @param: numeric_cols - names of the numeric columns
# @param: cat_cols - names of the categorical columns
# @param: neighbors - number of neighbours used by the imputer
# @param: reverse_scale - if True, return numeric columns on their original scale
# @param: reverse_dummy - if True, collapse the dummy columns back into cat_cols
# @return: a new pandas data frame with imputed values; object columns are
#          converted to categorical by to_cat
def knn_Impute(train, numeric_cols, cat_cols, neighbors = 5, 
                reverse_scale = True, reverse_dummy = True):
    row_index = train.index

    # Scale the numeric columns
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(train[numeric_cols])
    train = train.drop(numeric_cols, axis = 1)
    train = train.join(
        pd.DataFrame(scaled_values, columns = numeric_cols, index = row_index))

    # Dummy the categorical columns 
    dummy = OneHotEncoder(drop = 'first')
    dummy_values = dummy.fit_transform(train[cat_cols]).toarray()
    dummy_names = dummy.get_feature_names_out().tolist()
    train = train.drop(cat_cols, axis = 1)
    train = train.join(
        pd.DataFrame(dummy_values, columns = dummy_names, index = row_index))

    # Knn imputation
    imputer = KNNImputer(n_neighbors = neighbors)
    train = pd.DataFrame(imputer.fit_transform(train), 
        columns = train.columns, index = row_index)

    # Reverse scaling
    if reverse_scale: 
        no_scale_values = scaler.inverse_transform(train[numeric_cols])
        train = train.drop(numeric_cols, axis = 1)
        train = train.join(
            pd.DataFrame(no_scale_values, columns = numeric_cols, index = row_index))

    # Reverse dummies
    if reverse_dummy: 
        no_dummy_values = dummy.inverse_transform(train[dummy_names]) 
        train = train.drop(dummy_names, axis = 1)
        train = train.join(
            pd.DataFrame(no_dummy_values, columns = cat_cols, index = row_index))

    # Reversal of dummy makes them objects again
    return to_cat(train)

# Replaces categorical columns with one-hot (dummy) columns.
# The dummy frame is built on the index of `train`, so the join stays aligned
# even when that index is not the default 0..n-1 range.
# @param: train - data frame containing cat_cols
# @param: cat_cols - names of the categorical columns to encode
# @param: drop_first - if True, drop the first level of every column
#                      (n - 1 dummies); otherwise keep all n levels
# @return: a new pandas data frame with cat_cols replaced by dummy columns
def dummy_cols(train, cat_cols, drop_first = True): 
    if drop_first:
        dummy = OneHotEncoder(drop = 'first')
    else:
        dummy = OneHotEncoder()
    dummy_values = dummy.fit_transform(train[cat_cols]).toarray()
    dummy_names = dummy.get_feature_names_out().tolist()
    dummy_frame = pd.DataFrame(dummy_values, columns = dummy_names, 
        index = train.index)
    train = train.drop(cat_cols, axis = 1)
    train = train.join(dummy_frame)

    return train

# Drops the columns whose variance does not exceed a threshold.
# @param: train - an all-numeric data frame
# @param: threshold - variance cut-off handed to VarianceThreshold
# @return: a pandas data frame holding only the columns the selector keeps
def drop_nzv(train, threshold = 0.05): 
    selector = VarianceThreshold(threshold = threshold)
    selector.fit(train)
    keep_mask = selector.get_support()

    return train.loc[:, keep_mask]

# Applies a power transformation (PowerTransformer with its default method)
# to the numeric columns.
# The transformed frame is built on the index of `train`, so the join stays
# aligned even when that index is not the default 0..n-1 range.
# @param: train - data frame containing numeric_cols
# @param: numeric_cols - names of the columns to transform
# @param: standardize - passed on to PowerTransformer(standardize = ...)
# @return: a new pandas data frame with numeric_cols replaced by their
#          transformed values (moved to the end of the frame)
def yeo_johnson(train, numeric_cols, standardize = False):
    yj = PowerTransformer(standardize = standardize)
    yj_values = yj.fit_transform(train[numeric_cols])
    yj_frame = pd.DataFrame(yj_values, columns = numeric_cols, 
        index = train.index)
    train = train.drop(numeric_cols, axis = 1)
    train = train.join(yj_frame)

    return train

# Standardises the numeric columns with StandardScaler.
# The scaled frame is built on the index of `train`, so the join stays
# aligned even when that index is not the default 0..n-1 range.
# @param: train - data frame containing numeric_cols
# @param: numeric_cols - names of the columns to scale
# @return: a new pandas data frame with numeric_cols replaced by their
#          scaled values (moved to the end of the frame)
def standardize(train, numeric_cols):
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(train[numeric_cols])
    scaled_frame = pd.DataFrame(scaled_values, columns = numeric_cols, 
        index = train.index)
    train = train.drop(numeric_cols, axis = 1)
    train = train.join(scaled_frame)

    return train

# Replaces each listed column with its spline basis expansion
# (SplineTransformer, no bias column).
# New columns are named <col>_ns1, <col>_ns2, ... The number of names is taken
# from the width of the transformer output rather than assumed to equal
# `degree`, so non-default `knots` values no longer fail with a column-count
# mismatch. With the defaults the names are unchanged (_ns1 .. _ns3).
# The basis frame is built on the index of `train` so the join stays aligned.
# @param: train - data frame containing cols
# @param: cols - names of the numeric columns to expand
# @param: degree - polynomial degree of the spline basis
# @param: knots - number of knots handed to SplineTransformer
# @return: a new pandas data frame with every column in cols replaced by
#          its basis columns
def add_ns_3(train, cols, degree = 3, knots = 2): 
    spliner = SplineTransformer(degree = degree, n_knots = knots, include_bias = False)

    for i in cols:
        x = train[i].values.reshape(-1, 1)
        basis = spliner.fit_transform(x)
        new_col_names = [(i + "_ns" + str(j)) for j in range(1, basis.shape[1] + 1)]
        spline = pd.DataFrame(basis, columns = new_col_names, index = train.index)
        train = train.join(spline)
        train = train.drop(i, axis = 1)
    
    return train

# Drops one column out of every highly correlated pair.
# For each pair whose absolute correlation exceeds the threshold, the column
# that appears later in the frame is removed.
# @param: df - a data frame whose pairwise correlations can be computed
# @param: threshold - absolute correlation above which a column is dropped
# @return: a pandas data frame without the dropped columns
def drop_high_cor(df, threshold = 0.9):
    abs_corr = df.corr().abs()

    # Keep only the entries strictly above the diagonal so that every pair is
    # looked at once and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    # A column is flagged when it correlates too strongly with any column
    # that precedes it.
    flagged = (upper > threshold).any(axis=0)
    to_drop = flagged[flagged].index.tolist()

    return df.drop(to_drop, axis=1)

# Penalized Regression

## Data Cleaning

In [None]:
# Data cleaning recipe for the penalized regression model.
# Steps: engineered features -> categorical marking -> ordinal encoding ->
# knn imputation -> n - 1 dummies -> power transform -> standardisation ->
# spline expansion -> (optional) log response.
# The caller's frame is never modified: all work happens on a copy.
# NOTE(review): every transformer is re-fit on whatever frame is passed in,
# so training and testing data are scaled with their own statistics.
# @param: train - raw data; 'Id' and 'SalePrice' are optional columns
# @param: verbose - if True, print one line per preprocessing step
# @param: rep - if True, put log(SalePrice) back as the first feature column;
#               requires a 'SalePrice' column
# @return: a new pandas data frame of model-ready features, with 'Id'
#          re-inserted as the first column when the input had one
def recipe_pLR(train, verbose = True, rep = True): 
    if verbose: print('Preprocessing Steps: ')
    # feat_eng and the encoders write onto the frame they receive, so detach
    # from the caller's data first.
    train = train.copy()

    # Drop Id (kept aside; None when the input has no Id column)
    train_Id = None
    if 'Id' in train: 
        train_Id = train['Id'] 
        train = train.drop('Id', axis = 1)

    # Drop Response (kept aside; None when the input has no response)
    train_rep = None
    if 'SalePrice' in train: 
        train_rep = train['SalePrice']
        train = train.drop('SalePrice', axis = 1)

    # Fail with a clear message instead of an unbound-variable error at the
    # very end of the recipe.
    if rep and train_rep is None:
        raise ValueError("rep = True requires a 'SalePrice' column in the data")

    # Add user features
    train = feat_eng(train)
    if verbose: print("1. Added user engineered features")

    # Marks columns as categorical 
    train = to_cat(train)
    train = some_num_to_cat(train)
    if verbose: print('2. Encoded user specified variables as categorical')

    # Ordinally encodes select variables
    train = ord_encode(train)
    if verbose: print("3. Encoded user specified variables to be ordinal")

    # Track which variables are numeric and categorical 
    numeric_cols = train.select_dtypes(include = np.number).columns
    cat_cols = train.select_dtypes('category').columns

    # Imputes missing values
    train = knn_Impute(train, numeric_cols, cat_cols, reverse_dummy = True)
    if verbose: print("4. Imputed missing values using knn with k = 5")

    # Create dummy variable
    train = dummy_cols(train, cat_cols)
    if verbose: print("5. Categorical columns were convert into n - 1 binary dummy variables")

    # Yeo-Johnson on Numerics
    train = yeo_johnson(train, numeric_cols)
    if verbose: print('6. Yeo-Johnson Transformation of numeric columns')

    # Standardized
    train = standardize(train, numeric_cols)
    if verbose: print('7. Numeric columns scaled to mean 0 and unit variance')

    # Splines
    train = add_ns_3(train, cols = numeric_cols)
    if verbose: print('8. Numeric features transformed into natural cubic splines')

    # Add log Price back in 
    if rep: 
        train.insert(loc = 0, column = 'SalePrice', value = np.log(train_rep))
        if verbose: print("9. Log transformation of response")

    # Add Ids back in (only when the input carried them)
    if train_Id is not None:
        train.insert(loc = 0, column = 'Id', value = train_Id)

    return(train)

In [None]:
# Build the penalized-regression training frame from the raw training data.
train_pLR = recipe_pLR(train_raw)

## Model Fitting

In [None]:
# Run the penalized regression model: elastic net with the l1_ratio chosen
# from the candidates below by 10-fold cross-validation.
penalty_type = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]
# Response (log SalePrice, as produced by recipe_pLR) and feature matrix.
# NOTE: X and Y are re-assigned by the later model sections of this notebook.
Y = train_pLR['SalePrice']
X = train_pLR.drop(['SalePrice','Id'], axis = 1)
pLR_model = ElasticNetCV(l1_ratio = penalty_type, cv = 10, 
    verbose = 1, random_state = 123) # 123 is the random seed
pLR_fit = pLR_model.fit(X, Y)

In [None]:
# Extract the cross-validated hyper-parameters (Lasso with small penalty)
optimal_type = pLR_model.l1_ratio_
optimal_pen = pLR_model.alpha_

# Extract the coefficients the penalty did not shrink to zero.
# Filtering on `!= 0` keeps negative effects too; `> 0` silently discarded
# every selected feature with a negative coefficient.
coef = pd.DataFrame(pLR_model.coef_, index = X.columns, columns = ["Beta"])
coef = coef[coef['Beta'] != 0]
coef.head()

## Model Evaluation

In [None]:
# Training RMSE on the original price scale (the model predicts log prices)
from sklearn.metrics import mean_squared_error
pLR_train_pred = np.exp(pLR_model.predict(X))
# np.sqrt(MSE) is used instead of `squared = False`, which recent
# scikit-learn releases no longer accept.
pLR_train_rmse = np.sqrt(mean_squared_error(np.exp(Y), pLR_train_pred))
print(pLR_train_rmse)

# Make predictions on testing data

# Cleaning Testing data
test_pLR = recipe_pLR(test_raw, rep = False)

# Match training and testing columns in one step: reindexing on the training
# feature names drops columns that only exist in the test data (including
# Id), adds all-zero columns for dummy levels the test data never produced,
# and puts the features in training order.
X_test = test_pLR.reindex(columns = X.columns, fill_value = 0)

# Apply model (back-transform from log prices)
pLR_predictions = np.exp(pLR_model.predict(X_test))

# Submission Format
kaggle_pLR = pd.DataFrame()
kaggle_pLR['Id'] = test_raw['Id']
kaggle_pLR['SalePrice'] = pLR_predictions

kaggle_pLR.to_csv("pLR_Kaggle_Submission_final.csv", index = False)

# Random Forest

## Data Cleaning

In [None]:
# Data cleaning recipe for the tree-based models.
# Steps: engineered features -> categorical marking -> ordinal encoding ->
# knn imputation -> n dummies (no level dropped) -> standardisation.
# The response is returned on its original scale (no log transform).
# The caller's frame is never modified: all work happens on a copy.
# NOTE(review): every transformer is re-fit on whatever frame is passed in,
# so training and testing data are scaled with their own statistics.
# @param: train - raw data; 'Id' and 'SalePrice' are optional columns
# @param: verbose - if True, print one line per preprocessing step
# @param: rep - if True, put SalePrice back as the first feature column;
#               requires a 'SalePrice' column
# @return: a new pandas data frame of model-ready features, with 'Id'
#          re-inserted as the first column when the input had one
def recipe_RF(train, verbose = True, rep = True):
    if verbose: print('Preprocessing Steps: ')
    # feat_eng and the encoders write onto the frame they receive, so detach
    # from the caller's data first.
    train = train.copy()

    # Drop Id (kept aside; None when the input has no Id column)
    train_Id = None
    if 'Id' in train: 
        train_Id = train['Id'] 
        train = train.drop('Id', axis = 1)

    # Drop Response (kept aside; None when the input has no response)
    train_rep = None
    if 'SalePrice' in train: 
        train_rep = train['SalePrice']
        train = train.drop('SalePrice', axis = 1)

    # Fail with a clear message instead of an unbound-variable error at the
    # very end of the recipe.
    if rep and train_rep is None:
        raise ValueError("rep = True requires a 'SalePrice' column in the data")

    # Add user features
    train = feat_eng(train)
    if verbose: print("1. Added user engineered features")

    # Marks columns as categorical 
    train = to_cat(train)
    train = some_num_to_cat(train)
    if verbose: print('2. Encoded user specified variables as categorical')

    # Ordinally encodes select variables
    train = ord_encode(train)
    if verbose: print("3. Encoded user specified variables to be ordinal")

    # Track which variables are numeric and categorical 
    numeric_cols = train.select_dtypes(include = np.number).columns
    cat_cols = train.select_dtypes('category').columns

    # Imputes missing values
    train = knn_Impute(train, numeric_cols, cat_cols, reverse_dummy = True)
    if verbose: print("4. Imputed missing values using knn with k = 5")

    # Create dummy variable
    train = dummy_cols(train, cat_cols, drop_first = False)
    if verbose: print("5. Categorical columns were convert into n binary dummy variables")

    # Standardized
    train = standardize(train, numeric_cols)
    if verbose: print('6. Numeric columns scaled to mean 0 and unit variance')

    # Add Price back in 
    if rep: 
        train.insert(loc = 0, column = 'SalePrice', value = train_rep)

    # Add Ids back in (only when the input carried them)
    if train_Id is not None:
        train.insert(loc = 0, column = 'Id', value = train_Id)

    return(train)

In [None]:
# Build the random-forest training frame from the raw training data.
train_RF = recipe_RF(train_raw)

## Model Fitting

In [None]:
# Import random forest package.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Creates a random grid of tuning parameters for the RF model.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' stood for in a random forest
# regressor; the 'auto' alias itself is rejected by current scikit-learn.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# Set up a search over 50 random parameter combinations from the grid,
# each scored with 5-fold cross-validation.
# The forest itself is seeded as well: seeding only the search fixes which
# combinations are drawn, but leaves every fitted forest (and therefore the
# scores and the selected parameters) different on each run.
rf = RandomForestRegressor(random_state = 123)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
    n_iter = 50, cv = 5, verbose = 2, random_state = 123, n_jobs = 30) 
        # Note: n_jobs indicates how many instances to run in parallel. 
        # Error occurs when trying to use over 30 cores.

In [None]:
# Fit the random-search models.
# X and Y are re-bound here to the random-forest features and the
# untransformed SalePrice response taken from train_RF.
Y = train_RF['SalePrice']
X = train_RF.drop(['SalePrice','Id'], axis = 1)
rf_random.fit(X, Y)

In [None]:
# Best parameter combination found by the random search
rf_random.best_params_

In [None]:
# Creates a parameter grid for a grid search based on the random search results.
param_grid = {'n_estimators': [1200, 1300, 1400, 1500, 1600],
               'max_features': ['sqrt'],
               'max_depth': [40, 45, 50, 55, 60],
               'min_samples_split': [2, 3, 4, 5, 6, 7],
               'min_samples_leaf': [1],
               'bootstrap': [False]}

from sklearn.model_selection import GridSearchCV

# GridSearchCV evaluates every combination exhaustively and has no
# `random_state` argument; passing one raises a TypeError. Any seeding has to
# happen on the estimator itself.
rf_grid = GridSearchCV(estimator = rf, param_grid = param_grid, 
    cv = 5, n_jobs = 30, verbose = 2)

In [None]:
# Fit every parameter combination of the grid search on the RF features
rf_grid.fit(X, Y)

In [None]:
# Best parameter combination found by the grid search
rf_grid.best_params_

## Model Evaluation

In [None]:
# Extract Best Model
rf_optimal = rf_grid.best_estimator_

# Training RMSE
from sklearn.metrics import mean_squared_error
rf_train_pred = rf_optimal.predict(X)
# np.sqrt(MSE) is used instead of `squared = False`, which recent
# scikit-learn releases no longer accept.
rf_train_rmse = np.sqrt(mean_squared_error(Y, rf_train_pred))
print(rf_train_rmse)

# Make predictions on testing data

# Cleaning Testing data
test_rf = recipe_RF(test_raw, rep = False)

# Match training and testing columns in one step: reindexing on the training
# feature names drops columns that only exist in the test data (including
# Id), adds all-zero columns for dummy levels the test data never produced,
# and puts the features in training order. X already holds the training
# feature names, so the training recipe does not need to be run again.
X_test = test_rf.reindex(columns = X.columns, fill_value = 0)

# Apply model 
rf_test_pred = rf_optimal.predict(X_test)

# Submission Format
kaggle_rf = pd.DataFrame()
kaggle_rf['Id'] = test_raw['Id']
kaggle_rf['SalePrice'] = rf_test_pred

kaggle_rf.to_csv("rf_Kaggle_Submission_final.csv", index = False)

# Gradient Boosting

## Data Cleaning

In [None]:
# Gradient boosting uses the same data cleaning as the RF model;
# recipe_XGB is just another name for recipe_RF.
recipe_XGB = recipe_RF
train_XGB = recipe_XGB(train_raw)

# Re-bind X and Y to the boosting features and the untransformed response.
Y = train_XGB['SalePrice']
X = train_XGB.drop(['SalePrice','Id'], axis = 1)

## Model Fitting

In [None]:
# Gradient boosting package.
# Note: despite the "xgb" naming used in this section, the estimator is
# scikit-learn's GradientBoostingRegressor, not the XGBoost library.
from sklearn.ensemble import GradientBoostingRegressor
xgb = GradientBoostingRegressor()

In [None]:
# Creates a random grid of tuning parameters for the gradient boosting model.

# Number of boosting stages (trees)
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Learning rate of the weak learner.
# The list previously held 0.01 twice, which only doubled its sampling weight
# and left two distinct rates; 0.1 completes the 0.001 / 0.01 / 0.1 sequence.
learning_rate = [0.001, 0.01, 0.1]

# Type of loss function to optimize
loss = ['squared_error', 'absolute_error']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'learning_rate': learning_rate,
               'loss': loss}

print(random_grid)

In [None]:
# Random search: 50 parameter combinations sampled from the above grid,
# each evaluated with 5-fold cross-validation (250 model fits in total).
xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, 
    n_iter = 50, cv = 5, verbose = 2, random_state = 123, n_jobs = 30) 

xgb_random.fit(X, Y)

In [None]:
# Optimal Parameters based on a random search
# (print is needed: only the last expression of a cell is displayed)
print(xgb_random.best_params_)

# Define a new grid to conduct an exhaustive search.
# min_samples_split starts at 2: a node needs at least two samples to be
# split, and scikit-learn rejects an integer value of 1.
xgb_grid_search = {'n_estimators': [700, 800, 900],
                   'max_depth': [55, 60, 65],
                   'min_samples_split': [2, 3, 4],
                   'learning_rate': [0.005, 0.01, 0.015],
                   'loss': ['absolute_error']}

# GridSearchCV evaluates every combination exhaustively and has no
# `random_state` argument; passing one raises a TypeError.
xgb_grid = GridSearchCV(estimator = xgb, param_grid = xgb_grid_search, 
    cv = 5, n_jobs = 30, verbose = 2)

In [None]:
# Fit every parameter combination of the grid search on the boosting features
xgb_grid.fit(X, Y)

In [None]:
# Best parameter combination found by the grid search
xgb_grid.best_params_

## Model Evaluation

In [None]:
# Extract Best Model
xgb_optimal = xgb_grid.best_estimator_

# Training RMSE
# np.sqrt(MSE) is used instead of `squared = False`, which recent
# scikit-learn releases no longer accept.
xgb_train_pred = xgb_optimal.predict(X)
xgb_train_rmse = np.sqrt(mean_squared_error(Y, xgb_train_pred))
print(xgb_train_rmse)

# Make predictions on testing data

# Cleaning Testing data
test_xgb = recipe_XGB(test_raw, rep = False)

# Match training and testing columns in one step: reindexing on the training
# feature names drops columns that only exist in the test data (including
# Id), adds all-zero columns for dummy levels the test data never produced,
# and puts the features in training order. X already holds the training
# feature names, so the training recipe does not need to be run again.
X_test = test_xgb.reindex(columns = X.columns, fill_value = 0)

# Apply model 
xgb_test_pred = xgb_optimal.predict(X_test)

# Submission Format
kaggle_xgb = pd.DataFrame()
kaggle_xgb['Id'] = test_raw['Id']
kaggle_xgb['SalePrice'] = xgb_test_pred

kaggle_xgb.to_csv("xgb_Kaggle_Submission_final.csv", index = False)

# Neural Network

In [3]:
# Neural Network Packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import LeakyReLU
import keras_tuner

## Data Cleaning

In [8]:
# Data cleaning recipe for the neural network models.
# Compared with the other recipes there is:
#   - no feature engineering
#   - no adjustment for skewness
#   - no spline expansion
#   - no log transform of the response
# The caller's frame is never modified: all work happens on a copy.
# NOTE(review): every transformer is re-fit on whatever frame is passed in,
# so training and testing data are scaled with their own statistics.
# @param: train - raw data; 'Id' and 'SalePrice' are optional columns
# @param: verbose - if True, print one line per preprocessing step
# @param: rep - if True, put SalePrice back as the first feature column;
#               requires a 'SalePrice' column
# @return: a new pandas data frame of model-ready features, with 'Id'
#          re-inserted as the first column when the input had one
def recipe_NN(train, verbose = True, rep = True): 
    if verbose: print('Preprocessing Steps: ')
    # The encoders write onto the frame they receive, so detach from the
    # caller's data first.
    train = train.copy()

    # Drop Id (kept aside; None when the input has no Id column)
    train_Id = None
    if 'Id' in train: 
        train_Id = train['Id'] 
        train = train.drop('Id', axis = 1)

    # Drop Response (kept aside; None when the input has no response)
    train_rep = None
    if 'SalePrice' in train: 
        train_rep = train['SalePrice']
        train = train.drop('SalePrice', axis = 1)

    # Fail with a clear message instead of an unbound-variable error at the
    # very end of the recipe.
    if rep and train_rep is None:
        raise ValueError("rep = True requires a 'SalePrice' column in the data")

    # Marks columns as categorical 
    train = to_cat(train)
    train = some_num_to_cat(train)
    if verbose: print('1. Encoded user specified variables as categorical')

    # Ordinally encodes select variables
    train = ord_encode(train)
    if verbose: print("2. Encoded user specified variables to be ordinal")

    # Track which variables are numeric and categorical 
    numeric_cols = train.select_dtypes(include = np.number).columns
    cat_cols = train.select_dtypes('category').columns

    # Imputes missing values
    train = knn_Impute(train, numeric_cols, cat_cols, reverse_dummy = True)
    if verbose: print("3. Imputed missing values using knn with k = 5")

    # Create dummy variable
    train = dummy_cols(train, cat_cols, drop_first = False)
    if verbose: print("4. Categorical columns were convert into n binary dummy variables")

    # Standardized
    train = standardize(train, numeric_cols)
    if verbose: print('5. Numeric columns scaled to mean 0 and unit variance')

    # Add Price back in (original scale, no log)
    if rep: 
        train.insert(loc = 0, column = 'SalePrice', value = train_rep)

    # Add Ids back in (only when the input carried them)
    if train_Id is not None:
        train.insert(loc = 0, column = 'Id', value = train_Id)

    return(train)

In [9]:
# Apply the neural-network cleaning recipe to the raw training data and
# re-bind X / Y to the resulting features and untransformed response.
train_NN_clean = recipe_NN(train_raw)
X = train_NN_clean.drop(['Id', 'SalePrice'], axis = 1)
Y = train_NN_clean['SalePrice']

Preprocessing Steps: 
1. Encoded user specified variables as categorical
2. Encoded user specified variables to be ordinal
3. Imputed missing values using knn with k = 5
4. Categorical columns were convert into n binary dummy variables
5. Numeric columns scaled to mean 0 and unit variance


## Model Fitting

## Hyperparameter Tuning

In [24]:
def build_model(hp):
    """Build and compile one candidate network for the hyperparameter search.

    The architecture is a stack of 1-3 dense hidden layers, one dropout layer
    and a single-unit output layer. The hyperparameters registered on `hp`
    are: "num_layers", "units{i}" (one per hidden layer), "activation",
    "lasso", "ridge", "Dropout" and "momentum".

    @param hp: hyperparameter container supplied by keras_tuner.
    @return: the compiled keras Sequential model (MSE loss, 'rmse' metric).
    """
    model = keras.Sequential()
    # Input Layer
    # The input width is hard-coded to 531 features.
    # NOTE(review): presumably this must equal X.shape[1] after recipe_NN - confirm.
    model.add(keras.Input(shape = (531, )))

    # Tune the number of hidden layers
    for i in range(hp.Int("num_layers", min_value = 1, max_value = 3, step = 1)):
        model.add(
            # Tune the number of nodes in each layer
            layers.Dense(
                units = hp.Int(f"units{i}", min_value = 5, max_value = 30, step = 5), 
                # Tune activation function between layers
                # ("activation" is one hyperparameter name, so the same choice
                # is requested for every hidden layer)
                activation = hp.Choice("activation", ["relu", "tanh", "LeakyReLU", "linear"]),
                    # Note: "linear" is the identity activation
                # Tune weight penalty
                kernel_regularizer = keras.regularizers.L1L2(
                    l1 = hp.Float("lasso", min_value = 0, max_value = 2, step = 0.1), 
                    l2 = hp.Float("ridge", min_value = 0, max_value = 2, step = 0.1)     
                ),
                # Tune bias penalty
                # (requested under the same "lasso" / "ridge" names as the
                # weight penalty above)
                bias_regularizer = keras.regularizers.L1L2(
                    l1 = hp.Float("lasso", min_value = 0, max_value = 2, step = 0.1), 
                    l2 = hp.Float("ridge", min_value = 0, max_value = 2, step = 0.1) 
                ), 
            )
        )

    # Tune Dropout rate
    model.add(layers.Dropout(rate = hp.Float("Dropout", min_value = 0, max_value = 0.5, step = 0.1)))

    # Output Layer
    # NOTE(review): a relu output cannot produce negative predictions - confirm
    # this is intended for the untransformed price response.
    model.add(layers.Dense(1, activation = 'relu'))

    # Learning rate schedule with decay: 0.01 up to the boundary at
    # optimizer step 500, 0.001 afterwards.
    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries = [500], values = [0.01, 0.001])

    # Model Compiler
    model.compile(
        loss = keras.losses.MeanSquaredError(), 

        # Tune optimization 
        # SGD caused nan RMSE
        optimizer = keras.optimizers.RMSprop(learning_rate = learning_rate_fn, 
            momentum = hp.Float("momentum", min_value = 0, max_value = 0.5, step = 0.1)
        ), 

        metrics = [keras.metrics.RootMeanSquaredError(name = 'rmse')]
    )

    return model 

# Smoke test: build one model with default hyperparameter values.
build_model(keras_tuner.HyperParameters())

<keras.engine.sequential.Sequential at 0x26df9ad4430>

In [26]:
# Random search over the hyperparameter space defined in build_model:
# 5 trials, each trained twice (executions_per_trial = 2).
# NOTE(review): the objective 'rmse' is the metric name given at compile
# time; no validation data is passed to tuner.search, so trials appear to be
# ranked on training error - consider 'val_rmse' together with a validation
# split.

tuner = keras_tuner.RandomSearch(
    hypermodel = build_model, 
    max_trials = 5, 
    executions_per_trial = 2,
    objective = keras_tuner.Objective('rmse', direction = 'min'), 
    overwrite = True,
    seed = 123, 
    directory = "NN_Tuning_2",
    project_name = "test_rand_final_3"
)

tuner.search_space_summary()

Search space summary
Default search space size: 7
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 3, 'step': 1, 'sampling': None}
units0 (Int)
{'default': None, 'conditions': [], 'min_value': 5, 'max_value': 30, 'step': 5, 'sampling': None}
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh', 'LeakyReLU', 'linear'], 'ordered': False}
lasso (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 2.0, 'step': 0.1, 'sampling': None}
ridge (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 2.0, 'step': 0.1, 'sampling': None}
Dropout (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': None}
momentum (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': None}


In [27]:
# Run the search: every trial trains on the full X / Y for 1000 epochs.
tuner.search(X, Y, epochs = 1000, verbose = 2)

Trial 5 Complete [00h 01m 40s]
rmse: 196503.4375

Best rmse So Far: 37657.7109375
Total elapsed time: 00h 07m 42s
INFO:tensorflow:Oracle triggered exit


In [28]:
# Extract Best Model
# get_best_models returns a list, so the best model is model_best[0].
model_best = tuner.get_best_models(num_models = 1)
model_best[0].summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 15)                7980      
                                                                 
 dense_1 (Dense)             (None, 10)                160       
                                                                 
 dense_2 (Dense)             (None, 15)                165       
                                                                 
 dropout (Dropout)           (None, 15)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 16        
                                                                 
Total params: 8,321
Trainable params: 8,321
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Print the hyperparameters and score of the best trials
tuner.results_summary()

Results summary
Results in NN_Tuning_2\test_rand_final_2
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x0000026DFA9A2DF0>
Trial summary
Hyperparameters:
num_layers: 3
units0: 15
activation: linear
lasso: 1.1
ridge: 0.7000000000000001
Dropout: 0.30000000000000004
momentum: 0.30000000000000004
units1: 10
units2: 15
Score: 37657.7109375
Trial summary
Hyperparameters:
num_layers: 1
units0: 20
activation: LeakyReLU
lasso: 0.6000000000000001
ridge: 1.7000000000000002
Dropout: 0.4
momentum: 0.5
units1: 5
units2: 20
Score: 43132.853515625
Trial summary
Hyperparameters:
num_layers: 3
units0: 25
activation: linear
lasso: 0.30000000000000004
ridge: 0.9
Dropout: 0.30000000000000004
momentum: 0.0
units1: 5
units2: 5
Score: 55659.349609375
Trial summary
Hyperparameters:
num_layers: 3
units0: 15
activation: linear
lasso: 1.7000000000000002
ridge: 1.4000000000000001
Dropout: 0.4
momentum: 0.30000000000000004
units1: 30
units2: 5
Score: 66783.90625
Trial summary
Hyperparamet

In [30]:
# Predicting on Test Data

# Cleaning Testing data for NN
test_NN = recipe_NN(test_raw, rep = False)

# Match training and testing columns in one step: reindexing on the training
# feature names drops columns that only exist in the test data (including
# Id), adds all-zero columns for dummy levels the test data never produced,
# and puts the features in training order.
X_test = test_NN.reindex(columns = X.columns, fill_value = 0)

NN_pred_reduced = model_best[0].predict(X_test)

Preprocessing Steps: 
1. Encoded user specified variables as categorical
2. Encoded user specified variables to be ordinal
3. Imputed missing values using knn with k = 5
4. Categorical columns were convert into n binary dummy variables
5. Numeric columns scaled to mean 0 and unit variance


In [31]:
# Submission format: test Ids next to the network's predicted sale prices.
# NOTE(review): NN_pred_reduced presumably has shape (n, 1) because the
# network has a single output unit; flatten it first if the column
# assignment ever complains - confirm.
kaggle_NN_2 = pd.DataFrame()
kaggle_NN_2['Id'] = test_raw['Id']
kaggle_NN_2['SalePrice'] = NN_pred_reduced

kaggle_NN_2.to_csv("NN_Kaggle_Submission_reduced.csv", index = False)

In [10]:
# Reduce the hyperparameter space
# Builds and compiles a tunable feed-forward regression network for keras_tuner.
# Hyperparameters registered on hp:
#   num_layers (2 to 4)            - number of hidden Dense layers
#   units{i}   (20 to 50, step 5)  - width of hidden layer i
#   activation                     - relu / LeakyReLU / linear, shared by all hidden layers
#   lasso, ridge (0 to 5, step .5) - L1 / L2 penalties, applied to both the kernel
#                                    and the bias of every hidden layer
#   Dropout    (0 to 0.5, step .1) - rate of the single Dropout layer placed
#                                    before the output layer
#   momentum   (0 to 0.5, step .1) - RMSprop momentum
# @param: hp - keras_tuner HyperParameters object supplied by the tuner
# @param: input_dim - number of input features; defaults to 531, the value
#         previously hard-coded (NOTE(review): presumably X.shape[1] -- confirm)
# @return: a compiled keras Sequential model (MSE loss, RMSE metric)
def build_model_2(hp, input_dim = 531):
    model = keras.Sequential()
    # Input Layer
    model.add(keras.Input(shape = (input_dim, )))

    # Tune the number of hidden layers
    n_hidden = hp.Int("num_layers", min_value = 2, max_value = 4, step = 1)
    for i in range(n_hidden):
        # Tune the number of nodes in each layer
        units = hp.Int(f"units{i}", min_value = 20, max_value = 50, step = 5)
        # Tune activation function between layers
        # ("linear" is the identity activation)
        activation = hp.Choice("activation", ["relu", "LeakyReLU", "linear"])
        # Tune the penalties. A hyperparameter name resolves to one value per
        # trial, so each is read once here and reused for kernel and bias.
        lasso = hp.Float("lasso", min_value = 0, max_value = 5, step = 0.5)
        ridge = hp.Float("ridge", min_value = 0, max_value = 5, step = 0.5)

        model.add(
            layers.Dense(
                units = units,
                activation = activation,
                # Weight penalty
                kernel_regularizer = keras.regularizers.L1L2(l1 = lasso, l2 = ridge),
                # Bias penalty
                bias_regularizer = keras.regularizers.L1L2(l1 = lasso, l2 = ridge),
            )
        )

    # Tune Dropout rate
    model.add(layers.Dropout(rate = hp.Float("Dropout", min_value = 0, max_value = 0.5, step = 0.1)))

    # Output Layer
    # NOTE(review): a relu output can only emit values >= 0; if the unit's
    # pre-activation goes negative for every sample it outputs 0 with zero
    # gradient. Consider a linear output -- left unchanged here.
    model.add(layers.Dense(1, activation = 'relu'))

    # Learning rate schedule with decay:
    # 0.01 for the first 500 optimizer steps, 0.001 afterwards.
    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries = [500], values = [0.01, 0.001])

    # Model Compiler
    model.compile(
        loss = keras.losses.MeanSquaredError(),

        # Tune optimization
        # SGD caused nan RMSE
        optimizer = keras.optimizers.RMSprop(learning_rate = learning_rate_fn,
            momentum = hp.Float("momentum", min_value = 0, max_value = 0.5, step = 0.1)
        ),

        metrics = [keras.metrics.RootMeanSquaredError(name = 'rmse')]
    )

    return model

# Smoke test: build once with default hyperparameter values
build_model_2(keras_tuner.HyperParameters())

<keras.engine.sequential.Sequential at 0x15767b20b50>

In [19]:
# Hyperband search over the hyperparameter space defined by build_model_2

# Tuner settings gathered in one place so they can be read at a glance.
# overwrite = True starts the project fresh instead of reloading earlier results.
hyperband_settings = dict(
    objective = "val_loss",
    max_epochs = 1000,
    overwrite = True,
    seed = 123,
    directory = "Hyperband",
    project_name = "test_hyband_9",
)

tuner_2 = keras_tuner.Hyperband(hypermodel = build_model_2, **hyperband_settings)

# Print the registered search space
tuner_2.search_space_summary()

Search space summary
Default search space size: 8
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 4, 'step': 1, 'sampling': None}
units0 (Int)
{'default': None, 'conditions': [], 'min_value': 20, 'max_value': 50, 'step': 5, 'sampling': None}
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'LeakyReLU', 'linear'], 'ordered': False}
lasso (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 5.0, 'step': 0.5, 'sampling': None}
ridge (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 5.0, 'step': 0.5, 'sampling': None}
units1 (Int)
{'default': None, 'conditions': [], 'min_value': 20, 'max_value': 50, 'step': 5, 'sampling': None}
Dropout (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': None}
momentum (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': None}


In [20]:
import time  # system time is used to give each run a unique log folder name

# Early stopping: end training once val_loss has gone 5 epochs without improving
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = 5,
)

# TensorBoard callback writing into a per-run folder under logs/
Board_Log_Name = f"Tensorboard_Log_{int(time.time())}"
tensorboard = TensorBoard(
    log_dir=f"logs/{Board_Log_Name}",
)

In [21]:
# Run the Hyperband search on (X, Y), holding out 10% of the rows for validation.
# Both callbacks are passed to every trial.
search_callbacks = [stop_early, tensorboard]
tuner_2.search(
    X,
    Y,
    epochs = 1000,
    validation_split = 0.1,
    callbacks = search_callbacks,
)

Trial 2074 Complete [00h 00m 03s]
val_loss: 596423360.0

Best val_loss So Far: 427372384.0
Total elapsed time: 01h 01m 28s
INFO:tensorflow:Oracle triggered exit


In [26]:
# Print the 10 best trials (10 is the default, spelled out here for the reader)
tuner_2.results_summary(num_trials = 10)

Results summary
Results in Hyperband\test_hyband_9
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x000001570F7AC970>
Trial summary
Hyperparameters:
num_layers: 4
units0: 25
activation: relu
lasso: 4.0
ridge: 4.0
units1: 35
Dropout: 0.30000000000000004
momentum: 0.30000000000000004
units2: 40
units3: 25
tuner/epochs: 38
tuner/initial_epoch: 13
tuner/bracket: 5
tuner/round: 2
tuner/trial_id: 1584
Score: 427372384.0
Trial summary
Hyperparameters:
num_layers: 4
units0: 30
activation: relu
lasso: 3.5
ridge: 2.0
units1: 20
Dropout: 0.1
momentum: 0.4
units2: 40
units3: 30
tuner/epochs: 334
tuner/initial_epoch: 112
tuner/bracket: 3
tuner/round: 2
tuner/trial_id: 1989
Score: 450141888.0
Trial summary
Hyperparameters:
num_layers: 4
units0: 30
activation: relu
lasso: 1.0
ridge: 0.0
units1: 40
Dropout: 0.2
momentum: 0.4
units2: 25
units3: 45
tuner/epochs: 112
tuner/initial_epoch: 38
tuner/bracket: 5
tuner/round: 3
tuner/trial_id: 1686
Score: 450942592.0
Trial summary
Hyp

In [27]:
# Pull the top-ranked model out of tuner_2 (returned as a one-element list)
# and print its layer-by-layer architecture
model_best_2 = tuner_2.get_best_models(num_models = 1)
best_net_2 = model_best_2[0]
best_net_2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 25)                13300     
                                                                 
 dense_1 (Dense)             (None, 35)                910       
                                                                 
 dense_2 (Dense)             (None, 40)                1440      
                                                                 
 dense_3 (Dense)             (None, 25)                1025      
                                                                 
 dropout (Dropout)           (None, 25)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 26        
                                                                 
Total params: 16,701
Trainable params: 16,701
Non-traina

In [4]:
# Persist the tuner_2 object to data/tuner_2.pkl using pickle
import pickle

with open('data/tuner_2.pkl', mode = 'wb') as tuner_file:
    pickle.dump(tuner_2, tuner_file)

In [16]:
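# Optional: reload the pickled tuner from data/tuner_2.pkl instead of re-running the search.
# Left disabled; only unpickle files you created yourself (pickle.load can execute code).
#with open('data/tuner_2.pkl', 'rb') as pickle_file:
    #tuner_2_reload = pickle.load(pickle_file)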

In [22]:
# Predicting on Test Data

# Best model from the second Hyperband search (one-element list)
model_best_2 = tuner_2.get_best_models(1)

# Clean the test data for the NN with the same recipe function
test_NN = recipe_NN(test_raw, rep = False)

# Match training and testing columns
# Columns that exist only in the cleaned test frame are dropped
train_drops = np.setdiff1d(test_NN.columns, train_NN_clean.columns)
# List form of the same names; unused in this cell, retained in case
# another cell reads it -- TODO confirm and delete if not
train_drop = train_drops.tolist()
X_test = test_NN.drop(columns = train_drops)

# Training features (factor levels) that never occur in the test data;
# kept as a diagnostic, the reindex below is what actually adds them
mis_levels = np.setdiff1d(X.columns, X_test.columns)

# Align to the training feature matrix in a single step: reindex keeps only
# the columns of X (so 'Id' and any other extras fall away), puts them in the
# order of X, and creates the missing factor-level columns filled with 0.
X_test = X_test.reindex(columns = X.columns, fill_value = 0)

NN_pred_tuned = model_best_2[0].predict(X_test)

Preprocessing Steps: 
1. Encoded user specified variables as categorical
2. Encoded user specified variables to be ordinal
3. Imputed missing values using knn with k = 5
4. Categorical columns were convert into n binary dummy variables
5. Numeric columns scaled to mean 0 and unit variance


In [23]:
# Submission frame: the test Ids next to the predictions in NN_pred_tuned
kaggle_NN_tuned = test_raw[['Id']].assign(SalePrice = NN_pred_tuned)

# Write without the index so the file holds only Id and SalePrice
kaggle_NN_tuned.to_csv("NN_Kaggle_Submission_final.csv", index = False)