#### Imports

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#### Load Data

In [2]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')


#### Data Cleaning

In [3]:
train = train.drop('Id', axis = 1)

In [4]:
print(train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

#### Data Preprocessing

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
def preprocess(dataframe, cat_columns, num_columns):
    encoder = OneHotEncoder(sparse_output = False,
                            handle_unknown = 'ignore')
    scaler = StandardScaler()

    #encode categorical columns
    encoded_cols = encoder.fit_transform(dataframe[cat_columns])
    encoded_df = pd.DataFrame(encoded_cols, columns = encoder.get_feature_names_out(cat_columns))
    dataframe = pd.concat([dataframe.reset_index(drop = True), encoded_df.reset_index(drop = True)], axis = 1)

    #scale numerical columns
    scaled_cols = scaler.fit_transform(dataframe[num_columns])
    scaled_df = pd.DataFrame(scaled_cols, columns = num_columns)
    dataframe = pd.concat([dataframe.reset_index(drop = True), scaled_df.reset_index(drop = True)], axis = 1)

    #drop original columns
    dataframe.drop(columns = cat_columns + num_columns, inplace = True)
    return dataframe

cat_cols = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
            'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st',
            'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
            'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
            'MoSold', 'YrSold', 'SaleCondition']
num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
            'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal']

train = preprocess(train, cat_cols, num_cols)
test = preprocess(test, cat_cols, num_cols)



In [6]:
print(train.columns)
print(test.columns)
print(train.shape)
print(test.shape)

Index(['SalePrice', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40',
       'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70',
       'MSSubClass_75', 'MSSubClass_80',
       ...
       'YrSold_2007', 'YrSold_2008', 'YrSold_2009', 'YrSold_2010',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=422)
Index(['Id', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40',
       'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70',
       'MSSubClass_75', 'MSSubClass_80',
       ...
       'YrSold_2007', 'YrSold_2008', 'YrSold_2009', 'YrSold_2010',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=413)
(1460, 422)
(1459, 413)


In [7]:
missing_cols = set(test.columns) - set(train.columns)
print(missing_cols)
print(len(missing_cols))

{'GarageYrBlt_1895.0', 'GarageCars_4.0', 'GarageCars_nan', 'Utilities_nan', 'GarageCars_3.0', 'Exterior2nd_nan', 'Functional_nan', 'Exterior1st_nan', 'GarageYrBlt_2207.0', 'GarageCars_5.0', 'GarageYrBlt_1943.0', 'KitchenQual_nan', 'MSZoning_nan', 'GarageCars_0.0', 'GarageCars_2.0', 'GarageYrBlt_1917.0', 'MSSubClass_150', 'SaleType_nan', 'GarageCars_1.0', 'GarageYrBlt_1896.0', 'Id', 'GarageYrBlt_1919.0'}
22


In [8]:
for col in missing_cols:
    train[col] = 0

missing_cols = set(test.columns) - set(train.columns)
print(len(missing_cols))

0


In [9]:
print(train.columns)

Index(['SalePrice', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40',
       'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70',
       'MSSubClass_75', 'MSSubClass_80',
       ...
       'MSZoning_nan', 'GarageCars_0.0', 'GarageCars_2.0',
       'GarageYrBlt_1917.0', 'MSSubClass_150', 'SaleType_nan',
       'GarageCars_1.0', 'GarageYrBlt_1896.0', 'Id', 'GarageYrBlt_1919.0'],
      dtype='object', length=444)


#### Modeling

In [10]:
from sklearn.model_selection import train_test_split
#create train test split for model
X = train.drop(columns = 'SalePrice')
y = train['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [14]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras import backend as K

# Define HyperModel
class HousePriceHyperModel(HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.Input(shape=(X_train.shape[1],)))
        model.add(layers.BatchNormalization())

        for i in range(hp.Int('num_layers', 1, 7)):
            model.add(layers.Dense(units=hp.Int('units_' + str(i), 
                                                min_value=256,
                                                max_value=1024,
                                                step=256),
                                   activation='relu',
                                   kernel_regularizer=regularizers.l2(hp.Float('l2_regularization', 
                                                                               min_value=1e-5, 
                                                                               max_value=1e-1, 
                                                                               sampling='LOG'))))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(hp.Float('dropout_' + str(i),
                                              min_value=0.2,
                                              max_value=0.5,
                                              step=0.1)))
        model.add(layers.Dense(1, activation='linear'))

        model.compile(
            optimizer=keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', 
                                       min_value=1e-5,
                                       max_value=1e-2,
                                       sampling='LOG')
            ),
            loss='mean_squared_error',
            metrics=[RootMeanSquaredError()]
        )
        return model
    
tuner = RandomSearch(
    HousePriceHyperModel(),
    objective = 'val_root_mean_squared_error',
    max_trials = 25,
    executions_per_trial = 5,
    directory = 'House Price Tuning',
    project_name = 'House Price Prediction'
)

MY_PATIENCE = 10
MY_EPOCHS = 50

tuner.search(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs = MY_EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(patience = MY_PATIENCE, restore_best_weights=True)]
)

best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparams = tuner.get_best_hyperparameters(num_trials=1)[0]

best_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error', RootMeanSquaredError()])
best_model.evaluate(X_valid, y_valid)

Trial 8 Complete [00h 01m 50s]
val_root_mean_squared_error: 199119.93125

Best val_root_mean_squared_error So Far: 198934.703125
Total elapsed time: 00h 18m 17s

Search: Running Trial #9

Value             |Best Value So Far |Hyperparameter
5                 |2                 |num_layers
1024              |1024              |units_0
4.6531e-05        |0.00010456        |l2_regularization
0.5               |0.4               |dropout_0
0.00080313        |0.0001636         |learning_rate
1024              |512               |units_1
0.3               |0.4               |dropout_1
768               |1024              |units_2
0.5               |0.3               |dropout_2
512               |1024              |units_3
0.5               |0.4               |dropout_3
1024              |1024              |units_4
0.3               |0.3               |dropout_4

Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - loss: 40866897920.0000 - root_mean_squared_e

#### Plot RMSE of Best Model

In [None]:
history = best_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs = MY_EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(patience = MY_PATIENCE, restore_best_weights=True)]
)

import matplotlib.pyplot as plt

plt.plot(history.history['root_mean_squared_error'], label='Train RMSE')
plt.plot(history.history['root_mean_squared_error'], label='Validation RMSE')
plt.xlabel('Epochs')
plt.ylabel('RMSE Score')
plt.legend()
plt.show()

#### Output Predictions

In [None]:
predictions = best_model.predict(test)

output_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})