#### Imports

In [14]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#### Load Data

In [15]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')


#### Data Cleaning

In [16]:
train = train.drop('Id', axis = 1)
test = test.drop('Id', axis = 1)

test['SalePrice'] = 0

In [17]:
print(train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

#### Data Preprocessing

In [18]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
def preprocess(dataframe, cat_columns, num_columns):
    encoder = OneHotEncoder(sparse_output = False,
                            handle_unknown = 'ignore')
    scaler = StandardScaler()

    #encode categorical columns
    encoded_cols = encoder.fit_transform(dataframe[cat_columns])
    encoded_df = pd.DataFrame(encoded_cols, columns = encoder.get_feature_names_out(cat_columns))
    dataframe = pd.concat([dataframe.reset_index(drop = True), encoded_df.reset_index(drop = True)], axis = 1)

    #scale numerical columns
    scaled_cols = scaler.fit_transform(dataframe[num_columns])
    scaled_df = pd.DataFrame(scaled_cols, columns = num_columns)
    dataframe = pd.concat([dataframe.reset_index(drop = True), scaled_df.reset_index(drop = True)], axis = 1)

    #drop original columns
    dataframe.drop(columns = cat_columns + num_columns, inplace = True)
    return dataframe

cat_cols = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
            'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st',
            'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
            'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
            'MoSold', 'YrSold', 'SaleCondition']
num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
            'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal']

train = preprocess(train, cat_cols, num_cols)
test = preprocess(test, cat_cols, num_cols)



In [19]:
print(train.columns)
print(test.columns)
print(train.shape)
print(test.shape)

Index(['SalePrice', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40',
       'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70',
       'MSSubClass_75', 'MSSubClass_80',
       ...
       'YrSold_2007', 'YrSold_2008', 'YrSold_2009', 'YrSold_2010',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=422)
Index(['SalePrice', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40',
       'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70',
       'MSSubClass_75', 'MSSubClass_80',
       ...
       'YrSold_2007', 'YrSold_2008', 'YrSold_2009', 'YrSold_2010',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=413)
(1460, 422)
(1459, 413)


In [20]:
missing_cols = set(test.columns) - set(train.columns)
print(missing_cols)
print(len(missing_cols))

{'GarageCars_2.0', 'GarageYrBlt_1896.0', 'SaleType_nan', 'GarageCars_0.0', 'GarageCars_5.0', 'Exterior1st_nan', 'GarageYrBlt_2207.0', 'GarageYrBlt_1943.0', 'GarageCars_nan', 'Functional_nan', 'KitchenQual_nan', 'GarageYrBlt_1895.0', 'GarageYrBlt_1919.0', 'GarageCars_1.0', 'Exterior2nd_nan', 'GarageCars_4.0', 'MSSubClass_150', 'Utilities_nan', 'GarageYrBlt_1917.0', 'MSZoning_nan', 'GarageCars_3.0'}
21


In [21]:
for col in missing_cols:
    train[col] = 0

missing_cols = set(test.columns) - set(train.columns)
print(len(missing_cols))

0


In [22]:
print(train.columns)

Index(['SalePrice', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40',
       'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70',
       'MSSubClass_75', 'MSSubClass_80',
       ...
       'GarageYrBlt_1895.0', 'GarageYrBlt_1919.0', 'GarageCars_1.0',
       'Exterior2nd_nan', 'GarageCars_4.0', 'MSSubClass_150', 'Utilities_nan',
       'GarageYrBlt_1917.0', 'MSZoning_nan', 'GarageCars_3.0'],
      dtype='object', length=443)


#### Modeling

In [23]:
from sklearn.model_selection import train_test_split
#create train test split for model
X = train.drop(columns = 'SalePrice')
y = train['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [42]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras import backend as K

def r2_score(y_true, y_pred):
    y_true = K.cast(y_true, K.floatx())
    y_pred = K.cast(y_pred, K.floatx())
    ss_res = K.sum(K.square(y_true - y_pred))
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - ss_res / (ss_tot + K.epsilon())

# Define HyperModel
class HousePriceHyperModel(HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.Input(shape=(X_train.shape[1],)))
        model.add(layers.BatchNormalization())

        for i in range(hp.Int('num_layers', 1, 10)):
            model.add(layers.Dense(units=hp.Int('units_' + str(i), 
                                                min_value=256,
                                                max_value=1024,
                                                step=256),
                                   activation='relu',
                                   kernel_regularizer=regularizers.l2(hp.Float('l2_regularization', 
                                                                               min_value=1e-5, 
                                                                               max_value=1e-2, 
                                                                               sampling='LOG'))))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(hp.Float('dropout_' + str(i),
                                              min_value=0.3,
                                              max_value=0.6,
                                              step=0.1)))
        model.add(layers.Dense(1, activation='linear'))

        model.compile(
            optimizer=keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', 
                                       min_value=1e-5,
                                       max_value=1e-3,
                                       sampling='LOG')
            ),
            loss='mean_squared_error',
            metrics=['mean_squared_error', RootMeanSquaredError(), r2_score]
        )
        return model
    
tuner = RandomSearch(
    HousePriceHyperModel(),
    objective='val_r2_score',
    max_trials=10,
    executions_per_trial=2,
    directory='House Price Tuning',
    project_name='House Price Prediction'
)

In [43]:
MY_PATIENCE = 10
MY_EPOCHS = 100

tuner.search(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs = MY_EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(patience = MY_PATIENCE, restore_best_weights=True)]
)

best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparams = tuner.get_best_hyperparameters(num_trials=1)[0]

best_model.compile(optimizer='adam', loss='mean_squared_error', metrics=[r2_score])
best_model.evaluate(X_valid, y_valid)

history = best_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs = MY_EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(patience = MY_PATIENCE, restore_best_weights=True)]
)

import matplotlib.pyplot as plt

plt.plot(history.history['r2_score'], label='Train R²')
plt.plot(history.history['val_r2_score'], label='Validation R²')
plt.xlabel('Epochs')
plt.ylabel('R² Score')
plt.legend()
plt.show()

Trial 1 Complete [00h 00m 45s]
val_r2_score: -5.736598968505859

Best val_r2_score So Far: -5.736598968505859
Total elapsed time: 00h 00m 45s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
5                 |3                 |num_layers
1024              |1024              |units_0
0.0027808         |0.003329          |l2_regularization
0.5               |0.5               |dropout_0
3.2257e-05        |1.6925e-05        |learning_rate
768               |256               |units_1
0.3               |0.3               |dropout_1
768               |256               |units_2
0.5               |0.3               |dropout_2

Epoch 1/100
