#### Setup

In [None]:
%pip install -q numpy pandas matplotlib seaborn scikit-learn tensorflow keras

#### Imports

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#### Load Data

In [54]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')


#### Data Cleaning

In [55]:
train = train.drop('Id', axis = 1)

In [56]:
print(train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [57]:
missing_counts = train.isnull().sum()
columns_with_missing = missing_counts[missing_counts > 0].index
print(columns_with_missing)

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')


In [58]:
missing_counts = test.isnull().sum()
columns_with_missing = missing_counts[missing_counts > 0].index
print(columns_with_missing)

Index(['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
       'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType'],
      dtype='object')


In [59]:
#R_W null count
r_w_null = test.isnull().sum(axis = 1)
rows_with_nulls = r_w_null[r_w_null > 0]
print(rows_with_nulls)

0        5
1        4
2        4
3        4
4        6
        ..
1454    11
1455     6
1456     5
1457     9
1458     4
Length: 1459, dtype: int64


In [60]:
ids = test.pop('Id')


In [61]:
def handle_null_na (data):
    #train columns
    data['LotFrontage'] = data['LotFrontage'].fillna(0)
    data['Alley'] = data['Alley'].fillna('NA')
    data['MasVnrArea'] = data['MasVnrArea'].fillna(0)
    data['MasVnrType'] = data['MasVnrType'].fillna('None')
    data['BsmtQual'] = data['BsmtQual'].fillna('NA')
    data['BsmtCond'] = data['BsmtCond'].fillna('NA')
    data['BsmtExposure'] = data['BsmtExposure'].fillna('NA')
    data['BsmtFinType1'] = data['BsmtFinType1'].fillna('NA')
    data['BsmtFinType2'] = data['BsmtFinType2'].fillna('NA')
    data['Electrical'] = data['Electrical'].fillna('SBrkr')
    data['FireplaceQu'] = data['FireplaceQu'].fillna('NA')
    data['GarageType'] = data['GarageType'].fillna('NA')
    data['GarageYrBlt'] = data['GarageYrBlt'].fillna(-1)
    data['GarageFinish'] = data['GarageFinish'].fillna('NA')
    data['GarageQual'] = data['GarageQual'].fillna('NA')
    data['GarageCond'] = data['GarageCond'].fillna('NA')
    data['PoolQC'] = data['PoolQC'].fillna('NA')
    data['Fence'] = data['Fence'].fillna('NA')
    data['MiscFeature'] = data['MiscFeature'].fillna('NA')

    

    return data

train = handle_null_na(train)
test = handle_null_na(test)


In [62]:
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
22


Null/NA value fill Reasons:
Most of the missing values appeared to be because of mishandled NA or none qualifiers so I just manually filled missing with strings, additionally if a house didn't have a feature its area was left as NA and I chose to just make it 0 since area of 0 doesn't exist, or a year of -1.

#### Feature Engineering

In [63]:
def add_features(data):
    data['HouseAge'] = data['YrSold'] - data['YearBuilt']
    data['RemodAge'] = data.apply(lambda row: row['YrSold'] - row['YearRemodAdd'] if row['YearRemodAdd'] != 0 else 0, axis=1)
    data['GarageAge'] = data.apply(lambda row: row['YrSold'] - row['GarageYrBlt'] if row['GarageYrBlt'] != 0 else 0, axis=1)
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

    return data

train = add_features(train)
test = add_features(test)

#### Data Preprocessing

In [64]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
def preprocess(dataframe, cat_columns, num_columns):
    encoder = OneHotEncoder(sparse_output = False,
                            handle_unknown = 'ignore')
    scaler = StandardScaler()

    #encode categorical columns
    encoded_cols = encoder.fit_transform(dataframe[cat_columns])
    encoded_df = pd.DataFrame(encoded_cols, columns = encoder.get_feature_names_out(cat_columns))
    dataframe = pd.concat([dataframe.reset_index(drop = True), encoded_df.reset_index(drop = True)], axis = 1)

    #scale numerical columns
    scaled_cols = scaler.fit_transform(dataframe[num_columns])
    scaled_df = pd.DataFrame(scaled_cols, columns = num_columns)
    dataframe = pd.concat([dataframe.reset_index(drop = True), scaled_df.reset_index(drop = True)], axis = 1)

    #drop original columns
    dataframe.drop(columns = cat_columns + num_columns, inplace = True)
    return dataframe

cat_cols = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
            'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st',
            'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
            'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
            'MoSold', 'YrSold', 'SaleCondition']
num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
            'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal', 'GarageYrBlt', 'HouseAge', 'RemodAge', 'GarageAge', 'TotalSF']

processed_train = preprocess(train, cat_cols, num_cols)
processed_test = preprocess(test, cat_cols, num_cols)



In [67]:
for col in missing_cols:
    processed_train[col] = 0

missing_cols = set(processed_test.columns) - set(processed_train.columns)
print(len(missing_cols))

0


#### Modeling

In [68]:
from sklearn.model_selection import train_test_split
#create train test split for model
X = processed_train.drop(columns = 'SalePrice')
y = processed_train['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)

train_tensor = tf.convert_to_tensor(X_train, dtype = tf.float32)
valid_tensor = tf.convert_to_tensor(X_valid, dtype = tf.float32)

train_labels = tf.convert_to_tensor(y_train, dtype = tf.float32)
valid_labels = tf.convert_to_tensor(y_valid, dtype = tf.float32)

In [72]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from kerastuner import HyperModel
from kerastuner.tuners import Hyperband
from tensorflow.keras.metrics import RootMeanSquaredError

class HousePriceHyperModel(HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.Input(shape=(X_train.shape[1],)))
        model.add(layers.BatchNormalization())

        for i in range(hp.Int('num_layers', 1, 5)):
            units = hp.Int('units_' + str(i), 
                           min_value=32,
                           max_value=128,
                           step=32)
            # Define activation as a hyperparameter
            activation = hp.Choice('activation_' + str(i), values=['relu', 'tanh', 'sigmoid', 'elu', 'leaky_relu'])
            
            # Add regularization
            kernel_regularizer = regularizers.l2(hp.Float('l2_regularization_' + str(i), 
                                                          min_value=1e-7, 
                                                          max_value=1e-2, 
                                                          sampling='LOG'))

            model.add(layers.Dense(units=units,
                                   activation=activation,
                                   kernel_regularizer=kernel_regularizer))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(hp.Float('dropout_' + str(i),
                                              min_value=0.2,
                                              max_value=0.9,
                                              step=0.1)))
        model.add(layers.Dense(1, activation='linear'))

        model.compile(
            optimizer=keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', 
                                       min_value=1e-7,
                                       max_value=1e-3,
                                       sampling='LOG')
            ),
            loss='mse',
            metrics=['mean_absolute_error', RootMeanSquaredError()]
        )
        return model
    
tuner = Hyperband(
    HousePriceHyperModel(),
    objective = 'val_mean_absolute_error',
    max_epochs = 50,
    factor = 3,
    directory = 'House Price Tuning',
    project_name = 'House Price Prediction',
    overwrite = True
)

#********************#
MY_PATIENCE = 10
MY_EPOCHS = 250
MY_MIN_DELTA = 0.001
#********************#

tuner.search(
    X_train, y_train,
    validation_data = (X_valid, y_valid),
    epochs = MY_EPOCHS,
    callbacks = [keras.callbacks.EarlyStopping(patience=MY_PATIENCE, min_delta=MY_MIN_DELTA, restore_best_weights=True)],
    verbose = 1
)

best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparams = tuner.get_best_hyperparameters(num_trials=1)[0]

best_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error', RootMeanSquaredError()])
best_model.evaluate(X_valid, y_valid)

Trial 2 Complete [00h 00m 04s]
val_mean_absolute_error: 176594.65625

Best val_mean_absolute_error So Far: 176594.65625
Total elapsed time: 00h 00m 07s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
3                 |3                 |num_layers
128               |128               |units_0
elu               |relu              |activation_0
0.00013465        |1.7883e-07        |l2_regularization_0
0.2               |0.5               |dropout_0
0.00012902        |1.7691e-05        |learning_rate
96                |32                |units_1
relu              |relu              |activation_1
2.2683e-06        |1e-07             |l2_regularization_1
0.6               |0.2               |dropout_1
96                |32                |units_2
elu               |relu              |activation_2
1.0305e-06        |1e-07             |l2_regularization_2
0.4               |0.2               |dropout_2
2                 |2                 |tuner/epochs
0      

#### Plots of model info

In [69]:
history = best_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=MY_EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(patience=MY_PATIENCE, min_delta=MY_MIN_DELTA, restore_best_weights=True)]
)


sns.set_style('whitegrid')
sns.set_context('notebook')

plt.figure(figsize=(10, 6))
sns.lineplot(data=history.history['root_mean_squared_error'], label='Train RMSE', marker='o')
sns.lineplot(data=history.history['val_root_mean_squared_error'], label='Validation RMSE', marker='o')


plt.xlabel('Epochs', fontsize=14)
plt.ylabel('RMSE Score', fontsize=14)
plt.title('Training and Validation RMSE over Epochs', fontsize=16)
plt.legend(fontsize=12)
plt.show()

NameError: name 'best_model' is not defined

In [None]:
#Plot Training and validation loss
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
#last 10 epochs
last_10_epochs = range(len(history.history['loss']) - 10, len(history.history['loss']))
last_10_train_loss = history.history['loss'][-10:]
last_10_val_loss = history.history['val_loss'][-10:]
last_10_train_rmse = history.history['root_mean_squared_error'][-10:]
last_10_val_rmse = history.history['val_root_mean_squared_error'][-10:]

fig, ax1 = plt.subplots(figsize=(12, 6))

# Plotting loss
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.plot(last_10_epochs, last_10_train_loss, 'b-', label='Train Loss')
ax1.plot(last_10_epochs, last_10_val_loss, 'b--', label='Validation Loss')
ax1.tick_params(axis='y')
ax1.legend(loc='upper left')

# Creating a second y-axis for RMSE
ax2 = ax1.twinx()
ax2.set_ylabel('RMSE')
ax2.plot(last_10_epochs, last_10_train_rmse, 'r-', label='Train RMSE')
ax2.plot(last_10_epochs, last_10_val_rmse, 'r--', label='Validation RMSE')
ax2.tick_params(axis='y')
ax2.legend(loc='upper right')

fig.tight_layout()
plt.title('Loss and RMSE for the Last 10 Epochs')
plt.show()

In [None]:
#actual vs predicted values
y_preds = best_model.predict(X_valid)

plt.figure(figsize=(10, 6))
plt.scatter(y_valid, y_preds, alpha=0.5)
plt.plot([y_valid.min(), y_valid.max()], [y_valid.min(), y_valid.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted House Prices')
plt.show()

#### Output Predictions

In [None]:
# Verify the shape of X_train and test
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of test: {processed_test.shape}")

missing_features = set(X_train.columns) - set(processed_test.columns)
print(f"Missing features: {missing_features}")

for features in missing_features:
    processed_test[features] = 0

In [None]:
predictions = best_model.predict(processed_test)

predictions = predictions.flatten()
output_df = pd.DataFrame({'Id': ids, 'SalePrice': predictions})
output_df.to_csv('NN_predictions.csv', index=False)