## House Price Predictor using Different Models
In this notebook, I will train several different models to predict house prices and compare their performance.

## Import Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn
from sklearn import metrics
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

## Import Datasets

In [None]:
# Read both competition splits from the Kaggle input directory in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)


## Common Functions

**Evaluation Function**

In [None]:
def evaluate(model, x_val, y_val):
    """Score a fitted regressor on a validation set, print and return the metrics.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(x_val)``.
    x_val : array-like
        Validation features, handed to ``model.predict`` unchanged.
    y_val : array-like
        True target values for ``x_val``.

    Returns
    -------
    dict
        Keys ``r2``, ``mse``, ``mae``, ``msle``, ``mape``, ``rmse`` and ``rmlse``.

    Notes
    -----
    NOTE(review): scikit-learn's ``mean_squared_log_error`` is expected to reject
    negative targets/predictions, so a model that predicts negative prices will
    make this function raise -- confirm against the installed version.
    """
    # Flatten to 1-D float64 up front. Some models return predictions shaped
    # (n, 1); against (n,) targets the TensorFlow-based metrics would broadcast
    # to (n, n) and silently report wrong numbers. A common dtype also keeps the
    # tensor arithmetic inside rmlse from failing on float32/float64 mixes.
    y_true = np.asarray(y_val, dtype=np.float64).reshape(-1)
    y_pred = np.asarray(model.predict(x_val), dtype=np.float64).reshape(-1)

    r2 = metrics.r2_score(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    msle = metrics.mean_squared_log_error(y_true, y_pred)
    mape = np.mean(tf.keras.metrics.mean_absolute_percentage_error(y_true, y_pred).numpy())
    rmse = np.sqrt(mse)
    rmlse_score = rmlse(y_true, y_pred).numpy()

    print("R2 Score:", r2)
    print("MSE:", mse)
    print("MAE:", mae)
    print("MSLE:", msle)
    print("MAPE", mape)
    print("RMSE:", rmse)
    print("RMLSE", rmlse_score)
    return {"r2": r2, "mse": mse, "mae": mae, "msle": msle, "mape": mape, "rmse": rmse, "rmlse": rmlse_score}

**Export Results**

In [None]:
def export_result(model, df, file_path, features = None):
    """Predict with ``model`` on ``df`` and write a two-column submission CSV.

    Parameters
    ----------
    model : object
        Anything exposing ``predict``.
    df : pandas.DataFrame
        Frame to predict on; must contain an ``"Id"`` column.
    file_path : str
        Destination of the CSV (written without the index).
    features : sequence of column labels, optional
        Columns of ``df`` fed to the model. When omitted, the whole frame
        (including ``"Id"``) is passed to ``model.predict``.
    """
    # Identity check rather than `== None`: comparing a list-like such as a
    # pandas Index or NumPy array with `==` is element-wise and makes the `if`
    # raise "truth value ... is ambiguous".
    x = df if features is None else df[features]
    # Models may return (n, 1) or (n,); the submission needs a flat column.
    predictions = np.asarray(model.predict(x)).reshape(-1)
    submission = pd.DataFrame({"Id": df["Id"], "SalePrice": predictions})
    submission.to_csv(file_path, index=False)

**Root Mean Squared Logarithmic Error**

In [None]:
def rmlse(y_true, y_pred):
    """Root of the mean squared difference between log(y_pred + 1) and log(y_true + 1).

    Built from TensorFlow ops, so it can be passed to Keras as a metric; the
    result is a tensor.
    """
    log_pred = tf.math.log(y_pred + 1)
    log_true = tf.math.log(y_true + 1)
    squared_log_error = tf.square(log_pred - log_true)
    return tf.sqrt(tf.reduce_mean(squared_log_error))

## Exploratory Data Analysis

**First 5 rows**

In [None]:
# Peek at the first five training rows.
train.head(n=5)

**Its shape**

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape[0], train.shape[1]

**Column types and summary statistics**

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

In [None]:
# Count the values of each categorical column on its own. Calling
# DataFrame.value_counts() on all object columns at once counts unique *rows*
# and, by default, discards every row that contains a NaN, which is not a
# useful summary. NaN is kept here as its own category.
categorical_columns = train.columns[train.dtypes == object]
pd.concat(
    {name: train[name].value_counts(dropna=False) for name in categorical_columns}
)

In [None]:
# Summary statistics of the training frame (pandas' default describe() output).
train.describe()

**Correlation scores**

In [None]:
# Correlation is only defined for numeric columns. Select them explicitly:
# recent pandas versions raise when corr() meets the string-valued columns
# instead of silently skipping them.
correlation_scores = train.select_dtypes(include="number").corr()
correlation_scores

**Features most strongly correlated with house price**

In [None]:
# Rank numeric features by the absolute value of their correlation with SalePrice.
# Numeric columns are selected explicitly because recent pandas versions raise
# when corr() is called on a frame that still contains string columns.
train.select_dtypes(include="number").corr()["SalePrice"].sort_values(key=abs, ascending=False)

## Data Cleaning

**Features that contain missing values**

In [None]:
# Number of missing entries per column; display only the columns that have any.
null_counts = train.isna().sum()
null_counts.loc[null_counts.gt(0)]

In [None]:
# Names of the columns that contain at least one missing value.
null_columns = null_counts[null_counts > 0].index.tolist()

**Data types of the features that have missing values**

In [None]:
# dtype of every column that has missing values.
train.dtypes.loc[null_columns]

**Solve Missing values for Training Set**

I will impute the missing values with the following strategies:
- For numerical columns, replace missing values with the column median.
- For categorical columns, replace missing values with "Unknown", which becomes a new category.

In [None]:
# Fill the gaps in every training column that has missing values:
# categorical columns get the new category "Unknown", numeric columns their median.
# fillna is used instead of replace(np.NAN, ...) because the np.NAN alias no
# longer exists in NumPy 2.x, and it works on the Series directly rather than
# assigning a one-column DataFrame back into a column.
for column in null_columns:
    if train[column].dtype == object:
        fill_value = "Unknown"
    else:
        fill_value = train[column].median()
    train[column] = train[column].fillna(fill_value)

**Impute missing values in the test set**

In [None]:
# Impute the test set so that it is encoded consistently with the training set.
null_counts = test.isnull().sum()
null_columns = null_counts[null_counts > 0].index.tolist()
for column in null_columns:
    if test[column].dtype == object:
        if (train[column] == "Unknown").any():
            # The training column had gaps too and they became "Unknown";
            # reuse that category so the same missingness maps to the same
            # one-hot feature in both sets.
            fill_value = "Unknown"
        else:
            # The training column was complete: fall back to its most frequent
            # value rather than inventing a category the models never saw.
            fill_value = train[column].mode()[0]
    else:
        # Use the training median so no test-set statistic leaks into preprocessing.
        fill_value = train[column].median()
    # fillna instead of replace(np.NAN, ...): the np.NAN alias is gone in NumPy 2.x.
    test[column] = test[column].fillna(fill_value)

**The training data now has no missing values**

In [None]:
# Print the non-null counts again to inspect the result of the imputation above.
train.info()

**Convert categorical features to one-hot vectors**

In [None]:
# One-hot encode train and test together so both end up with the same columns.
# dtype=float keeps the indicator columns numeric: newer pandas versions emit
# bool indicators by default, which leaves a mixed bool/float frame that is
# awkward to standardise and to hand to Keras.
train_test_dummied = pd.get_dummies(pd.concat([train, test]), dtype=float)

In [None]:
# First five rows of the combined, one-hot encoded frame.
train_test_dummied.head(n=5)

In [None]:
# Per-column mean and standard deviation of the combined encoded frame,
# used further down to standardise the features.
mean_value = train_test_dummied.mean()
print(mean_value)
std_value = train_test_dummied.std()
print(std_value)

In [None]:
# The target must not be standardised, so remove it from the scaling statistics.
# drop(..., errors="ignore") keeps this cell re-runnable: a second pop() would
# raise KeyError once the entry is gone.
mean_value = mean_value.drop("SalePrice", errors="ignore")
std_value = std_value.drop("SalePrice", errors="ignore")

In [None]:
# Split the combined encoded frame back into its train and test parts.
# The training rows come first because train was concatenated before test.
n_train_rows = len(train)
# .copy() / .drop() give independent frames, so later in-place edits
# (e.g. popping the target) never touch a slice of train_test_dummied.
train_dummied = train_test_dummied.iloc[:n_train_rows].copy()
# Test rows have no target; drop the all-NaN SalePrice column the concat created.
test_dummied = train_test_dummied.iloc[n_train_rows:].drop(columns="SalePrice")
train_dummied.head()

In [None]:
# First five rows of the encoded test frame.
test_dummied.head(n=5)

**Train Validation Split**

In [None]:
# Hold out 20% of the training rows for validation.
# A fixed seed makes the split -- and therefore the selected features and every
# reported score -- reproducible across runs; a freshly drawn random seed made
# results differ on every execution.
train_dummied, val_dummied = train_test_split(train_dummied, test_size=0.2, random_state=42)

**Calculate Correlated Features**

In [None]:
# Pairwise Pearson correlation (the pandas default) between all encoded training columns.
train_dummied.corr(method="pearson")

In [None]:
# Keep the features whose absolute correlation with SalePrice exceeds 0.2.
saleprice_corr = train_dummied.corr()["SalePrice"]
correlated_scores = saleprice_corr[saleprice_corr.abs() > 0.2]
# The target correlates perfectly with itself; it is not a feature.
correlated_features = [name for name in correlated_scores.index if name != "SalePrice"]
correlated_features

In [None]:
# Detach the target column from both splits; what remains are the feature frames.
y_train, y_val = train_dummied.pop("SalePrice"), val_dummied.pop("SalePrice")
x_train_dummied, x_val_dummied = train_dummied, val_dummied

## Model Development and Evaluation

### Train Linear Regression Model with features that are correlated with house price

In [None]:
# Ordinary least squares restricted to the correlated features, scored on the validation split.
linear_regression_2 = LinearRegression().fit(
    x_train_dummied[correlated_features], y_train
)
linear_regression_2_results = evaluate(
    linear_regression_2,
    x_val_dummied[correlated_features],
    y_val,
)

In [None]:
# Write the linear-regression predictions for the test set.
export_result(
    model=linear_regression_2,
    df=test_dummied,
    file_path="submission_linear_regression_2.csv",
    features=correlated_features,
)

### Train Ridge Model with all features

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (default settings) on every encoded column, scored on the validation split.
ridge1 = Ridge().fit(x_train_dummied, y_train)
ridge1_results = evaluate(ridge1, x_val=x_val_dummied, y_val=y_val)

In [None]:
# Write the all-features ridge predictions for the test set.
export_result(model=ridge1, df=test_dummied, file_path="submission_ridge1.csv")

### Train Ridge Model with correlated features

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (default settings) restricted to the correlated features.
ridge2 = Ridge().fit(x_train_dummied[correlated_features], y_train)
ridge2_results = evaluate(
    ridge2,
    x_val=x_val_dummied[correlated_features],
    y_val=y_val,
)

In [None]:
# Write the correlated-features ridge predictions for the test set.
export_result(
    model=ridge2,
    df=test_dummied,
    file_path="submission_ridge2.csv",
    features=correlated_features,
)

### Train Deep Neural Network Model with all features

In [None]:
# Two-hidden-layer network on all encoded columns, trained on MSE and
# early-stopped / checkpointed on the validation RMLSE.
neural_network_model_1 = tf.keras.Sequential([
    # shape must be a tuple: `(n)` is just the int n, which newer Keras
    # versions refuse to interpret as a shape.
    tf.keras.Input(shape=(x_train_dummied.shape[1],)),
    tf.keras.layers.Dense(32, activation="relu", kernel_regularizer=tf.keras.regularizers.l2()),
    tf.keras.layers.Dense(32, activation="relu", kernel_regularizer=tf.keras.regularizers.l2()),
    tf.keras.layers.Dense(1)
])
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_rmlse", patience=20)
checkpoint_path = "neural_network_model_1.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor="val_rmlse", save_best_only=True)
neural_network_model_1.compile(loss="mse", optimizer="adam", metrics=[
    "mse", "mae", "mape", rmlse
])
history = neural_network_model_1.fit(
    x_train_dummied, y_train, epochs=100, 
    validation_data=(x_val_dummied, y_val),
    callbacks=[early_stop, checkpoint],
    verbose=2
)
# Training ends up to `patience` epochs after the best one, so the in-memory
# weights are not the checkpointed ones. Restore the best checkpoint before the
# model is used for predictions. The existence check covers the case where no
# epoch produced a usable val_rmlse and nothing was written.
if tf.io.gfile.exists(checkpoint_path):
    neural_network_model_1.load_weights(checkpoint_path)

In [None]:
# Training vs. validation loss per epoch.
loss_curves = pd.DataFrame(history.history, columns=["loss", "val_loss"])
loss_curves.plot()

In [None]:
# Training vs. validation mean absolute error per epoch.
mae_curves = pd.DataFrame(history.history, columns=["mae", "val_mae"])
mae_curves.plot()

In [None]:
# Training vs. validation RMLSE per epoch.
rmlse_curves = pd.DataFrame(history.history, columns=["rmlse", "val_rmlse"])
rmlse_curves.plot()

In [None]:
# Write the all-features network's predictions for the test set.
export_result(
    model=neural_network_model_1,
    df=test_dummied,
    file_path="submission_neural_network_model_1.csv",
)

### Train Deep Neural Network Model with correlated features

In [None]:
# Same architecture as the all-features network, restricted to the correlated
# features; early stopping and checkpointing use their default monitored quantity.
x_train_subset = x_train_dummied[correlated_features]
x_val_subset = x_val_dummied[correlated_features]
neural_network_model_2 = tf.keras.Sequential([
    # shape must be a tuple: `(n)` is just the int n, which newer Keras
    # versions refuse to interpret as a shape.
    tf.keras.Input(shape=(x_train_subset.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2()),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2()),
    tf.keras.layers.Dense(1)
])
early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
neural_network_model_2_checkpoint = "neural_network_model_2.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(neural_network_model_2_checkpoint, save_best_only=True)
neural_network_model_2.compile(loss="mse", optimizer="adam", metrics=[
    "mse", "mae", "mape", rmlse
])
history = neural_network_model_2.fit(
    x_train_subset, y_train, epochs=100, 
    validation_data=(x_val_subset, y_val),
    callbacks=[early_stop, checkpoint]
)

**Train Deep Neural Network Model with scaled correlated features**

In [None]:
# Standardise the correlated features of every split with the same
# per-column mean and standard deviation.
feature_mean = mean_value[correlated_features]
feature_std = std_value[correlated_features]

x_train_subset = x_train_dummied[correlated_features]
x_val_subset = x_val_dummied[correlated_features]
x_train_subset_scaled = x_train_subset.sub(feature_mean).div(feature_std)
x_val_subset_scaled = x_val_subset.sub(feature_mean).div(feature_std)
test_dummied_scaled = test_dummied[correlated_features].sub(feature_mean).div(feature_std)

In [None]:
# One row per network configuration to try. The val_* columns start at 0.0
# and are placeholders for the validation scores recorded during training.
_dnn_configs = [
    # (layers, bottom size, growth, dropout, regularizer, model file, csv file)
    (4, 2, "double", 0.5, "l1", "dnn1.h5", "dnn1.csv"),
    (5, 8, "same", 0.3, "l2", "dnn2.h5", "dnn2.csv"),
    (5, 2, "double", 0.4, "l1_l2", "dnn3.h5", "dnn3.csv"),
    (5, 16, "same", 0.2, "l2", "dnn4.h5", "dnn4.csv"),
]
model_dataframe = pd.DataFrame([
    {
        "num_hidden_layers": n_layers,
        "bottom_hidden_layer_size": bottom_size,
        "hidden_layer_size_growth_strategy": growth,
        "hidden_layer_activation": "relu",
        "hidden_layer_dropout": dropout,
        "kernel_regularizer": regularizer,
        "model_path": model_file,
        "csv_path": csv_file,
        "val_mse": 0.0,
        "val_mae": 0.0,
        "val_mape": 0.0,
        "val_rmlse": 0.0,
    }
    for n_layers, bottom_size, growth, dropout, regularizer, model_file, csv_file in _dnn_configs
])

In [None]:
# Show the configurations that will be trained.
model_dataframe.head(n=5)

In [None]:
import matplotlib.pyplot as plt
def _hidden_layer_sizes(strategy, bottom_size, num_layers):
    """Return the hidden-layer widths, in network order, for one configuration."""
    bottom_size, num_layers = int(bottom_size), int(num_layers)
    if strategy == "same":
        return [bottom_size] * num_layers
    if strategy == "double":
        # Widest layer first, halving down to bottom_size just before the output.
        return [bottom_size * 2 ** power for power in reversed(range(num_layers))]
    raise ValueError(f"Unknown hidden_layer_size_growth_strategy: {strategy!r}")


def train_model(
    x_train,
    y_train,
    x_val,
    y_val,
    test, 
    Id,
    df
):
    """Train one network per row of ``df``, record its scores and write its submission.

    Parameters
    ----------
    x_train, y_train : training features and targets.
    x_val, y_val : validation features and targets.
    test : features to predict on for the submission file.
    Id : values written to the ``"Id"`` column of each submission.
    df : pandas.DataFrame
        One configuration per row, with columns ``num_hidden_layers``,
        ``bottom_hidden_layer_size``, ``hidden_layer_size_growth_strategy``
        (``"same"`` or ``"double"``), ``hidden_layer_activation``,
        ``hidden_layer_dropout``, ``kernel_regularizer``, ``model_path`` and
        ``csv_path``. Its ``val_mse``, ``val_mae``, ``val_mape`` and
        ``val_rmlse`` columns are overwritten in place.

    Raises
    ------
    ValueError
        If a row names an unknown growth strategy.

    For every row the function prints the model path and validation scores,
    plots the training curves, saves the best checkpoint to ``model_path`` and
    writes the test predictions to ``csv_path``. It returns ``None``.
    """
    regularizer_factories = {
        "l1": tf.keras.regularizers.l1,
        "l2": tf.keras.regularizers.l2,
        "l1_l2": tf.keras.regularizers.l1_l2,
    }
    # Named so it does not shadow the sklearn `metrics` module imported by the notebook.
    metric_names = ["mse", "mae", "mape", "rmlse"]

    # Iterate over index labels so a sorted or filtered configuration frame also works.
    for i in df.index:
        # The layer count comes from the row; it used to be fixed at 4 for the
        # "double" strategy regardless of num_hidden_layers.
        hidden_layers = _hidden_layer_sizes(
            df.loc[i, "hidden_layer_size_growth_strategy"],
            df.loc[i, "bottom_hidden_layer_size"],
            df.loc[i, "num_hidden_layers"],
        )
        hidden_layers_activation = df.loc[i, "hidden_layer_activation"]
        hidden_layer_dropout = float(df.loc[i, "hidden_layer_dropout"])
        model_path = df.loc[i, "model_path"]
        print(model_path)
        csv_path = df.loc[i, "csv_path"]
        # Known names become regularizer objects; anything else is handed to Keras as given.
        kernel_regularizer = df.loc[i, "kernel_regularizer"]
        if kernel_regularizer in regularizer_factories:
            kernel_regularizer = regularizer_factories[kernel_regularizer]()

        tf.keras.backend.clear_session()
        model = tf.keras.Sequential()
        # shape must be a tuple; `(n)` is just the int n.
        model.add(tf.keras.Input(shape=(x_train.shape[1],)))
        for units in hidden_layers:
            model.add(
                tf.keras.layers.Dense(
                    units,
                    activation=hidden_layers_activation,
                    kernel_regularizer=kernel_regularizer
                )
            )
            if hidden_layer_dropout != 0:
                model.add(tf.keras.layers.Dropout(hidden_layer_dropout))
        model.add(tf.keras.layers.Dense(1))

        early_stop = tf.keras.callbacks.EarlyStopping(patience=10)
        checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)
        model.compile(loss="mse", optimizer="adam", metrics=[
            "mse", "mae", "mape", rmlse
        ])
        history = model.fit(
            x_train, y_train, epochs=100, 
            validation_data=(x_val, y_val),
            callbacks=[early_stop, checkpoint],
            verbose=0
        )
        model.load_weights(model_path)

        # The checkpoint is chosen by the callbacks' default monitor (val_loss),
        # so report the scores of that same epoch; otherwise the recorded
        # metrics could describe weights other than the ones just restored.
        best_index = int(np.argmin(history.history["val_loss"]))
        for metric in metric_names:
            val_metric = "val_" + metric
            best_value = history.history[val_metric][best_index]
            df.loc[i, val_metric] = best_value
            print(val_metric, best_value)
            pd.DataFrame(history.history, columns=[metric, val_metric]).plot()
            plt.show()

        SalePrice = model.predict(test)
        submission = pd.DataFrame({"Id": Id, "SalePrice": SalePrice.reshape(-1)})
        submission.to_csv(csv_path, index=False)

In [None]:
# Train every configured network on the standardised correlated features.
train_model(
    x_train=x_train_subset_scaled,
    y_train=y_train,
    x_val=x_val_subset_scaled,
    y_val=y_val,
    test=test_dummied_scaled,
    Id=test["Id"],
    df=model_dataframe,
)

## Best DNN Model

In [None]:
# Configurations together with the validation scores filled in by training.
model_dataframe.head(n=5)

In [None]:
# Rank the configurations from lowest to highest validation RMLSE.
model_dataframe = model_dataframe.sort_values(by="val_rmlse", ascending=True)
score_columns = ["model_path", "val_mse", "val_mae", "val_mape", "val_rmlse"]
model_dataframe[score_columns].head()

In [None]:
# Publish the predictions of the best-scoring network as the final submission.
# Select by position after sorting: `.loc[0, ...]` looks up the row *labelled* 0,
# which is always the first configuration no matter how the frame is ordered.
best_csv_path = model_dataframe.sort_values(by="val_rmlse").iloc[0]["csv_path"]
submission = pd.read_csv(best_csv_path)
submission.to_csv("submission.csv", index=False)