<a href="https://colab.research.google.com/github/TimH2024/MSC-M5-Project/blob/main/5_Std_dataset_Hyper_Parameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5. Hyper Parameter Tuning Results

# INSTALL LIBRARIES

In [None]:
pip install tensorflow keras keras-tuner numpy pandas scikit-learn



In [None]:
# Core libraries
import numpy as np
import pandas as pd

# Machine learning and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# TensorFlow/Keras for deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Keras Tuner for hyperparameter tuning
from keras_tuner import HyperModel
from keras_tuner.tuners import RandomSearch

# LOAD FILES

In [36]:
# Define file path for 'master_resultsStd.csv'
master_results_file_path = os.path.join(subdirectory, 'master_resultsStd.csv')

# Check if the file exists and load it
if os.path.exists(master_results_file_path):
    master_resultsStd = pd.read_csv(master_results_file_path)
    print(f"File 'master_resultsStd.csv' loaded successfully!")
    print(master_resultsStd)  # Display the contents of the DataFrame
else:
    print(f"File 'master_resultsStd.csv' not found in '{subdirectory}'. Please check the file path.")

File 'master_resultsStd.csv' loaded successfully!
       Target                        Model  Train R²   Test R²           MAE  \
0   new_price            Linear Regression  0.963865  0.963759  1.388949e-01   
1   new_price      Random Forest Regressor  0.999990  0.999926  4.155748e-03   
2   new_price            XGBoost Regressor  0.999799  0.999632  1.008896e-02   
3   new_price  Gradient Boosting Regressor  0.999693  0.999640  1.437758e-02   
4          PC            Linear Regression  0.973660  0.974082  1.317771e-01   
5          PC      Random Forest Regressor  1.000000  1.000000  1.665155e-14   
6          PC            XGBoost Regressor  0.999957  0.999957  6.098938e-03   
7          PC  Gradient Boosting Regressor  1.000000  1.000000  1.221103e-08   
8   new_price                    Base LSTM  0.996020  0.995922  1.025696e+05   
9          PC                    Base LSTM  0.998868  0.998890  5.939950e-01   
10  new_price                Enhanced LSTM  0.996919  0.996889  9.0122

In [None]:
master_resultsStd.head(20)

Unnamed: 0,Target,Model,Train R²,Test R²,MAE,MSE,RMSE,MAPE,Comments
0,new_price,Linear Regression,0.963865,0.963759,0.1388949,0.03641067,0.1908158,68.35787,Normal
1,new_price,Random Forest Regressor,0.99999,0.999926,0.004155748,7.470913e-05,0.008643444,2.233156,Normal
2,new_price,XGBoost Regressor,0.999799,0.999632,0.01008896,0.0003698089,0.01923042,3.287954,Normal
3,new_price,Gradient Boosting Regressor,0.999693,0.99964,0.01437758,0.0003613782,0.01900995,10.26344,Normal
4,PC,Linear Regression,0.97366,0.974082,0.1317771,0.02591456,0.16098,29.44177,Normal
5,PC,Random Forest Regressor,1.0,1.0,1.665155e-14,4.991350000000001e-28,2.234133e-14,1.707175e-12,Normal
6,PC,XGBoost Regressor,0.999957,0.999957,0.006098938,4.334612e-05,0.006583777,0.6560287,Normal
7,PC,Gradient Boosting Regressor,1.0,1.0,1.221103e-08,2.160188e-16,1.469758e-08,3.735057e-06,Normal


In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define the file path in Google Drive
file_path = '/content/drive/My Drive/Colab Notebooks/M5 Code and Data/Std_Datasetv5.csv'

# Read the file into a DataFrame
try:
    merged_dataset = pd.read_csv(file_path)
    print("[INFO] File loaded successfully into 'merged_dataset'.")
    print(merged_dataset.head())  # Display the first 5 rows
except FileNotFoundError:
    print(f"[ERROR] File not found at '{file_path}'. Please check the file path and try again.")
except Exception as e:
    print(f"[ERROR] An error occurred while reading the file: {e}")

Mounted at /content/drive
[INFO] File loaded successfully into 'merged_dataset'.
   saleable_area(ft^2)  unit_rate  floor  CPI   PC  IR  MW  M3  SD  UR  ...  \
0                  423      15792   22.0    0  351   3  18  17   0   3  ...   
1                  761      15857   12.0    0  351   3  18  17   0   3  ...   
2                  320      13717   12.0    0  351   3  18  17   0   3  ...   
3                  519      16541   29.0    0  351   3  18  17   0   3  ...   
4                  699      14721   24.0    0  351   3  18  17   0   3  ...   

   floor_height_floor_16to18  floor_height_floor_19to20  \
0                          0                          0   
1                          0                          0   
2                          0                          0   
3                          0                          0   
4                          0                          0   

   floor_height_floor_21to25  floor_height_floor_26to30  \
0                          1  

In [34]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72663 entries, 0 to 72662
Data columns (total 71 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   saleable_area(ft^2)                             72663 non-null  int64  
 1   unit_rate                                       72663 non-null  int64  
 2   floor                                           72663 non-null  float64
 3   CPI                                             72663 non-null  int64  
 4   PC                                              72663 non-null  int64  
 5   IR                                              72663 non-null  int64  
 6   MW                                              72663 non-null  int64  
 7   M3                                              72663 non-null  int64  
 8   SD                                              72663 non-null  int64  
 9   UR                                     

# HYPERPARAMETER TUNING

In [None]:
# Define the feature matrix (X) and target variables (y)

features = merged_dataset.drop(columns=['new_price', 'PC'])
targets_Std = merged_dataset[['new_price', 'PC']]

X = features.copy()
y = targets_Std.copy()

In [35]:
import os
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from keras_tuner import RandomSearch
import numpy as np
import tensorflow as tf

# File paths
subdirectory = '/content/drive/My Drive/Colab Notebooks/M5 Code and Data'
master_results_file_path = os.path.join(subdirectory, 'master_resultsStd.csv')

# Load or initialize master results table
if os.path.exists(master_results_file_path):
    master_resultsStd = pd.read_csv(master_results_file_path)
    print(f"[INFO] Loaded existing master results table from '{master_results_file_path}'.")
else:
    master_resultsStd = pd.DataFrame(columns=["Target", "Model", "Type", "Train R²", "Test R²", "MAE", "MSE", "RMSE", "MAPE", "Comments"])
    print(f"[INFO] Initialized a new master results table.")

# Define features and target variables
features = merged_dataset.drop(columns=['new_price', 'PC'])
targets_Std = merged_dataset[['new_price', 'PC']]
X = features.copy()
y = targets_Std.copy()

# Normalize features and targets
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

target_scalers = {col: StandardScaler() for col in targets_Std.columns}
y_scaled = pd.DataFrame({col: target_scalers[col].fit_transform(targets_Std[[col]]).flatten() for col in targets_Std.columns})

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Reshape for LSTM
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Subset the dataset for faster tuning
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42)
X_train_sample_lstm = X_train_sample.reshape((X_train_sample.shape[0], 1, X_train_sample.shape[1]))

# Define the model-building function for Keras Tuner
def build_model(hp):
    model = Sequential()
    num_layers = hp.Int('num_layers', min_value=1, max_value=3, step=1)

    for i in range(num_layers):
        return_sequences = i < (num_layers - 1)
        model.add(LSTM(
            units=hp.Int(f'units_layer_{i+1}', min_value=32, max_value=128, step=32),
            activation='relu',
            return_sequences=return_sequences,
            input_shape=(X_train_sample_lstm.shape[1], X_train_sample_lstm.shape[2]) if i == 0 else None
        ))
        model.add(Dropout(rate=hp.Choice(f'dropout_layer_{i+1}', values=[0.2, 0.3, 0.4])))

    model.add(Dense(1))
    model.compile(optimizer=tf.keras.optimizers.Adam(
        learning_rate=hp.Choice('learning_rate', values=[1e-3, 1e-4])
    ), loss='mse')
    return model

# Initialize tuner
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5,
    executions_per_trial=1,
    directory='hyperparam_tuning',
    project_name='lstm_hyper_tuning'
)

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Loop through targets (new_price and PC) to dynamically update results
for target_name in ["new_price", "PC"]:
    if target_name not in y_train.columns:
        raise KeyError(f"Target column '{target_name}' not found in y_train. Available columns are: {y_train.columns.tolist()}")

    # Train and evaluate the model for the current target
    y_train_target_sample = y_train_sample[[target_name]].values

    # Hyperparameter tuning
    tuner.search(
        X_train_sample_lstm,
        y_train_target_sample,
        validation_split=0.1,
        epochs=20,
        batch_size=64,
        callbacks=[early_stopping]
    )

    # Build and train the best model
    best_hps = tuner.get_best_hyperparameters(1)[0]
    best_model = tuner.hypermodel.build(best_hps)
    history = best_model.fit(
        X_train_lstm,
        y_train[[target_name]].values,
        validation_data=(X_test_lstm, y_test[[target_name]].values),
        epochs=20,
        batch_size=64,
        callbacks=[early_stopping]
    )

    # Evaluate the model
    train_predictions = best_model.predict(X_train_lstm)
    test_predictions = best_model.predict(X_test_lstm)

    scaler_y = target_scalers[target_name]
    train_predictions_rescaled = scaler_y.inverse_transform(train_predictions)
    test_predictions_rescaled = scaler_y.inverse_transform(test_predictions)

    y_train_rescaled = scaler_y.inverse_transform(y_train[[target_name]])
    y_test_rescaled = scaler_y.inverse_transform(y_test[[target_name]])

    train_r2 = r2_score(y_train_rescaled, train_predictions_rescaled)
    test_r2 = r2_score(y_test_rescaled, test_predictions_rescaled)
    mae = mean_absolute_error(y_test_rescaled, test_predictions_rescaled)
    mse = mean_squared_error(y_test_rescaled, test_predictions_rescaled)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test_rescaled - test_predictions_rescaled) / y_test_rescaled)) * 100

    # Add results to the master results table
    new_results = {
        "Target": target_name,
        "Model": "Hyper-Tuned LSTM",
        "Type": "Neural Network",
        "Train R²": train_r2,
        "Test R²": test_r2,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "MAPE": mape,
        "Comments": "Working Well" if test_r2 > 0.75 and mape < 10 else "Needs Improvement"
    }

    # Convert new_results to a DataFrame
    new_results_df = pd.DataFrame([new_results])

    # Check if the target and model already exist in the master results table
    existing_row_index = master_resultsStd[
        (master_resultsStd["Target"] == new_results["Target"]) & (master_resultsStd["Model"] == new_results["Model"])
    ].index

    if len(existing_row_index) > 0:
        # Update the existing row by replacing its values
        master_resultsStd.loc[existing_row_index, :] = new_results_df.iloc[0].values
    else:
        # Append the new results
        master_resultsStd = pd.concat([master_resultsStd, new_results_df], ignore_index=True)

# Save updated results
master_resultsStd.to_csv(master_results_file_path, index=False)
print(f"[INFO] Updated master results saved to '{master_results_file_path}'.")

# Display the final results table
print("\nFinal Master Results Table:")
print(master_resultsStd.to_string(index=False))

[INFO] Loaded existing master results table from '/content/drive/My Drive/Colab Notebooks/M5 Code and Data/master_resultsStd.csv'.
Reloading Tuner from hyperparam_tuning/lstm_hyper_tuning/tuner0.json


  super().__init__(**kwargs)


Epoch 1/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.2868 - val_loss: 0.0118
Epoch 2/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - loss: 0.0315 - val_loss: 0.0041
Epoch 3/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 0.0216 - val_loss: 0.0031
Epoch 4/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - loss: 0.0187 - val_loss: 0.0032
Epoch 5/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 0.0170 - val_loss: 0.0042
Epoch 6/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - loss: 0.0162 - val_loss: 0.0047
[1m1817/1817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Epoch 1/20


  super().__init__(**kwargs)


[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.1729 - val_loss: 0.0030
Epoch 2/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - loss: 0.0139 - val_loss: 0.0016
Epoch 3/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.0112 - val_loss: 5.8642e-04
Epoch 4/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - loss: 0.0104 - val_loss: 8.3822e-04
Epoch 5/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - loss: 0.0098 - val_loss: 5.6581e-04
Epoch 6/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - loss: 0.0099 - val_loss: 6.9066e-04
Epoch 7/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - loss: 0.0097 - val_loss: 4.3088e-04
Epoch 8/20
[1m909/909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - loss: 0.0096 - val_loss: 5.9018e-04
Epoch 9/20
[1m909/9

# BEST HYPER TUNED ARCHITECTURE

In [37]:
# Display the best architecture from the hyperparameter tuner
print("[INFO] Best Hyperparameters for the Tuned LSTM Model:")
print(f"Number of Layers: {best_hps.get('num_layers')}")

for i in range(best_hps.get('num_layers')):
    units = best_hps.get(f'units_layer_{i+1}')
    dropout = best_hps.get(f'dropout_layer_{i+1}')
    print(f"Layer {i+1}: {units} units, Dropout rate: {dropout}")

learning_rate = best_hps.get('learning_rate')
print(f"Learning Rate: {learning_rate}")

# Alternatively, show a summary of the best model architecture
print("\n[INFO] Best Model Architecture:")
best_model.summary()

[INFO] Best Hyperparameters for the Tuned LSTM Model:
Number of Layers: 2
Layer 1: 96 units, Dropout rate: 0.3
Layer 2: 64 units, Dropout rate: 0.2
Learning Rate: 0.001

[INFO] Best Model Architecture:


# SAVING THE RESULTS

In [38]:
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define the subdirectory path
subdirectory = '/content/drive/My Drive/Colab Notebooks/M5 Code and Data'

# Ensure the subdirectory exists
os.makedirs(subdirectory, exist_ok=True)

# Define file path for saving the DataFrame as 'Std_Datasetv6.csv'
file_path = os.path.join(subdirectory, 'Std_Datasetv6.csv')

# Save the DataFrame to the file path
try:
    merged_dataset.to_csv(file_path, index=False)
    print(f"[INFO] File 'Std_Datasetv6.csv' saved successfully at '{file_path}'.")
except NameError:
    print("[ERROR] The DataFrame 'merged_dataset' does not exist. Please define it before saving.")
except Exception as e:
    print(f"[ERROR] An error occurred while saving the file: {e}")

Mounted at /content/drive
[INFO] File 'Std_Datasetv6.csv' saved successfully at '/content/drive/My Drive/Colab Notebooks/M5 Code and Data/Std_Datasetv6.csv'.


In [39]:
print(master_resultsStd.head(20))
print(master_resultsStd.shape)

       Target                        Model  Train R²   Test R²           MAE  \
0   new_price            Linear Regression  0.963865  0.963759  1.388949e-01   
1   new_price      Random Forest Regressor  0.999990  0.999926  4.155748e-03   
2   new_price            XGBoost Regressor  0.999799  0.999632  1.008896e-02   
3   new_price  Gradient Boosting Regressor  0.999693  0.999640  1.437758e-02   
4          PC            Linear Regression  0.973660  0.974082  1.317771e-01   
5          PC      Random Forest Regressor  1.000000  1.000000  1.665155e-14   
6          PC            XGBoost Regressor  0.999957  0.999957  6.098938e-03   
7          PC  Gradient Boosting Regressor  1.000000  1.000000  1.221103e-08   
8   new_price                    Base LSTM  0.996020  0.995922  1.025696e+05   
9          PC                    Base LSTM  0.998868  0.998890  5.939950e-01   
10  new_price                Enhanced LSTM  0.996919  0.996889  9.012282e+04   
11         PC                Enhanced LS

In [40]:
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define the subdirectory path
subdirectory = '/content/drive/My Drive/Colab Notebooks/M5 Code and Data'

# Ensure the subdirectory exists
os.makedirs(subdirectory, exist_ok=True)

# Define file path for saving the DataFrame
file_path = os.path.join(subdirectory, 'master_resultsStd.csv')  # Change the filename

# Save the DataFrame to the file path
try:
    master_resultsStd.to_csv(file_path, index=False)
    print(f"[INFO] File 'master_resultsStd.csv' saved successfully at '{file_path}'.")
except NameError:
    print("[ERROR] The DataFrame 'master_resultsStd' does not exist. Please define it before saving.")
except Exception as e:
    print(f"[ERROR] An error occurred while saving the file: {e}")

Mounted at /content/drive
[INFO] File 'master_resultsStd.csv' saved successfully at '/content/drive/My Drive/Colab Notebooks/M5 Code and Data/master_resultsStd.csv'.


In [41]:
merged_dataset.shape

(72663, 71)

In [42]:
master_resultsStd.head(20)

Unnamed: 0,Target,Model,Train R²,Test R²,MAE,MSE,RMSE,MAPE,Comments,Type
0,new_price,Linear Regression,0.963865,0.963759,0.1388949,0.03641067,0.1908158,68.35787,Normal,Simple Linear Regression
1,new_price,Random Forest Regressor,0.99999,0.999926,0.004155748,7.470913e-05,0.008643444,2.233156,Normal,Decision Tree Model
2,new_price,XGBoost Regressor,0.999799,0.999632,0.01008896,0.0003698089,0.01923042,3.287954,Normal,Decision Tree Model
3,new_price,Gradient Boosting Regressor,0.999693,0.99964,0.01437758,0.0003613782,0.01900995,10.26344,Normal,Decision Tree Model
4,PC,Linear Regression,0.97366,0.974082,0.1317771,0.02591456,0.16098,29.44177,Normal,Simple Linear Regression
5,PC,Random Forest Regressor,1.0,1.0,1.665155e-14,4.991350000000001e-28,2.234133e-14,1.707175e-12,Normal,Decision Tree Model
6,PC,XGBoost Regressor,0.999957,0.999957,0.006098938,4.334612e-05,0.006583777,0.6560287,Normal,Decision Tree Model
7,PC,Gradient Boosting Regressor,1.0,1.0,1.221103e-08,2.160188e-16,1.469758e-08,3.735057e-06,Normal,Decision Tree Model
8,new_price,Base LSTM,0.99602,0.995922,102569.6,18791530000.0,137082.2,1.578098,Working Well,Neural Network
9,PC,Base LSTM,0.998868,0.99889,0.593995,0.5510506,0.7423278,0.1626587,Working Well,Neural Network
