# Assignment 2 - Task 2: Model Tuning

This notebook tunes hyperparameters for two models using Keras Tuner and the existing processed splits:
- Deep Neural Network (tabular)
- GRU sequence model

We search on the train/validation sets and keep the test set untouched for final evaluation.


In [1]:
# Setup
from pathlib import Path
import os
import json
import random

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

# Try to import Keras Tuner; fall back to None
try:
    import keras_tuner as kt  # TF >=2.3
except Exception:
    try:
        import kerastuner as kt  # legacy
    except Exception:
        kt = None
        print('Keras Tuner not available; will use simple random search instead.')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
import joblib

# Reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Paths
from google.colab import drive
drive.mount('/drive')
DATA_DIR = Path('/drive/My Drive/Colab Notebooks/notebooks/data/processed')
MODELS_DIR = Path('/drive/My Drive/Colab Notebooks/notebooks/models/tuned')
ARTIFACTS_DIR = Path('/drive/My Drive/Colab Notebooks/notebooks/artifacts')
MODELS_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

AUTOTUNE = tf.data.AUTOTUNE


Keras Tuner not available; will use simple random search instead.
Mounted at /drive


In [2]:
# Ensure keras-tuner is available in this kernel
if 'kt' in globals() and kt is None:
    import sys, subprocess
    try:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'keras-tuner'])
        import keras_tuner as kt  # retry
        print('Installed keras-tuner into current kernel.')
    except Exception as e:
        print('Failed to install keras-tuner:', e)


Installed keras-tuner into current kernel.


In [3]:
# Load data
train_df = pd.read_csv(DATA_DIR / 'train.csv', parse_dates=['Month'])
val_df = pd.read_csv(DATA_DIR / 'val.csv', parse_dates=['Month'])
test_df = pd.read_csv(DATA_DIR / 'test.csv', parse_dates=['Month'])

print('Shapes -> train:', train_df.shape, 'val:', val_df.shape, 'test:', test_df.shape)


Shapes -> train: (44730, 9) val: (9884, 9) test: (9918, 9)


In [4]:
# Shared feature engineering

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    df['MonthSin'] = np.sin(2 * np.pi * df['MonthNum'] / 12.0)
    df['MonthCos'] = np.cos(2 * np.pi * df['MonthNum'] / 12.0)
    return df

for frame in (train_df, val_df, test_df):
    add_time_features(frame)

all_series_ids = pd.concat([
    train_df['Series ID'], val_df['Series ID'], test_df['Series ID']
], axis=0).unique()
series_ids = sorted(all_series_ids)
series_index = {sid: idx for idx, sid in enumerate(series_ids)}
max_den = max(len(series_ids) - 1, 1)
for frame in (train_df, val_df, test_df):
    frame['SeriesIndex'] = frame['Series ID'].map(series_index).astype('float32')
    frame['SeriesIndexNorm'] = frame['SeriesIndex'] / max_den

# Scalers for sequence features
turnover_scaler = StandardScaler()
year_scaler = StandardScaler()

train_df['TurnoverScaled'] = turnover_scaler.fit_transform(train_df[['Turnover']])
val_df['TurnoverScaled'] = turnover_scaler.transform(val_df[['Turnover']])
test_df['TurnoverScaled'] = turnover_scaler.transform(test_df[['Turnover']])

train_df['YearScaled'] = year_scaler.fit_transform(train_df[['Year']])
val_df['YearScaled'] = year_scaler.transform(val_df[['Year']])
test_df['YearScaled'] = year_scaler.transform(test_df[['Year']])

# Persist scalers for reuse
joblib.dump(turnover_scaler, ARTIFACTS_DIR / 'turnover_scaler.joblib')
joblib.dump(year_scaler, ARTIFACTS_DIR / 'year_scaler.joblib')


['/drive/My Drive/Colab Notebooks/notebooks/artifacts/year_scaler.joblib']

In [5]:
# Tabular DNN inputs

target_col = 'Turnover'
dnn_categorical_cols = ['State', 'Industry', 'Series ID']
dnn_numeric_cols = ['Year', 'MonthNum', 'Quarter', 'MonthSin', 'MonthCos']
dnn_feature_cols = dnn_categorical_cols + dnn_numeric_cols

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False), dnn_categorical_cols),
        ('numeric', StandardScaler(), dnn_numeric_cols),
    ]
)

X_train_dnn = preprocessor.fit_transform(train_df[dnn_feature_cols]).astype(np.float32)
X_val_dnn = preprocessor.transform(val_df[dnn_feature_cols]).astype(np.float32)
X_test_dnn = preprocessor.transform(test_df[dnn_feature_cols]).astype(np.float32)

y_train = train_df[target_col].to_numpy(np.float32)
y_val = val_df[target_col].to_numpy(np.float32)
y_test = test_df[target_col].to_numpy(np.float32)

# Save preprocessor
joblib.dump(preprocessor, ARTIFACTS_DIR / 'dnn_preprocessor.joblib')

print('DNN input dim:', X_train_dnn.shape[1])


DNN input dim: 183


In [6]:
# Sequence data builders (reuse from Task 1)
WINDOW_SIZE_DEFAULT = 12
HORIZON = 1
seq_feature_cols = ['TurnoverScaled', 'MonthSin', 'MonthCos', 'YearScaled', 'SeriesIndexNorm']
seq_target_col = 'TurnoverScaled'

def build_sequence_arrays(df: pd.DataFrame, feature_cols: list[str], target_col: str, window: int, horizon: int):
    sequences, targets, actuals = [], [], []
    for _, group in df.groupby('Series ID'):
        group = group.sort_values('Month')
        feat = group[feature_cols].to_numpy(np.float32)
        targ = group[target_col].to_numpy(np.float32)
        actual = group['Turnover'].to_numpy(np.float32)
        n = len(group) - window - horizon + 1
        if n <= 0:
            continue
        for i in range(n):
            sequences.append(feat[i:i+window])
            targets.append(targ[i+window:i+window+horizon])
            actuals.append(actual[i+window:i+window+horizon])
    if not sequences:
        return (
            np.empty((0, window, len(feature_cols)), dtype=np.float32),
            np.empty((0, horizon), dtype=np.float32),
            np.empty((0, horizon), dtype=np.float32),
        )
    return np.stack(sequences), np.stack(targets), np.stack(actuals)

X_seq_train, y_seq_train, _ = build_sequence_arrays(train_df, seq_feature_cols, seq_target_col, WINDOW_SIZE_DEFAULT, HORIZON)
X_seq_val, y_seq_val, _ = build_sequence_arrays(val_df, seq_feature_cols, seq_target_col, WINDOW_SIZE_DEFAULT, HORIZON)
X_seq_test, y_seq_test_scaled, y_seq_test_actual = build_sequence_arrays(test_df, seq_feature_cols, seq_target_col, WINDOW_SIZE_DEFAULT, HORIZON)

print('Seq shapes -> train:', X_seq_train.shape, 'val:', X_seq_val.shape, 'test:', X_seq_test.shape)


Seq shapes -> train: (42930, 12, 5) val: (8060, 12, 5) test: (8140, 12, 5)


In [7]:
# Utilities

def make_datasets(X_train, y_train, X_val, y_val, batch_size):
    train_ds = (
        tf.data.Dataset.from_tensor_slices((X_train, y_train))
        .shuffle(buffer_size=len(X_train), seed=RANDOM_SEED, reshuffle_each_iteration=True)
        .batch(batch_size)
        .prefetch(AUTOTUNE)
    )
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(AUTOTUNE)
    return train_ds, val_ds


def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2}


def save_json(obj: dict, path: Path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2)


In [8]:
# Tuning search spaces

MAX_EPOCHS = 80
DNN_TRIALS = 30
GRU_TRIALS = 30


def build_dnn_model_hp(input_dim: int, hp: 'kt.HyperParameters') -> keras.Model:
    inputs = keras.Input(shape=(input_dim,), name='tabular_features')
    x = inputs

    num_layers = hp.Int('dnn_num_layers', 2, 4)
    for i in range(num_layers):
        units = hp.Int(f'dnn_units_{i}', min_value=64, max_value=512, step=64)
        x = keras.layers.Dense(units, activation=hp.Choice('dnn_activation', ['relu', 'gelu']))(x)
        if hp.Boolean('dnn_batchnorm', default=True):
            x = keras.layers.BatchNormalization()(x)
        dropout = hp.Float('dnn_dropout', 0.0, 0.5, step=0.1)
        if dropout > 0:
            x = keras.layers.Dropout(dropout)(x)
    outputs = keras.layers.Dense(1, name='turnover')(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name='DNN_Tuned')
    lr = hp.Float('dnn_lr', 1e-4, 5e-3, sampling='log')
    wd = hp.Float('dnn_weight_decay', 1e-6, 1e-3, sampling='log')
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=[
            keras.metrics.MeanSquaredError(name='mse'),
            keras.metrics.RootMeanSquaredError(name='rmse'),
            keras.metrics.MeanAbsoluteError(name='mae'),
            keras.metrics.MeanAbsolutePercentageError(name='mape'),
        ],
    )
    return model


def build_gru_model_hp(window: int, feature_dim: int, horizon: int, hp: 'kt.HyperParameters') -> keras.Model:
    inputs = keras.Input(shape=(window, feature_dim), name='sequence_features')
    x = inputs

    num_layers = hp.Int('gru_num_layers', 1, 2)
    units1 = hp.Int('gru_units1', 64, 256, step=64)
    x = keras.layers.GRU(units1, return_sequences=(num_layers > 1), kernel_initializer='glorot_uniform')(x)
    if hp.Boolean('gru_layernorm', default=True):
        x = keras.layers.LayerNormalization()(x)
    drop1 = hp.Float('gru_dropout1', 0.0, 0.5, step=0.1)
    if drop1 > 0:
        x = keras.layers.Dropout(drop1)(x)

    if num_layers > 1:
        units2 = hp.Int('gru_units2', 64, 256, step=64)
        x = keras.layers.GRU(units2, return_sequences=False, kernel_initializer='glorot_uniform')(x)
        drop2 = hp.Float('gru_dropout2', 0.0, 0.5, step=0.1)
        if drop2 > 0:
            x = keras.layers.Dropout(drop2)(x)

    outputs = keras.layers.Dense(horizon, kernel_initializer='glorot_uniform', name='scaled_turnover')(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name='GRU_Tuned')
    lr = hp.Float('gru_lr', 1e-4, 5e-3, sampling='log')
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=[
            keras.metrics.MeanSquaredError(name='mse'),
            keras.metrics.RootMeanSquaredError(name='rmse'),
            keras.metrics.MeanAbsoluteError(name='mae'),
            keras.metrics.MeanAbsolutePercentageError(name='mape'),
        ],
    )
    return model


In [9]:
# Run Keras Tuner for DNN (BayesianOptimization with explicit max_trials)

if kt is None:
    print('Skip Keras Tuner for DNN since not available.')
else:
    tuner = kt.BayesianOptimization(
        hypermodel=lambda hp: build_dnn_model_hp(X_train_dnn.shape[1], hp),
        objective=kt.Objective('val_loss', direction='min'),
        max_trials=DNN_TRIALS,
        seed=RANDOM_SEED,
        directory=str(ARTIFACTS_DIR / 'kt_dnn'),
        project_name='dnn_bayes',
        overwrite=True,
    )

    batch_size = 256
    train_ds, val_ds = make_datasets(X_train_dnn, y_train, X_val_dnn, y_val, batch_size)

    stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)

    tuner.search(
        train_ds,
        epochs=MAX_EPOCHS,
        validation_data=val_ds,
        callbacks=[stop, reduce],
        verbose=2,
    )

    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    save_json(best_hps.values, ARTIFACTS_DIR / 'dnn_best_hyperparameters.json')

    best_model = tuner.hypermodel.build(best_hps)
    history = best_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=MAX_EPOCHS,
        callbacks=[stop, reduce],
        verbose=2,
    )

    # Evaluate and save
    dnn_eval = best_model.evaluate(tf.data.Dataset.from_tensor_slices((X_test_dnn, y_test)).batch(256), return_dict=True, verbose=0)
    dnn_preds = best_model.predict(X_test_dnn, batch_size=256, verbose=0).squeeze()
    dnn_metrics = compute_metrics(y_test, dnn_preds)
    print('DNN tuned test metrics:', dnn_metrics)

    best_model.save(MODELS_DIR / 'dnn_tuned.keras')
    save_json({'val_history': history.history, 'test_metrics': dnn_metrics}, ARTIFACTS_DIR / 'dnn_tuned_metrics.json')


Trial 30 Complete [00h 02m 14s]
val_loss: 3376.398681640625

Best val_loss So Far: 2057.726318359375
Total elapsed time: 01h 00m 35s
Epoch 1/80
175/175 - 5s - 31ms/step - loss: 9893.1016 - mae: 44.0844 - mape: 104.1819 - mse: 9893.1016 - rmse: 99.4641 - val_loss: 6911.6543 - val_mae: 52.0348 - val_mape: 47.8921 - val_mse: 6911.6543 - val_rmse: 83.1364 - learning_rate: 0.0029
Epoch 2/80
175/175 - 3s - 19ms/step - loss: 456.6261 - mae: 12.6088 - mape: 28.9106 - mse: 456.6261 - rmse: 21.3688 - val_loss: 5297.1831 - val_mae: 47.5359 - val_mape: 38.7484 - val_mse: 5297.1831 - val_rmse: 72.7818 - learning_rate: 0.0029
Epoch 3/80
175/175 - 5s - 29ms/step - loss: 333.2637 - mae: 10.1928 - mape: 18.8212 - mse: 333.2637 - rmse: 18.2555 - val_loss: 4843.5020 - val_mae: 43.8136 - val_mape: 33.7616 - val_mse: 4843.5020 - val_rmse: 69.5953 - learning_rate: 0.0029
Epoch 4/80
175/175 - 3s - 18ms/step - loss: 290.6684 - mae: 9.4815 - mape: 16.8917 - mse: 290.6684 - rmse: 17.0490 - val_loss: 3980.5718 -

In [11]:
# Run Keras Tuner for GRU (faster settings: fewer trials/epochs, cached pipeline, no retrain)

if kt is None:
    print('Skip Keras Tuner for GRU since not available.')
else:
    # Reduce total search effort for speed
    fast_trials = min(12, GRU_TRIALS)
    fast_epochs = min(30, MAX_EPOCHS)

    tuner_gru = kt.BayesianOptimization(
        hypermodel=lambda hp: build_gru_model_hp(WINDOW_SIZE_DEFAULT, X_seq_train.shape[2], HORIZON, hp),
        objective=kt.Objective('val_loss', direction='min'),
        max_trials=fast_trials,
        seed=RANDOM_SEED,
        directory=str(ARTIFACTS_DIR / 'kt_gru'),
        project_name='gru_bayes',
        overwrite=True,
    )

    # Choose batch size based on device capability
    try:
        has_gpu = len(tf.config.list_physical_devices('GPU')) > 0
    except Exception:
        has_gpu = False
    batch_size = 256 if has_gpu else 128

    # Faster tf.data pipeline with cache + prefetch
    def make_fast_seq_datasets(Xtr, ytr, Xva, yva, bs):
        train = (tf.data.Dataset.from_tensor_slices((Xtr, ytr))
                 .shuffle(min(len(Xtr), 10000), seed=RANDOM_SEED, reshuffle_each_iteration=True)
                 .batch(bs)
                 .cache()
                 .prefetch(AUTOTUNE))
        val = (tf.data.Dataset.from_tensor_slices((Xva, yva))
               .batch(bs)
               .cache()
               .prefetch(AUTOTUNE))
        return train, val

    train_ds, val_ds = make_fast_seq_datasets(X_seq_train, y_seq_train, X_seq_val, y_seq_val, batch_size)

    # Tighter early stopping to cut epochs earlier
    stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

    tuner_gru.search(
        train_ds,
        epochs=fast_epochs,
        validation_data=val_ds,
        callbacks=[stop, reduce],
        verbose=2,
    )

    # Pick best hyperparameters
    best_hps = tuner_gru.get_best_hyperparameters(num_trials=1)[0]
    save_json(best_hps.values, ARTIFACTS_DIR / 'gru_best_hyperparameters.json')

    # Reuse the trained best model from the search to avoid a full second fit
    best_model = tuner_gru.get_best_models(num_models=1)[0]

    # Evaluate and save
    test_seq_ds = (tf.data.Dataset
                   .from_tensor_slices((X_seq_test, y_seq_test_scaled))
                   .batch(batch_size)
                   .cache()
                   .prefetch(AUTOTUNE))
    gru_eval = best_model.evaluate(test_seq_ds, return_dict=True, verbose=0)
    preds_scaled = best_model.predict(X_seq_test, batch_size=batch_size, verbose=0)
    preds = turnover_scaler.inverse_transform(preds_scaled)
    y_true = y_seq_test_actual.squeeze(axis=-1)
    y_pred = preds.squeeze(axis=-1)
    gru_metrics = compute_metrics(y_true, y_pred)
    print('GRU tuned test metrics:', gru_metrics)

    best_model.save(MODELS_DIR / 'gru_tuned.keras')
    # No history when skipping retrain; log eval metrics instead
    save_json({'val_best_hps': best_hps.values, 'test_metrics': gru_metrics, 'eval': gru_eval}, ARTIFACTS_DIR / 'gru_tuned_metrics.json')



Trial 12 Complete [00h 09m 46s]
val_loss: 0.026431001722812653

Best val_loss So Far: 0.008686594665050507
Total elapsed time: 02h 16m 45s


  saveable.load_own_variables(weights_store.get(inner_path))


GRU tuned test metrics: {'MSE': 1094.508056640625, 'RMSE': 33.08335014233935, 'MAE': 12.855664253234863, 'MAPE': 0.10264773666858673, 'R2': 0.9959864020347595}
