In [None]:
# Standard library imports
import io
import json
import logging
import os
import time

# Related third party imports
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.model_selection import ParameterGrid
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import Callback, EarlyStopping
from tensorflow.keras.layers import BatchNormalization, Bidirectional, Conv1D, Dense, Dropout, Embedding, GlobalMaxPooling1D, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.regularizers import l1

### Data Preprocessing

1. Load the embedding matrix and the saved tokenizer, then tokenise the data with that tokenizer.

2. Convert text data from three different data frames (train_df, dev_df, and test_df) into sequences of integers. Each word in the text is replaced by its corresponding index from the word index created earlier.

3. Set a maximum sequence length (max_seq_length) and then pad the sequences to ensure that they all have the same length. \
    a. Padding is done with zeros (`0.`)

4. Create a label encoder by mapping unique labels in the 'label-coarse' column of the training data frame to integers.

5. Encode the labels for the training, development, and test data sets using this label encoder

In [None]:
# Reduced embedding matrix saved earlier; its two dimensions are read as
# (vocabulary size, embedding width).
embedding_matrix = np.load('models/embedding_matrix.npy')
vocab_size, embedding_size = embedding_matrix.shape

# Rebuild the tokenizer from its JSON dump
with open('models/tokenizer.json') as f:
    tokenizer_data = json.load(f)
tokenizer = tokenizer_from_json(tokenizer_data)

# Read the three dataset splits
train_df = pd.read_csv('TREC_dataset/train.csv')
dev_df = pd.read_csv('TREC_dataset/dev.csv')
test_df = pd.read_csv('TREC_dataset/test.csv')

# Map every question in each split to a list of word indices
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (train_df, dev_df, test_df)
)

# Bring all sequences to a common length; shorter ones are padded at the end
max_seq_length = 40  # Adjust as needed

train_data, dev_data, test_data = (
    pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    for sequences in (train_sequences, dev_sequences, test_sequences)
)

# Integer id per coarse label, numbered in order of first appearance in the training split
label_encoder = {
    name: idx for idx, name in enumerate(train_df['label-coarse'].unique())
}

# Apply the same mapping to every split (a label unseen in training raises KeyError)
train_labels, dev_labels, test_labels = (
    np.array([label_encoder[name] for name in frame['label-coarse']])
    for frame in (train_df, dev_df, test_df)
)

## Bi-LSTM + CNN [FINALISED]

### Step 1: Define Hyperparameter Grid

1. Define a grid of hyperparameters to search, including:
   - `embedding_size`: Different embedding sizes.
   - `lstm_units`: Different LSTM units.
   - `batch_size`: Different batch sizes.

### Step 2: Logging and Model Directory Setup

2. Create a directory structure to store model logs.
3. Configure logging to track and save results to a log file.

### Step 3: Model Training Loop

In this step, we systematically train multiple models, each with different hyperparameter settings, to identify the best-performing configuration. The process is as follows:

#### Substep 1: Hyperparameter Iteration

4.1. Iterate through the predefined hyperparameter combinations.
   - For each combination, we explore various settings for:
     - `embedding_size`: The size of word embeddings.
     - `lstm_units`: The number of LSTM units in the bidirectional LSTM layer.
     - `batch_size`: The batch size used during training.

#### Substep 2: Model Architecture

4.2. Build the model architecture for the current hyperparameter combination. The architecture includes:
   - Embedding Layer: Converts input sequences into dense vector representations.
   - Bidirectional LSTM: A recurrent layer that captures contextual information bidirectionally.
   - Convolutional Layer: Applies convolutional operations to capture local patterns.
   - Global Max Pooling: Extracts the most relevant information from convolutional outputs.
   - Dropout: Regularization technique to prevent overfitting.
   - Dense Layers: Fully connected layers for classification.
   
#### Substep 3: Model Compilation

4.3. Compile the model with the following configurations:
   - Learning Rate Schedule: Uses an exponential decay schedule to adjust the learning rate.
   - Optimizer: Utilizes the Adam optimizer for gradient descent.
   
#### Substep 4: Early Stopping

4.4. Implement early stopping as a precautionary measure to prevent overfitting during training. Early stopping monitors the loss on the training set and stops training if the loss on the training set does not improve for a specified number of epochs.

#### Substep 5: Class Weights

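4.5. Class weights can be used to address data imbalance: they assign higher importance to underrepresented classes during training, helping the model learn better from imbalanced data. Note that the training cell below does not pass a `class_weight` argument to `fit`, so no class weighting is currently applied.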

#### Substep 6: Training and Validation

4.6. Train the model on the training dataset with the specified hyperparameters. During training, the model learns to make predictions based on input sequences. Validation is performed on a separate development dataset to assess the model's performance during training.

By systematically exploring different hyperparameter combinations and training models with varying configurations, we aim to identify the best-performing model with the most suitable hyperparameters for the text classification task.


### Step 4: Model Saving

5. After training, evaluate the model on the test set.
6. Save the model in a directory named based on its test accuracy (rounded to four decimal places).
7. Record additional information in a JSON file, including the model summary.
8. Log the saved model path and version.

### Summary

This code demonstrates a systematic approach to hyperparameter tuning and model saving for text classification tasks, ensuring reproducibility and easy tracking of model performance.


In [None]:
# Name of the sub-directory that holds everything logged for this model family
model_folder = 'BiLSTM'

# Create model_logs/<model_folder> (including the parent) in a single call.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
model_logs_dir = os.path.join('model_logs', model_folder)
os.makedirs(model_logs_dir, exist_ok=True)

# Configure logging to save results to a single log file
log_filepath = os.path.join(model_logs_dir, 'model_log.txt')
logging.basicConfig(filename=log_filepath,
                    level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


class TimeHistory(Callback):
    """Keras callback that records how long each training epoch takes.

    After training, ``times`` holds one duration in seconds per completed
    epoch, in epoch order.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh list of durations at the beginning of each training run."""
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        """Remember the time at which the current epoch started."""
        self.epoch_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Append the elapsed time of the epoch that just finished."""
        self.times.append(time.time() - self.epoch_start_time)

## Create model

In [None]:
# Define a grid of hyperparameters to search
param_grid = {
    'embedding_size': [300],    # Embedding width; must equal the width of the pretrained matrix
    'lstm_units': [32],         # Units of the LSTM wrapped by Bidirectional
    'batch_size': [128],        # Mini-batch size passed to fit()
}

param_combinations = list(ParameterGrid(param_grid))

# One softmax unit per class present in the training labels
num_classes = len(label_encoder)

# Iterate through the parameter combinations
for params in param_combinations:
    embedding_size = params['embedding_size']
    lstm_units = params['lstm_units']
    batch_size = params['batch_size']

    print(
        f"Testing hyperparameters: Embedding Size={embedding_size}, LSTM Units={lstm_units}, Batch Size={batch_size}")

    # The embedding layer is frozen, so it has to start from the pretrained
    # vectors; without them it would stay at its random initial values.
    if embedding_size != embedding_matrix.shape[1]:
        raise ValueError(
            f"embedding_size={embedding_size} does not match the pretrained "
            f"embedding matrix width {embedding_matrix.shape[1]}")

    bi_lstm = tf.keras.Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_seq_length,
                  weights=[embedding_matrix], trainable=False),         # Frozen pretrained embeddings
        Bidirectional(LSTM(lstm_units, return_sequences=True)),         # Keep per-timestep outputs for the convolution
        Conv1D(128, 5, activation='relu', padding='same'),              # Convolutional layer
        GlobalMaxPooling1D(),                                           # Global Max Pooling
        Dropout(0.2),                                                   # Dropout for regularization
        Dense(128, activation='relu', kernel_regularizer=l1(0.01)),     # Dense layer with L1 regularization
        BatchNormalization(),                                           # Batch normalization layer
        Dense(num_classes, activation='softmax')                        # One probability per class
    ])

    # Compile the model with an exponentially decaying learning rate
    initial_learning_rate = 0.01

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate, decay_steps=1000, decay_rate=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    bi_lstm.compile(optimizer=optimizer,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

    # Stop once the training loss has not improved for 5 epochs and restore the
    # best weights. NOTE(review): this monitors the training loss, not
    # 'val_loss', so it does not react to the dev-set performance.
    early_stopping = EarlyStopping(
        monitor='loss', patience=5, restore_best_weights=True)
    time_callback = TimeHistory()

    # Train the model, validating on the dev split after every epoch
    history = bi_lstm.fit(train_data, train_labels,
                          epochs=50,
                          batch_size=batch_size,
                          validation_data=(dev_data, dev_labels),
                          callbacks=[early_stopping, time_callback])

    # Evaluate the model on the test set
    test_loss, test_accuracy = bi_lstm.evaluate(
        test_data, test_labels, verbose=2)


## Logging and Saving the Model

In [None]:
    # The output directory is named after the test accuracy rounded to 4 decimals
    model_path = os.path.join(model_logs_dir, f'model_{round(test_accuracy,4)}')

    # Save the entire model
    tf.keras.models.save_model(bi_lstm, model_path)

    # Per-epoch training curves and timings, aligned by position
    epoch_data = {
        'epoch': list(range(1, len(history.history['accuracy']) + 1)),
        'timing': time_callback.times,
        'train_loss': history.history['loss'],
        'train_accuracy': history.history['accuracy'],
        'val_loss': history.history['val_loss'],
        'val_accuracy': history.history['val_accuracy']
    }

    # Describe every layer, keyed by layer name
    model_info = {}
    for layer in bi_lstm.layers:
        model_info[layer.name] = {
            'class_name': layer.__class__.__name__,
            'config': layer.get_config(),  # Detailed configuration of the layer
            'number_of_parameters': layer.count_params()
        }

    # Everything recorded about this run
    info = {
        'Model': model_info,  # Detailed model information
        'Hyperparameters': params,
        'Test Loss': test_loss,
        'Test Accuracy': test_accuracy,
        'Epoch Data': epoch_data
    }

    # Store the run description next to the saved model (written once)
    info_path = os.path.join(model_path, 'model_info.json')
    with open(info_path, 'w') as info_file:
        json.dump(info, info_file)

    print(f"Saved model: {model_path}")

    # Capture the textual model summary so it can go into the shared log file
    model_summary = []
    bi_lstm.summary(print_fn=lambda x: model_summary.append(x))
    model_architecture = "\n".join(model_summary)

    # Log the results once per run
    logging.info("Model Architecture:\n" + model_architecture)
    logging.info(
        f"Testing hyperparameters: Embedding Size={embedding_size}, LSTM Units={lstm_units}, Batch Size={batch_size}")
    logging.info(f"Test Loss: {test_loss}")
    logging.info(f"Test Accuracy: {test_accuracy}")

## Illustrate Model

In [None]:
# Reload a saved model from disk.
# NOTE(review): this path is hard-coded to one specific run; the save cell names
# directories after the rounded test accuracy, so it only exists if a run scored 0.85.
saved_model_dir = 'model_logs/BiLSTM/model_0.85'
bi_lstm = tf.keras.models.load_model(saved_model_dir)

# Draw the layer graph with shapes, dtypes, activations and trainable flags
plot_options = {
    'show_shapes': True,
    'show_dtype': True,
    'show_layer_activations': True,
    'show_trainable': True,
}
tf.keras.utils.plot_model(bi_lstm, **plot_options)