# BiRNN Implementation

In [1]:
import pandas as pd
import torch
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from preprocessing import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report
import numpy as np
from keras import regularizers
import kerastuner as kt
from utils import *
from metrics_plot_utils import *

[nltk_data] Downloading package punkt_tab to /Users/sara/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  import kerastuner as kt


In [None]:
import os
import sys

# Temporarily extend the module search path so the shared helper modules in
# ../general_utils can be imported by this cell.
# NOTE(review): the directory is resolved against the current working
# directory, so this presumably expects the notebook to be launched from its
# own folder - confirm.
original_sys_path = sys.path.copy()
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../general_utils'))
sys.path.append(parent_dir)

try:
    from preprocessing import *
    from nn_utils import *
finally:
    # Restore the search path even when an import fails, so a broken run does
    # not leave the extra entry behind for every later cell.
    sys.path = original_sys_path

# Yangswei_85

To use the Yangswei_85 dataset, run the cell below.

In [2]:
# Yangswei_85 splits; paths are relative to the notebook's working directory
train_csv, test_csv = 'data/train_yangswei_85.csv', 'data/test_yangswei_85.csv'
df_train = pd.read_csv(train_csv)   # training split
test_df = pd.read_csv(test_csv)     # held-out test split
# Label handed to the loss-plotting helper later in the notebook
dataset_name = 'Yangswei_85'

# T5 
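To use the T5 dataset instead, run the cell below. It reassigns `df_train`, `test_df` and `dataset_name`, so whichever dataset cell was run last is the one used by the rest of the notebook.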

In [5]:
# T5 splits; paths are relative to the notebook's working directory
train_csv, test_csv = 'data/train_t5.csv', 'data/test_t5.csv'
df_train = pd.read_csv(train_csv)   # training split
test_df = pd.read_csv(test_csv)     # held-out test split
# Label handed to the loss-plotting helper later in the notebook
dataset_name = 'T5'

# Training and Validation

## Training Data Preprocessing

Firstly, we apply preprocessing (fixing contractions, setting all text to lowercase, removing non-alphanumeric characters) to the training text.

Secondly, we tokenize and pad the training text using Keras' `Tokenizer`, introducing an `<UNK>` token for out-of-vocabulary words.

Lastly, we encode the labels of the training dataset and save the label classes to an `.npy` file, `data/label_classes.npy`, so that the same encoding can be reloaded at test time.

In [3]:
# Preprocess training data.
# `Series.apply` returns a NEW Series, so the result has to be kept and passed
# on; otherwise the tokenizer would be fitted on the raw, unprocessed text.
# `to_frame()` keeps the single-column 'text' DataFrame layout that was
# previously handed to tokenize_and_pad.
train_text_df = df_train['text'].apply(preprocess).to_frame()

# Tokenize and pad the preprocessed training text
padded_sequences, tokenizer, vocab_size = tokenize_and_pad(train_text_df)

# Set training data
padded_train_data = padded_sequences

# Encode labels and save the classes in an npy file so the test section can
# rebuild exactly the same label <-> index mapping
label_encoder = LabelEncoder()
label_encoder.fit(df_train['label'])
np.save('data/label_classes.npy', label_encoder.classes_)
train_labels = torch.tensor(label_encoder.transform(df_train['label']), dtype=torch.long)
num_classes = len(label_encoder.classes_)
# Pass num_classes explicitly so the one-hot width is tied to the encoder,
# matching how the test labels are encoded later in the notebook
train_labels_one_hot_encoded = tf.keras.utils.to_categorical(train_labels, num_classes=num_classes)

## Tune Hyperparameters

In [None]:
def build_model(hp):
    """Assemble and compile one candidate bidirectional-LSTM classifier.

    Args:
        hp: hyperparameter container exposing ``Choice``, ``Int`` and ``Float``.
            It is queried for the embedding size ('output_dim'), the number of
            recurrent layers ('num_layers') and, per layer, the LSTM width
            ('lstm_units_<i>') and dropout rate ('dropout_rate_<i>').

    Returns:
        A compiled ``Sequential`` model ending in a softmax layer with
        ``num_classes`` outputs. ``vocab_size`` and ``num_classes`` are read
        from the notebook's global scope.
    """
    network = Sequential()

    # Token embedding; its width is one of the searched hyperparameters
    embedding_dim = hp.Choice('output_dim', [64, 128])
    network.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

    # One or two stacked BiLSTM layers, each followed by dropout
    depth = hp.Int('num_layers', min_value=1, max_value=2, step=1)
    last_idx = depth - 1
    for idx in range(depth):
        n_units = hp.Int(f'lstm_units_{idx}', min_value=32, max_value=128, step=32)
        recurrent = LSTM(
            n_units,
            # every layer except the last emits the full sequence for the next one
            return_sequences=idx < last_idx,
            kernel_regularizer=regularizers.l2(1e-3),
        )
        network.add(Bidirectional(recurrent))

        drop_rate = hp.Float(f'dropout_rate_{idx}', min_value=0.2, max_value=0.5, step=0.1)
        network.add(Dropout(drop_rate))

    # Classification head over the encoded label set
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

# Random search over the space declared in build_model, ranked by validation
# accuracy; each sampled configuration is trained twice.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=25,
    executions_per_trial=2,
)

# Short training runs on an 80/20 train/validation split of the training data
search_settings = dict(epochs=7, validation_split=0.2, batch_size=32)
tuner.search(padded_train_data, train_labels_one_hot_encoded, **search_settings)

# Keep only the top-ranked configuration
top_hps = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_hps[0]
print("Best hyperparameters:", best_hps.values)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
64                |64                |output_dim
2                 |2                 |num_layers
32                |32                |lstm_units_0
0.4               |0.4               |dropout_rate_0

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

## Train and Validate best model

In [None]:
print("Start Training and Validation:\n")

# Training budget
epochs, patience, batch_size = 50, 15, 64

# Rebuild the network from the hyperparameters selected by the tuner
rnn_model = build_model(best_hps)

# Both callbacks watch the validation loss: one stops training after
# `patience` epochs without improvement and restores the best weights,
# the other writes the best model seen so far to disk.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=patience,
    restore_best_weights=True,
)
model_checkpoint = ModelCheckpoint(
    'models/best_birnn_model.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)

history = rnn_model.fit(
    padded_train_data,
    train_labels_one_hot_encoded,
    validation_split=0.2,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1,
)

# Per-epoch loss curves; their length is the number of epochs actually run
loss_log = history.history
train_losses = loss_log['loss']
val_losses = loss_log['val_loss']
actual_epochs = len(train_losses)

# Plot training and validation losses
plot_losses("BiRNN", dataset_name, train_losses, val_losses, actual_epochs)


# Test

## Test Data Preprocessing

In [None]:
# Preprocess test data.
# `Series.apply` returns a NEW Series, so keep the result; otherwise the raw,
# unprocessed text would be fed to the tokenizer.
test_text = test_df['text'].apply(preprocess)

# Tokenize with the tokenizer fitted on the TRAINING data and pad to the
# training sequence length. A tokenizer fitted on the test set would assign
# different word indices, which would not match the embedding the model learned.
# NOTE(review): pad_sequences is called with its default padding/truncating
# settings - confirm these match what tokenize_and_pad uses for training data.
test_sequences = tokenizer.texts_to_sequences(test_text)
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=padded_train_data.shape[1]
)

# Set test data (same array that is evaluated in the next section)
test_data = padded_test_sequences

# Encode test labels by loading the classes saved while encoding the training labels
true_labels = test_df['label']
label_classes = np.load('data/label_classes.npy', allow_pickle=True)
label_encoder = LabelEncoder()
label_encoder.classes_ = label_classes
test_labels = torch.tensor(label_encoder.transform(true_labels))
test_labels_one_hot_encoded = tf.keras.utils.to_categorical(test_labels, num_classes=num_classes)

## Test best model on test data

In [None]:

# Restore the checkpoint written during training
model = tf.keras.models.load_model('models/best_birnn_model.h5')

# Loss and accuracy on the held-out test set
test_loss, test_accuracy = model.evaluate(
    padded_test_sequences,
    test_labels_one_hot_encoded,
    verbose=1,
)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Class probabilities -> class indices -> original label names
predictions_probabilities = model.predict(padded_test_sequences)
predicted_ids = np.argmax(predictions_probabilities, axis=1)
true_ids = np.argmax(test_labels_one_hot_encoded, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_ids)
true_classes = label_encoder.inverse_transform(true_ids)

## Compute metrics on model performance

In [None]:

# Score the predictions against the true labels
metrics = compute_metrics(predicted_classes, true_classes)
print(f"Test Accuracy: {metrics['accuracy']:.4f}")

# Each section: heading followed by (display label, key in `metrics`) rows
report_sections = (
    ('Macro Metrics', (
        ('Macro Precision', 'precision'),
        ('Macro Recall', 'recall'),
        ('Macro F1 Score', 'f1'),
    )),
    ('Weighted Metrics', (
        ('Weighted Precision', 'precision_weighted'),
        ('Weighted Recall', 'recall_weighted'),
        ('Weighted F1 Score', 'f1_weighted'),
    )),
)
for heading, rows in report_sections:
    print('\n')
    print(heading)
    for label, key in rows:
        print(f"{label}: {metrics[key]:.4f}")

# Plot confusion matrix
plot_confusion_matrix(true_classes, predicted_classes, label_classes)

In [None]:

# Print classification report
print('Classification Report:\n')
# Pin `labels` to the full saved class list so the report rows always line up
# with `target_names`. Without it, a class missing from both the test labels
# and the predictions makes the two lengths differ and the call fails.
print(classification_report(true_classes, predicted_classes, labels=label_classes, target_names=label_classes))
