## Loading modules

In [None]:
import os
import sys
import math
import nltk
import numpy as np
import torch
import itertools
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ElementTree
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Pick the compute backend: CUDA if present, otherwise Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

## Loading dataset & pre-processing

In [None]:
class processDataset():
    """Load XML descriptions from a folder-per-class dataset and embed them with word2vec.

    Layout read by this class: every sub-directory of ``datasetPath`` is one class, and every
    ``*.xml`` file inside it contributes the texts found at ``./description/item`` (relative to
    the XML root element) as samples of that class.

    Attributes available after construction:
        uniqueClassNames      -- set of class (sub-directory) names
        classNameToIndex      -- {className: index}; indices follow sorted class-name order
        IndexToClassName      -- {index: className}; inverse of classNameToIndex
        descriptionsAndLabels -- [{"description": [token, ...], "label": className}, ...]
        allDescriptions       -- token lists, each padded with "<PAD>" to tokensMaxLength
        allLabels             -- integer class index for each entry of allDescriptions
        uniqueTokens          -- set of every token, including "<EOS>" and "<PAD>"
        tokenToIndex          -- {token: index}; indices follow sorted token order
        indexToToken          -- {index: token}; inverse of tokenToIndex
        tokensMaxLength       -- length of every padded token list
        word2VecEmbeddingLen  -- size of one token embedding (100)
        word2vecModel         -- gensim Word2Vec model trained on allDescriptions
        descriptionEmbeddings -- per description, a list with one embedding vector per token
    """

    def __init__(self, datasetPath, tokensMaxLength=0):
        """Read, tokenise, pad and embed every description under ``datasetPath``.

        Args:
            datasetPath: Directory whose sub-directories are the classes.
            tokensMaxLength: Minimum padded length; it is raised to the longest tokenised
                description (including the trailing "<EOS>") when that is larger.
        """
        self.uniqueClassNames = []      # becomes a set: {class1, class2, class3, ...}
        self.descriptionsAndLabels = [] # [{description: xyz, label: class1}, {description: abc, label: class2}, ...]
        self.classNameToIndex = {}      # {class1: 0, class2: 1, class3: 2, ...}
        self.IndexToClassName = {}      # {0: class1, 1: class2, 2: class3, ...}
        self.uniqueTokens = {}          # becomes a set: {word1, word2, word3}
        self.tokenToIndex = {}          # {word1: 0, word2: 1, word3: 2, ...}
        self.indexToToken = {}          # {0: word1, 1: word2, 2: word3, ...}
        self.tokensMaxLength = tokensMaxLength
        self.allDescriptions = []       # [[word1, word2, word3], [word2, word3, word1], [word3, word1, word2], ...]
        self.allLabels = []             # [0, 2, 2, ...] (class indices)
        self.word2VecEmbeddingLen = 100
        self.word2vecModel = None

        # get class
        # os.listdir order is arbitrary and set iteration order of strings changes between
        # interpreter runs, so the names are sorted to keep label indices stable across runs.
        orderedClassNames = sorted(
            entry for entry in os.listdir(datasetPath)
            if os.path.isdir(os.path.join(datasetPath, entry))
        )
        self.uniqueClassNames = set(orderedClassNames)

        # (className -> index) & (index -> className) dict (unique, sorted order)
        self.classNameToIndex = {className: i for i, className in enumerate(orderedClassNames)}
        self.IndexToClassName = {i: className for i, className in enumerate(orderedClassNames)}

        # get description (classes and files visited in sorted order so sample order is repeatable)
        for className in orderedClassNames:
            currClassFolder = os.path.join(datasetPath, className)
            for file in sorted(os.listdir(currClassFolder)):
                if not file.endswith('.xml'):
                    continue
                root = ElementTree.parse(os.path.join(currClassFolder, file)).getroot()

                for description in root.findall('./description/item'):
                    # lower case; an <item> without text is kept as an empty description
                    currDescription = description.text.strip().lower() if description.text else ""
                    self.descriptionsAndLabels.append({"description" : currDescription, "label" : className})

        # description -> tokens
        tokenizerFunc = RegexpTokenizer(r'\w+|%') # keep % sign in description
        stopWordsFunc = set(stopwords.words("english"))
        lemmatizerFunc = WordNetLemmatizer()

        for item in self.descriptionsAndLabels:
            tokensOfDescription = [
                lemmatizerFunc.lemmatize(token, pos='v')                        # step3 - lemmatise as a verb
                for token in tokenizerFunc.tokenize(item["description"])        # step1 - split into tokens
                if token not in stopWordsFunc                                   # step2 - drop stop words
            ]
            tokensOfDescription.append("<EOS>")
            self.tokensMaxLength = max(len(tokensOfDescription), self.tokensMaxLength)
            item["description"] = tokensOfDescription

        for item in self.descriptionsAndLabels:
            # make all descriptions same length
            missing = self.tokensMaxLength - len(item["description"])
            item["description"].extend(["<PAD>"] * missing)
            self.allDescriptions.append(item["description"])
            self.allLabels.append(self.classNameToIndex[item["label"]])

        # (token -> index) & (index -> token) dict (unique, sorted order)
        self.uniqueTokens = {token for tokens in self.allDescriptions for token in tokens}
        orderedTokens = sorted(self.uniqueTokens)
        self.tokenToIndex = {token: i for i, token in enumerate(orderedTokens)}
        self.indexToToken = {i: token for i, token in enumerate(orderedTokens)}

        # descriptions -> word2vec embeddings
        # NOTE(review): gensim documents that multi-worker training is not exactly repeatable;
        # confirm whether run-to-run identical embeddings are required.
        self.word2vecModel = Word2Vec(
            sentences = self.allDescriptions,
            vector_size = self.word2VecEmbeddingLen,    # Size of embeddings
            window = 5,                                 # Context window size
            min_count = 1,                              # Include all words
            workers = 4,                                # Number of CPU threads
            epochs = 20
        )

        # word2vec embeddings -> descriptionEmbeddings (one vector per token, "<PAD>" included)
        self.descriptionEmbeddings = [
            [self.word2vecModel.wv[token] for token in tokens]
            for tokens in self.allDescriptions
        ]

# Build the dataset once at notebook scope; the cells below read its attributes.
# NOTE: the empty string is a placeholder. Replace it with the dataset root directory before
# running, since the constructor calls os.listdir() on this path.
dataset = processDataset(r"") # input dataset directory


In [None]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable
train_textEmbeddings, val_textEmbeddings, train_labels, val_labels = train_test_split(
    dataset.descriptionEmbeddings, dataset.allLabels, test_size=0.2, random_state=42
)

# Convert lists to PyTorch tensors, move tensors to the device.
# The embeddings are nested lists of NumPy vectors: stacking them into a single ndarray first
# avoids the very slow path torch.tensor() takes for lists of ndarrays (values are unchanged).
train_textEmbeddings = torch.tensor(np.array(train_textEmbeddings), dtype=torch.float32).to(device)
val_textEmbeddings = torch.tensor(np.array(val_textEmbeddings), dtype=torch.float32).to(device)
train_labels = torch.tensor(train_labels, dtype=torch.long).to(device)
val_labels = torch.tensor(val_labels, dtype=torch.long).to(device)

## Model Architectures

In [None]:
class FullyConnectedClassifier(nn.Module):
    """Two-layer perceptron applied to the flattened input.

    Args:
        input_dim: Number of features per sample after flattening.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of logits produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)    # hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)   # output layer

    def forward(self, x):
        """Return raw (unnormalised) logits for a batch ``x``."""
        # flatten -> linear -> ReLU
        hidden = self.relu(self.fc1(self.flatten(x)))
        # final linear layer, no activation
        return self.fc2(hidden)

# Build the fully connected classifier, then place it on the selected device
model = FullyConnectedClassifier(
    dataset.tokensMaxLength * dataset.word2VecEmbeddingLen,  # input_dim: one padded description, flattened
    200,                                                     # hidden_dim: neurons in the hidden layer
    len(dataset.uniqueClassNames),                           # output_dim: one logit per class
)
model = model.to(device)

In [None]:
class LSTMClassifier(nn.Module):
    """Stacked LSTM followed by a linear layer on the top layer's final hidden state.

    Args:
        embedding_dim: Size of each input time-step vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sample.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Registered but not applied in forward(): the model returns raw logits
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw logits for ``x`` of shape [batch_size, seq_length, embedding_dim]."""
        # Only the final hidden states are needed; per-step outputs and cell states are dropped
        _, (finalHidden, _) = self.lstm(x)
        topLayerHidden = finalHidden[-1]    # final hidden state of the last stacked layer
        return self.fc(topLayerHidden)
    
# Build the LSTM classifier (this rebinds `model`), then place it on the selected device
model = LSTMClassifier(
    embedding_dim=dataset.word2VecEmbeddingLen,    # size of one token embedding
    hidden_dim=200,                                # size of the LSTM hidden state
    output_dim=len(dataset.uniqueClassNames),      # one logit per class
    num_layers=2,                                  # stacked LSTM layers
)
model = model.to(device)

## Training and Validation pipeline

In [None]:
# hyper-parameters
# NOTE(review): 264 is presumably a typo for 256 — value left unchanged, please confirm.
batch_sizes = [8, 16, 32, 64, 128, 264]
learning_rates = [0.01, 0.001, 0.0001]
dropout_rates = [0.1, 0.3, 0.5]
hidden_dims = [50, 100, 200, 400]
epochs = 1000
patience = 100

# Generate all combinations of hyperparameters
hyperparameter_combinations = list(itertools.product(batch_sizes, learning_rates, dropout_rates, hidden_dims))

# Base directory where output folders will be created
base_output_dir = r"" # output directory
if base_output_dir:
    # os.makedirs("") raises; an empty path simply means "the current working directory"
    os.makedirs(base_output_dir, exist_ok=True)

# Global trackers for best result
maxAccuracy = 0.0
maxAccuracyFolder = None   # folder name ('001', '002', ...) of the best run so far

# Force NumPy to show the entire matrix when converting to string (no truncation)
np.set_printoptions(threshold=sys.maxsize, linewidth=1000)


# Define model (once, outside the loop: the class itself does not depend on the hyperparameters)
class TunableSelfAttentionClassifier(nn.Module):
    """Single-head self-attention over the token embeddings, mean-pooled, then classified.

    Args:
        embedding_dim: Size of each input token embedding.
        hidden_dim: Size of the query/key/value projections.
        output_dim: Number of logits produced per sample.
        dropout_rate: Dropout probability applied to the pooled representation.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout_rate):
        super().__init__()
        # Define Linear transformations for Query, Key, Value
        self.query = nn.Linear(embedding_dim, hidden_dim)
        self.key = nn.Linear(embedding_dim, hidden_dim)
        self.value = nn.Linear(embedding_dim, hidden_dim)
        # Define dropout & classification layer
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):   # Shape: [batch_size, sequence_length, embedding_dim]
        """Return raw logits of shape [batch_size, output_dim]."""
        # Compute Q, K, V matrices
        Q = self.query(x)   # Shape: [batch_size, sequence_length, hidden_dim]
        K = self.key(x)     # Shape: [batch_size, sequence_length, hidden_dim]
        V = self.value(x)   # Shape: [batch_size, sequence_length, hidden_dim]

        # Compute attention scores: scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))

        # Apply softmax to normalise scores along the last (key) dimension
        attention = F.softmax(scores, dim=-1) # Shape: [batch_size, sequence_length, sequence_length]

        # Compute weighted sum of Value using attention weights
        hidden = torch.matmul(attention, V) # Shape: [batch_size, sequence_length, hidden_dim]

        # Pool the hidden representations across the sequence length [mean pooling]
        pooled = hidden.mean(dim=1) # Shape: [batch_size, hidden_dim]

        # Apply dropout for regularisation (active only in train mode)
        pooled = self.dropout(pooled)

        # Pass the pooled representation through the classifier to get logits
        return self.classifier(pooled) # Shape: [batch_size, output_dim]


# Iterate over hyperparameter combinations
for index, (batch_size, lr, dropout_rate, hidden_dim) in enumerate(hyperparameter_combinations):
    folder_name = f"{index + 1:03d}"  # Format as '001', '002', etc.
    new_folder_path = os.path.join(base_output_dir, folder_name)
    os.makedirs(new_folder_path, exist_ok=True)

    print(f"Running combination {index + 1}: Batch Size={batch_size}, LR={lr}, Dropout={dropout_rate}, Hidden Dim={hidden_dim}")

    # Initialise model, loss, optimiser
    model = TunableSelfAttentionClassifier(
        embedding_dim=dataset.word2VecEmbeddingLen,
        hidden_dim=hidden_dim,
        output_dim=len(dataset.uniqueClassNames),
        dropout_rate=dropout_rate
    ).to(device)

    lossFn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Start below any reachable accuracy so the first epoch always records a best state
    best_val_accuracy = -1.0
    best_val_loss = float("inf")
    best_model_state = None
    best_epoch = 0
    epochs_without_improvement = 0
    best_y_pred = None
    best_y_true = None

    train_acc_log = []
    val_acc_log = []

    # Train and validate model
    for epoch in range(epochs):
        model.train()
        train_correct = 0
        train_total = 0

        for i in range(0, len(train_textEmbeddings), batch_size):
            Xbatch = train_textEmbeddings[i:i+batch_size].to(device)
            ybatch = train_labels[i:i+batch_size].to(device)

            optimizer.zero_grad()
            y_pred = model(Xbatch)
            loss = lossFn(y_pred, ybatch)
            loss.backward()
            optimizer.step()

            predictions = torch.argmax(y_pred, dim=1)
            train_correct += (predictions == ybatch).sum().item()
            train_total += ybatch.size(0)

        train_accuracy = 100 * train_correct / train_total

        # Validate with dropout disabled; model.train() re-enables it at the next epoch
        model.eval()
        with torch.no_grad():
            xVal_device = val_textEmbeddings.to(device)
            yVal_device = val_labels.to(device)
            y_pred = model(xVal_device)

            # Calculate validation loss & accuracy
            val_loss = lossFn(y_pred, yVal_device).item()
            val_predictions = torch.argmax(y_pred, dim=1)
            val_correct = (val_predictions == yVal_device).sum().item()
            val_accuracy = 100 * val_correct / yVal_device.size(0)

        # Save accuracy logs
        train_acc_log.append(train_accuracy)
        val_acc_log.append(val_accuracy)

        print(f"Epoch {epoch + 1:03d} | Train Acc: {train_accuracy:.2f}% | Val Acc: {val_accuracy:.2f}%")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_val_loss = val_loss
            # state_dict() returns references to the live parameters, which later optimiser
            # steps overwrite in place; clone them to keep a real snapshot of this epoch.
            best_model_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            best_epoch = epoch
            epochs_without_improvement = 0
            best_y_pred = val_predictions.cpu().numpy()
            best_y_true = yVal_device.cpu().numpy()
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best epoch was {best_epoch+1} with accuracy {best_val_accuracy:.2f}%")
            break

    # Load best model
    model.load_state_dict(best_model_state)

    if best_val_accuracy > maxAccuracy:
        maxAccuracy = best_val_accuracy
        maxAccuracyFolder = folder_name

    # Report on the best epoch's predictions (the ones matching the saved weights),
    # restricted to labels actually appearing in the validation set
    val_predictions_cpu = best_y_pred
    yVal_cpu = best_y_true
    val_classes_in_use = np.unique(yVal_cpu)

    # Generate & plot confusion matrix (only for labels in validation set)
    val_conf_matrix = confusion_matrix(
        yVal_cpu,
        val_predictions_cpu,
        labels=val_classes_in_use
    )

    # Plot confusion matrix on an explicitly sized axes; without ax= the display creates its
    # own figure and a separately created large figure would stay empty and never be closed
    fig, ax = plt.subplots(figsize=(20, 20))  # Increase figure size for large # of classes
    disp = ConfusionMatrixDisplay(
        confusion_matrix=val_conf_matrix,
        display_labels=val_classes_in_use
    )
    disp.plot(
        include_values=True,
        cmap=plt.cm.Blues,
        xticks_rotation='vertical',
        ax=ax
    )

    confusion_matrix_path = os.path.join(new_folder_path, "confusion_matrix.png")
    fig.savefig(confusion_matrix_path, bbox_inches='tight')
    plt.close(fig)

    # Generate classification report (only for classes in validation set)
    class_report = classification_report(
        yVal_cpu,
        val_predictions_cpu,
        labels=val_classes_in_use,
        target_names=[str(cls) for cls in val_classes_in_use],
        zero_division=0
    )

    # Calculate TP, FP, FN, TN for each class
    num_classes = len(val_classes_in_use)
    total_samples = val_conf_matrix.sum()
    tp_fp_fn_tn_lines = ["Per-Class TP/FP/FN/TN:"]

    for i, class_label in enumerate(val_classes_in_use):
        TP = val_conf_matrix[i, i]
        FP = val_conf_matrix[:, i].sum() - TP   # predicted as this class, but another class
        FN = val_conf_matrix[i, :].sum() - TP   # this class, but predicted as another
        TN = total_samples - (TP + FP + FN)

        line = (f"Class {class_label} --> "
                f"TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")
        tp_fp_fn_tn_lines.append(line)

    tp_fp_fn_tn_report = "\n".join(tp_fp_fn_tn_lines)

    # Combine everything into a SINGLE text file (metrics are those of the best epoch)
    report_text = []
    report_text.append("Hyperparameters and Results\n" + "="*40)
    report_text.append(f"Batch Size: {batch_size}")
    report_text.append(f"Learning Rate: {lr}")
    report_text.append(f"Dropout Rate: {dropout_rate}")
    report_text.append(f"Hidden Dim: {hidden_dim}")
    report_text.append(f"Best Epoch: {best_epoch + 1}")
    report_text.append(f"Validation Loss: {best_val_loss:.4f}")
    report_text.append(f"Validation Accuracy: {best_val_accuracy:.2f}%\n")

    # Show the global best so far
    report_text.append(f"maxAccuracy: {maxAccuracy:.2f}%")
    report_text.append(f"maxAccuracyFolder: {maxAccuracyFolder if maxAccuracyFolder is not None else 'None'}\n")

    report_text.append("Training and Validation Accuracy per Epoch:\n")
    for ep in range(len(train_acc_log)):
        report_text.append(f"Epoch {ep+1}: Train Acc = {train_acc_log[ep]:.2f}%, Val Acc = {val_acc_log[ep]:.2f}%")
    report_text.append("")

    # Text version of the confusion matrix
    report_text.append("Confusion Matrix (text version):")
    report_text.append(str(val_conf_matrix) + "\n")

    # Classification report
    report_text.append("Classification Report:")
    report_text.append(class_report)

    # Include per-class TP, FP, FN, TN
    report_text.append(tp_fp_fn_tn_report)

    final_report = "\n".join(report_text)

    # Write it all to a single text file
    combined_report_path = os.path.join(new_folder_path, "combined_report.txt")
    with open(combined_report_path, "w") as f:
        f.write(final_report)

    # Save the model (weights of the best epoch)
    model_path = os.path.join(new_folder_path, "model.pth")
    torch.save(best_model_state, model_path)

print("Grid search complete!")
print("Best accuracy overall: {:.2f}%".format(maxAccuracy))
print("Best folder overall: {}".format(maxAccuracyFolder))

Running combination 1: Batch Size=16, LR=0.001, Dropout=0.3, Hidden Dim=100
Epoch 001 | Train Acc: 30.22% | Val Acc: 31.37%
Epoch 002 | Train Acc: 36.49% | Val Acc: 39.05%
Epoch 003 | Train Acc: 44.65% | Val Acc: 43.89%
Epoch 004 | Train Acc: 48.37% | Val Acc: 44.42%
Epoch 005 | Train Acc: 49.68% | Val Acc: 47.58%
Epoch 006 | Train Acc: 50.79% | Val Acc: 47.26%
Epoch 007 | Train Acc: 52.13% | Val Acc: 50.53%
Epoch 008 | Train Acc: 52.98% | Val Acc: 50.00%
Epoch 009 | Train Acc: 53.21% | Val Acc: 50.11%
Epoch 010 | Train Acc: 54.19% | Val Acc: 52.00%
Epoch 011 | Train Acc: 55.82% | Val Acc: 52.53%
Epoch 012 | Train Acc: 55.01% | Val Acc: 52.53%
Epoch 013 | Train Acc: 55.32% | Val Acc: 52.21%
Epoch 014 | Train Acc: 55.98% | Val Acc: 54.74%
Epoch 015 | Train Acc: 56.80% | Val Acc: 55.79%
Epoch 016 | Train Acc: 57.11% | Val Acc: 55.37%
Epoch 017 | Train Acc: 57.30% | Val Acc: 53.89%
Epoch 018 | Train Acc: 57.93% | Val Acc: 55.26%
Epoch 019 | Train Acc: 56.93% | Val Acc: 56.74%
Epoch 020 | 

<Figure size 2000x2000 with 0 Axes>