In [None]:
import torch
import argparse
import numpy as np
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
import math
import importlib
from importlib import reload

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# If you are using Colab
dir_path = "/content/drive/Othercomputers/my_computer/dl-nlp_project_named-entity-recognition/"
module_path = dir_path[9:].replace("/", ".")
# imports
data_module = importlib.import_module(module_path + "data_new")
prepare_data_pipeline = data_module.prepare_data_pipeline
TRAIN_DATA_PATH = data_module.TRAIN_DATA_PATH
TEST_DATA_PATH = data_module.TEST_DATA_PATH
PAD = data_module.PAD
tensor_to_sentences = data_module.tensor_to_sentences
tensor_to_labels = data_module.tensor_to_labels

In [None]:
# If you are NOT using colab
# dir_path = ""
# from data_new import (
#     prepare_data_pipeline,
#     TRAIN_DATA_PATH,
#     TEST_DATA_PATH,
#     PAD,
#     tensor_to_sentences,
#     tensor_to_labels,
# )

In [None]:
train_file_path = dir_path + "data/train.json"
test_file_path = dir_path + "data/test.json"

In [None]:
reload(data_module)

In [None]:
class SimpleRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super(SimpleRNNModel, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Simple RNN layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)

        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        # # Activation layer
        # self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        # Convert token indices to embeddings
        embedded = self.embedding(text)

        # Pass embeddings through RNN
        rnn_output, _ = self.rnn(embedded)

        # Pass RNN output through dense layer
        predictions = self.fc(rnn_output)

        # # Sigmoid activation
        # predictions = self.sigmoid(predictions)

        return predictions

In [None]:
def train(model, iterator, optimizer, criterion, device):
    """
    Training logic for an epoch
    """
    model.train()

    epoch_loss = 0

    for batch in iterator:
        sentences = batch["sentence"]
        labels = batch["label"]
        sentences, labels = sentences.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(sentences)
        # mask = (labels.sum(dim=-1) != 0).float() # this isn't right. all zero vector is also if the labels are empty. Check for padding token instead.
        mask = (sentences != PAD_IDX).float()
        mask = mask.unsqueeze(-1).expand_as(labels)

        # Convert sigmoid outputs to binary labels
        # binary_predictions = (torch.sigmoid(predictions) >= 0.5).float()

        # print(sentences.shape) # 32, 181
        # decoded_sentences = tensor_to_sentences(sentences, idx_to_word)

        # print(labels.shape) # 32, 181, 36
        # decoded_labels = tensor_to_labels(labels, idx_to_label)

        # print(predictions.shape) # 32, 181, 36
        # decoded_predictions = tensor_to_labels(binary_predictions, idx_to_label)

        # for i in range(len(decoded_sentences)):
        #     sentence = decoded_sentences[i]
        #     for j in range(len(sentence)):
        #         decoded_word = sentence[j]
        #         encoded_word = sentences[i][j]
        #         decoded_label = decoded_labels[i][j]
        #         encoded_label = labels[i][j]
        #         decoded_prediction = decoded_predictions[i][j]
        #         encoded_prediction = binary_predictions[i][j]
        #         # if len(decoded_prediction) != 0:
        #             # print(decoded_word, decoded_label, decoded_prediction)
        #         print(decoded_word, decoded_label, decoded_prediction)
        #         print(encoded_word, encoded_label, encoded_prediction)
        #     print()
        #     break
        # foo()


        # Compute loss (takes logits as input)
        loss = criterion(
            predictions,
            labels
        )
        # loss = criterion(
        #     predictions.view(-1, predictions.shape[-1]),
        #     labels.view(-1, labels.shape[-1]),
        # )
        loss = (loss * mask).mean()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [94]:
def evaluate(model, iterator, criterion, device):
    """
    Evaluation logic with micro-F1 score
    """
    model.eval()

    epoch_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in iterator:
            sentences = batch["sentence"]
            labels = batch["label"]
            sentences, labels = sentences.to(device), labels.to(device)

            predictions = model(sentences)

            # Convert sigmoid outputs to binary labels
            binary_predictions = (torch.sigmoid(predictions) >= 0.5).float()
            sigmoid_predictions = torch.sigmoid(predictions)

            print(sentences.shape) # 32, 181
            decoded_sentences = tensor_to_sentences(sentences, idx_to_word)

            print(labels.shape) # 32, 181, 36
            decoded_labels = tensor_to_labels(labels, idx_to_label)

            print(predictions.shape) # 32, 181, 36
            decoded_predictions = tensor_to_labels(binary_predictions, idx_to_label)

            for i in range(len(decoded_sentences)):
                sentence = decoded_sentences[i]
                for j in range(len(sentence)):
                    decoded_word = sentence[j]
                    encoded_word = sentences[i][j]
                    decoded_label = decoded_labels[i][j]
                    encoded_label = labels[i][j].cpu().numpy().tolist()
                    decoded_prediction = decoded_predictions[i][j]
                    encoded_prediction = sigmoid_predictions[i][j].cpu().numpy().tolist()
                    if decoded_word != PAD:
                        print(decoded_word, decoded_label, decoded_prediction)
                        print(f"Encoded Word: {encoded_word.item()}")
                        print(f"Encoded Label:      {encoded_label}")
                        print(f"Encoded Prediction: {[round(x, 2) for x in encoded_prediction]}")
                print()
            foo()

            all_predictions.append(binary_predictions.view(-1).cpu().numpy())
            all_labels.append(labels.view(-1).cpu().numpy())

            # this loss is not necessary
            loss = criterion(
                predictions,
                labels
            )
            loss = loss.mean()
            epoch_loss += loss.item()

    # Compute micro-F1 score
    micro_f1 = f1_score(
        np.hstack(all_labels), np.hstack(all_predictions), average="micro"
    )

    return epoch_loss / len(iterator), micro_f1

In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
print(device)

cuda


In [None]:
(
    train_loader,
    val_loader,
    test_loader,
    MAX_LENGTH,
    word_to_idx,
    idx_to_word,
    label_to_idx,
    idx_to_label,
) = prepare_data_pipeline(train_file_path, test_file_path)

In [None]:
config = {}
VOCAB_SIZE = len(word_to_idx)
config["embedding_dim"] = 100
config["hidden_dim"] = 128
config["epochs"] = 100
config["lr"] = 0.001
OUTPUT_DIM = len(label_to_idx)  # Number of labels
PAD_IDX = word_to_idx[PAD]

model = SimpleRNNModel(
    VOCAB_SIZE, config["embedding_dim"], config["hidden_dim"], OUTPUT_DIM, PAD_IDX
)
model = model.to(device)
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), config["lr"])

for epoch in range(config["epochs"]):
    train_loss = train(
        model, train_loader, optimizer, criterion, device
    )
    # val_loss, micro_f1 = evaluate(model, val_loader, criterion, device)

    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.3f}")
    # print(f"\t Val. Loss: {val_loss:.3f}")
    # print(f"\t Micro-F1 Score (Val): {micro_f1:.3f}")

test_loss, test_micro_f1 = evaluate(model, test_loader, criterion, device)
print(f"\nFinal Test Loss: {test_loss:.3f}")
print(f"Micro-F1 Score (Test): {test_micro_f1:.3f}")

Epoch: 01
	Train Loss: 0.049
Epoch: 02
	Train Loss: 0.013
Epoch: 03
	Train Loss: 0.010
Epoch: 04
	Train Loss: 0.009
Epoch: 05
	Train Loss: 0.009
Epoch: 06
	Train Loss: 0.008
Epoch: 07
	Train Loss: 0.008
Epoch: 08
	Train Loss: 0.008
Epoch: 09
	Train Loss: 0.008
Epoch: 10
	Train Loss: 0.008
Epoch: 11
	Train Loss: 0.008
Epoch: 12
	Train Loss: 0.008
Epoch: 13
	Train Loss: 0.008
Epoch: 14
	Train Loss: 0.008
Epoch: 15
	Train Loss: 0.007
Epoch: 16
	Train Loss: 0.007
Epoch: 17
	Train Loss: 0.007
Epoch: 18
	Train Loss: 0.007
Epoch: 19
	Train Loss: 0.007
Epoch: 20
	Train Loss: 0.007
Epoch: 21
	Train Loss: 0.006
Epoch: 22
	Train Loss: 0.006
Epoch: 23
	Train Loss: 0.006
Epoch: 24
	Train Loss: 0.006
Epoch: 25
	Train Loss: 0.006
Epoch: 26
	Train Loss: 0.006
Epoch: 27
	Train Loss: 0.006
Epoch: 28
	Train Loss: 0.007
Epoch: 29
	Train Loss: 0.007
Epoch: 30
	Train Loss: 0.006
Epoch: 31
	Train Loss: 0.006
Epoch: 32
	Train Loss: 0.005
Epoch: 33
	Train Loss: 0.005
Epoch: 34
	Train Loss: 0.005
Epoch: 35
	Tra

NameError: ignored

In [95]:
evaluate(model, val_loader, criterion, device)

torch.Size([32, 181])
torch.Size([32, 181, 36])
torch.Size([32, 181, 36])
Epub [] []
Encoded Word: 278
Encoded Label:      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Encoded Prediction: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2018 [] []
Encoded Word: 3102
Encoded Label:      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Encoded Prediction: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Jul [] []
Encoded Word: 899
Encoded Label:      [0.0, 0.0, 0.0,

NameError: ignored