In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, load_metric
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

In [2]:
dataset = load_dataset("surrey-nlp/PLOD-CW")
# Vocab dictionary
word_index = {}
# Model parameters
embedding_dim = 300
hidden_dim = 300

In [3]:
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

In [4]:
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

In [5]:
def encode_label(tags):
    global label_encoding
    return [label_encoding[tag] for tag in tags]

def build_vocab(dataset):
    global  word_index
    for item in dataset['tokens']:
        for word in item:
            if word not in word_index:
                word_index[word] = len(word_index)
    return word_index

In [6]:
word_index = build_vocab(train_dataset)
word_index = build_vocab(val_dataset)
word_index = build_vocab(test_dataset)

In [7]:
# Define the LSTM model
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tags):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tags)

    def forward(self, sentence):
        embeds = self.embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.fc(lstm_out)
        return torch.log_softmax(lstm_out, dim=-1)

In [8]:
# Encode strings amd return tensor
def encode_sequence(seq, encoder=None):
    if type(seq[0]) == int:
        encoded_ids = seq
    else:
        encoded_ids = [encoder[word] for word in seq]
    return torch.tensor(encoded_ids, dtype=torch.long)

In [9]:
# Evaluate the model
def evaluate(model, token_dataset, word_index):
    global label_encoding
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for i, datapoint in enumerate(token_dataset):
            inputs = encode_sequence(datapoint['tokens'], word_index)
            targets = encode_sequence(encode_label(datapoint['ner_tags']))
            tag_scores = model(inputs.unsqueeze(0))
            
            predicted_tags = tag_scores.max(2)[1].squeeze().tolist()
            y_pred.extend(predicted_tags)
            y_true.extend(targets.tolist())    

    # Detailed performance metrics
    labels_indices = list(label_encoding.values())
    labels_names = list(label_encoding.keys())
    print(classification_report(y_true, y_pred, labels=labels_indices, target_names=labels_names))

    overall_f1 = f1_score(y_true, y_pred, average= 'macro')
    overall_precision = precision_score(y_true, y_pred, average='macro')
    overall_recall = recall_score(y_true, y_pred, average='macro')
    print(f'Overall F1 Score: {overall_f1}')
    print(f'Overall Precision Score: {overall_precision}')
    print(f'Overall Recall Score: {overall_recall}')

In [36]:
# Training the model
model = LSTM(embedding_dim, hidden_dim, len(word_index), 4)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.9)

for epoch in range(6):
    for i, datapoint in enumerate(train_dataset):
        model.zero_grad()
        sentence_in = encode_sequence(datapoint['tokens'], word_index)
        targets = encode_sequence(encode_label(datapoint['ner_tags']))
        tag_scores = model(sentence_in.unsqueeze(0))
        loss = loss_function(tag_scores.squeeze(0), targets)
        loss.backward()
        optimizer.step()
        # if i == 199:break
    
    # Model Validation
    model.eval()
    with torch.no_grad():
        for val_datapoint in val_dataset:
            val_inputs = encode_sequence(val_datapoint['tokens'], word_index)
            val_targets = encode_sequence(encode_label(val_datapoint['ner_tags']))
            val_tag_scores = model(val_inputs.unsqueeze(0))
            val_loss = loss_function(val_tag_scores.squeeze(0), val_targets)
            
    model.train()
    
# Evaluate on the test set
evaluate(model, test_dataset, word_index)

              precision    recall  f1-score   support

         B-O       0.90      0.95      0.93      4292
        B-AC       0.63      0.35      0.45       270
        B-LF       0.32      0.21      0.26       150
        I-LF       0.52      0.41      0.46       288

    accuracy                           0.87      5000
   macro avg       0.59      0.48      0.52      5000
weighted avg       0.85      0.87      0.85      5000

Overall F1 Score: 0.5218138803785223
Overall Precision Score: 0.5935364863400443
Overall Recall Score: 0.4798173065306686


In [13]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset
# from datasets import load_dataset, load_metric
# import numpy as np
# from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

# dataset = load_dataset("surrey-nlp/PLOD-CW")
# # Model parameters
# embedding_dim = 300
# hidden_dim = 300

# short_dataset = dataset["train"][:200]
# train_dataset = dataset["train"]
# val_dataset = dataset["validation"]
# test_dataset = dataset["test"]

# label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

# label_list = []
# for sample in short_dataset["ner_tags"]:
#     label_list.append([label_encoding[tag] for tag in sample])

# val_label_list = []
# for sample in val_dataset["ner_tags"]:
#     val_label_list.append([label_encoding[tag] for tag in sample])

# test_label_list = []
# for sample in test_dataset["ner_tags"]:
#     test_label_list.append([label_encoding[tag] for tag in sample])


# # Step up dictionary
# word_index = {}
# def word_to_index(dataset):
#     global word_index
    
#     for item in dataset['tokens']:
#         for word in item:
#             if word not in word_index:
#                 word_index[word] = len(word_index) 


# word_to_index(train_dataset)
# word_to_index(val_dataset)
# word_to_index(test_dataset)


# # Define the LSTM model
# class LSTM(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, vocab_size, tags):
#         super(LSTM, self).__init__()
#         self.hidden_dim = hidden_dim
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, tags)

#     def forward(self, sentence):
#         embeds = self.embeddings(sentence)
#         lstm_out, _ = self.lstm(embeds)
#         lstm_out = self.fc(lstm_out)
#         return torch.log_softmax(lstm_out, dim=-1)


# # Encode strings amd return tensor
# def encode_sequence(seq, encoder=None):
#     if type(seq[0]) == int:
#         encoded_ids = seq
#     else:
#         encoded_ids = [encoder[word] for word in seq]
#     return torch.tensor(encoded_ids, dtype=torch.long)


# # Evaluate the model
# def evaluate(model, test_data, test_tags, word_index):
#     global label_encoding
#     model.eval()
#     y_true = []
#     y_pred = []
    
#     with torch.no_grad():
#         for i in range(len(test_data)):
#             inputs = encode_sequence(test_data[i], word_index)
#             targets = encode_sequence(test_tags[i])
#             tag_scores = model(inputs.unsqueeze(0))

#             predicted_tags = tag_scores.max(2)[1].squeeze().tolist()
#             y_pred.extend(predicted_tags)
#             y_true.extend(targets.tolist())    

#     # Detailed performance metrics
#     labels_indices = list(label_encoding.values())  # Numerical indices for the tags
#     labels_names = list(label_encoding.keys())      # Corresponding names for the indices
#     print(classification_report(y_true, y_pred, labels=labels_indices, target_names=labels_names))

#     overall_f1 = f1_score(y_true, y_pred, average='macro')
#     overall_precision = precision_score(y_true, y_pred, average='macro')
#     overall_recall = recall_score(y_true, y_pred, average='macro')
#     print(f'Overall F1 Score: {overall_f1}')
#     print(f'Overall Precision Score: {overall_precision}')
#     print(f'Overall Recall Score: {overall_recall}')


# # Training the model
# model = LSTM(embedding_dim, hidden_dim, len(word_index), len(label_encoding))
# loss_function = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.1)

# for epoch in range(10):
#     for i in range(len(short_dataset['tokens'])):
#         model.zero_grad()
#         sentence_in = encode_sequence(short_dataset['tokens'][i], word_index)
#         targets = encode_sequence(label_list[i])
#         tag_scores = model(sentence_in.unsqueeze(0))
#         loss = loss_function(tag_scores.squeeze(0), targets)
#         loss.backward()
#         optimizer.step()

# # Evaluate on the test set
# evaluate(model, test_dataset['tokens'], test_label_list,word_index)