In [41]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score
import torch.nn.functional as F 
import pandas as pd

In [2]:
MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 5
MODEL_NAME = 'bert-base-uncased'
MODEL_PATH = 'ner_model_from_final1.pth'

In [3]:
def clean_tag(tag):
    # Ensure tags are in the correct format
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

In [4]:
def read_names(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        names = [name.strip().lower() for name in file.readlines()]
    return names

In [5]:
def evaluate(model, dataloader, device, tag2idx, idx2tag):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)

            # Collect the predictions and true labels for calculating F1 score and accuracy
            all_preds.extend(predictions.cpu().numpy().tolist())
            all_labels.extend(batch['labels'].cpu().numpy().tolist())

    avg_loss = total_loss / len(dataloader)

    # Flatten the lists to calculate metrics
    all_preds_flat = [p for preds in all_preds for p in preds]
    all_labels_flat = [l for labels in all_labels for l in labels]

    # Remove padding tokens, the label 0 (O), and -100 for accuracy and F1 calculation
    true_preds = [pred for pred, label in zip(all_preds_flat, all_labels_flat) if label != tag2idx['PAD'] and label != tag2idx['O'] and label != -100]
    true_labels = [label for label in all_labels_flat if label != tag2idx['PAD'] and label != tag2idx['O'] and label != -100]

    # Map indices back to tags
    true_preds_tags = [idx2tag[pred] for pred in true_preds]
    true_labels_tags = [idx2tag[label] for label in true_labels]

    # Get the list of unique tags in the dataset (excluding PAD and O)
    unique_tags = [tag for tag in tag2idx if tag != 'PAD' and tag != 'O']

    f1 = f1_score(true_labels_tags, true_preds_tags, average='weighted')
    accuracy = accuracy_score(true_labels_tags, true_preds_tags)

    print(f'Average Loss: {avg_loss}')
    print(f'F1 Score (excluding PAD and O): {f1}')
    print(f'Accuracy (excluding PAD and O): {accuracy}')
    print(classification_report(true_labels_tags, true_preds_tags, labels=unique_tags, target_names=unique_tags))

    return avg_loss, f1, accuracy, true_labels_tags, true_preds_tags

In [5]:
def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

In [6]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, max_len, tag2idx):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word_labels = self.tags[idx]
        encoding = self.tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        labels = [self.tag2idx['O']] * self.max_len  # Initialize labels with "O"
        offsets = encoding['offset_mapping'].squeeze().tolist()  # Get the offsets
        encoding.pop('offset_mapping')  # Remove offsets, not needed for model input

        idx = 0
        for i, (start, end) in enumerate(offsets):
            if start == end:  # Special tokens
                labels[i] = self.tag2idx['O']
            elif start == 0:  # Start of a new word
                if idx < len(word_labels):
                    labels[i] = self.tag2idx[word_labels[idx]]
                else:
                    labels[i] = self.tag2idx['O']
                idx += 1
            else:  # Subtoken of a word
                labels[i] = -100  # PyTorch's convention to ignore these tokens in loss computation

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(labels)
        return item

In [7]:
#New NERDataset
class NERDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, max_len, tag2idx):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word_labels = self.tags[idx]
        encoding = self.tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        labels = [self.tag2idx['O']] * self.max_len  # Initialize labels with "O"
        offsets = encoding['offset_mapping'].squeeze().tolist()  # Get the offsets
        encoding.pop('offset_mapping')  # Remove offsets, not needed for model input

        idx = 0
        for i, (start, end) in enumerate(offsets):
            if start == end:  # Special tokens
                labels[i] = self.tag2idx['O']
            elif start == 0:  # Start of a new word
                if idx < len(word_labels):
                    labels[i] = self.tag2idx[word_labels[idx]]
                else:
                    labels[i] = self.tag2idx['O']
                idx += 1
            else:  # Subtoken of a word
                labels[i] = -100  # PyTorch's convention to ignore these tokens in loss computation

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(labels)
        return item

In [7]:
character_names = read_names('./scraping_res/character_names.txt')
location_names = read_names('./scraping_res/location_names.txt')
organization_names = read_names('./scraping_res/organization_names.txt')
all_names = character_names + location_names + organization_names

In [8]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
num_added_toks = tokenizer.add_tokens(all_names)



In [10]:
train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")


In [11]:
tag_values = list(set(tag for doc in train_tags for tag in doc))
tag_values.append("PAD")
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
idx2tag = dict([(value, key) for key, value in tag2idx.items()])

In [12]:
train_data = NERDataset(train_tokens, train_tags, tokenizer, MAX_LEN, tag2idx)

In [13]:
train_data[0]

{'input_ids': tensor([  101,  2023,  2338,  2003,  1048, 12098, 21500,  2100,  4986,  2007,
         31038,  1055,  1998,  2013,  2049,  5530,  1037,  1054, 31581,  4315,
          2089, 30702,  3104,  2172,  1997,  2037, 10381, 12098,  2552,  2121,
          1998,  1037,  2210,  1997,  2037,  2381,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [14]:
def revert_tokenization(item, tokenizer, idx2tag):
    input_ids = item['input_ids']
    attention_mask = item['attention_mask']
    labels = item['labels']

    # Use attention_mask to filter out padding tokens
    filtered_ids = input_ids[attention_mask == 1]
    filtered_labels = labels[attention_mask == 1]

    # Decode the filtered ids
    decoded_sentence = tokenizer.decode(filtered_ids, skip_special_tokens=True)

    # Decode labels
    decoded_labels = [idx2tag[label.item()] for label in filtered_labels if label != -100]

    return decoded_sentence, decoded_labels

In [15]:
item = train_data[0]
reverted_sentence, reverted_labels = revert_tokenization(item, tokenizer, idx2tag)


In [16]:
print("Original Sentence:")
print(" ".join(train_tokens[0]))
print("\nReverted Sentence:")
print(reverted_sentence)
print("\nReverted Labels:")
print(reverted_labels)

Original Sentence:
this book is largely concerned with hobbits and from its pages a reader may discover much of their character and a little of their history

Reverted Sentence:
this book is l ar gely concerned with hobbit s and from its pages a r eä der may dís cover much of their ch ar acter and a little of their history

Reverted Labels:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CHAR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [17]:
reverted_sentence

'this book is l ar gely concerned with hobbit s and from its pages a r eä der may dís cover much of their ch ar acter and a little of their history'

In [25]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, max_len, tag2idx):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word_labels = self.tags[idx]
        encoding = self.tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        labels = [self.tag2idx['O']] * self.max_len  # Initialize labels with "O"
        offsets = encoding['offset_mapping'].squeeze().tolist()  # Get the offsets
        encoding.pop('offset_mapping')  # Remove offsets, not needed for model input

        idx = 0
        for i, (start, end) in enumerate(offsets):
            if start == end:  # Special tokens
                labels[i] = self.tag2idx['O']
            elif start == 0:  # Start of a new word
                if idx < len(word_labels):
                    labels[i] = self.tag2idx[word_labels[idx]]
                else:
                    labels[i] = self.tag2idx['O']
                idx += 1
            else:  # Subtoken of a word
                labels[i] = -100  # PyTorch's convention to ignore these tokens in loss computation

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(labels)
        return item

def revert_tokenization(item, tokenizer, idx2tag):
    input_ids = item['input_ids']
    attention_mask = item['attention_mask']
    labels = item['labels']

    # Use attention_mask to filter out padding tokens
    filtered_ids = input_ids[attention_mask == 1]
    filtered_labels = labels[attention_mask == 1]

    # Decode the filtered ids
    decoded_sentence = tokenizer.decode(filtered_ids, skip_special_tokens=True)

    # Decode labels
    decoded_labels = [idx2tag[label.item()] for label in filtered_labels if label != -100]

    return decoded_sentence, decoded_labels

train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
tag_values = list(set(tag for doc in train_tags for tag in doc))
tag_values.append("PAD")
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
idx2tag = dict([(value, key) for key, value in tag2idx.items()])

MAX_LEN = 128


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_data = NERDataset(train_tokens, train_tags, tokenizer, MAX_LEN, tag2idx)
number = 1
# Check the "numeer" item in the dataset
item = train_data[number]
reverted_sentence, reverted_labels = revert_tokenization(item, tokenizer, idx2tag)

print("Original Sentence:", len(train_tokens[number]))
print(" ".join(train_tokens[number]))
print('Original Labels', len(train_tags[number]))
print(" ".join(train_tags[number]))
print("\nReverted Sentence:", len(reverted_sentence))
print(reverted_sentence)
print("\nReverted Labels:", len(reverted_labels))
print(reverted_labels)


Original Sentence: 26
further information will also be found in the selection from the red book of westmarch that has already been published under the title of the hobbit
Original Labels 26
O O O O O O O O O O O O O O B-LOC O O O O O O O O O O B-CHAR

Reverted Sentence: 148
further information will also be found in the selection from the red book of westmarch that has already been published under the title of the hobbit

Reverted Labels: 28
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CHAR', 'O']


In [14]:
def print_decoded_words_with_tags(item, tokenizer, idx2tag):
    input_ids = item['input_ids']
    attention_mask = item['attention_mask']
    labels = item['labels']

    # Use attention_mask to filter out padding tokens
    filtered_ids = input_ids[attention_mask == 1]
    filtered_labels = labels[attention_mask == 1]

    # Decode the filtered ids
    decoded_tokens = tokenizer.convert_ids_to_tokens(filtered_ids)

    # Map label indices back to tag names
    decoded_labels = [idx2tag[label.item()] for label in filtered_labels if label != -100]

    for token, label in zip(decoded_tokens, decoded_labels):
        print(f"{token}: {label}")

train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
tag_values = list(set(tag for doc in train_tags for tag in doc))
tag_values.append("PAD")
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
idx2tag = dict([(value, key) for key, value in tag2idx.items()])

MAX_LEN = 128


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_data = NERDataset(train_tokens, train_tags, tokenizer, MAX_LEN, tag2idx)

# Check the first item in the dataset
item = train_data[1]

# Print each word with its tag
print_decoded_words_with_tags(item, tokenizer, idx2tag)


[CLS]: O
further: O
information: O
will: O
also: O
be: O
found: O
in: O
the: O
selection: O
from: O
the: O
red: O
book: O
of: O
west: B-LOC
##mar: O
##ch: O
that: O
has: O
already: O
been: O
published: O
under: O
the: O
title: O
of: B-CHAR
the: O


In [24]:
item = train_data[0]

In [26]:
# Assuming item is the dictionary containing 'input_ids' and 'attention_mask'
input_ids = item['input_ids']
attention_mask = item['attention_mask']

# Get the length of the input_ids tensor
length_input_ids = input_ids.size(0)

# Get the length of the attention_mask tensor
length_attention_mask = attention_mask.size(0)

print("Length of input_ids tensor:", length_input_ids)
print("Length of attention_mask tensor:", length_attention_mask)


Length of input_ids tensor: 128
Length of attention_mask tensor: 128


In [27]:
def print_lengths(item):
    input_ids = item['input_ids']
    attention_mask = item['attention_mask']
    labels = item['labels']

    # Apply the attention mask to filter out padding tokens
    filtered_input_ids = input_ids[attention_mask == 1]
    filtered_labels = labels[attention_mask == 1]

    # Get the lengths after applying the attention mask
    length_input_ids = filtered_input_ids.size(0)
    length_labels = filtered_labels.size(0)

    print("Length of input_ids tensor after applying attention mask:", length_input_ids)
    print("Length of labels tensor after applying attention mask:", length_labels)


In [28]:
print_lengths(item)

Length of input_ids tensor after applying attention mask: 29
Length of labels tensor after applying attention mask: 29


In [28]:
from torch.utils.data import Dataset
import torch
from transformers import BertTokenizer

class NERDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, max_len, tag2idx):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word_labels = self.tags[idx]
        encoding = self.tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        labels = [self.tag2idx['O']] * self.max_len  # Initialize labels with "O"
        offsets = encoding['offset_mapping'].squeeze().tolist()  # Get the offsets
        encoding.pop('offset_mapping')  # Remove offsets, not needed for model input

        idx = 0
        for i, (start, end) in enumerate(offsets):
            if start == end:  # Special tokens
                labels[i] = self.tag2idx['O']
            elif start == 0:  # Start of a new word
                if idx < len(word_labels):
                    labels[i] = self.tag2idx[word_labels[idx]]
                else:
                    labels[i] = self.tag2idx['O']
                idx += 1
            else:  # Subtoken of a word
                if labels[i - 1] == self.tag2idx['B-CHAR']:
                    labels[i] = self.tag2idx.get('I-CHAR', -100)  # Assign I-CHAR if exists
                else:
                    labels[i] = -100  # PyTorch's convention to ignore these tokens in loss computation

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(labels)
        return item

def print_decoded_words_with_tags(item, tokenizer, idx2tag):
    input_ids = item['input_ids']
    attention_mask = item['attention_mask']
    labels = item['labels']

    # Use attention_mask to filter out padding tokens
    filtered_ids = input_ids[attention_mask == 1]
    filtered_labels = labels[attention_mask == 1]

    # Decode the filtered ids
    decoded_tokens = tokenizer.convert_ids_to_tokens(filtered_ids)

    # Map label indices back to tag names
    decoded_labels = [idx2tag[label.item()] for label in filtered_labels if label != -100]

    for token, label in zip(decoded_tokens, decoded_labels):
        print(f"{token}: {label}")

train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
tag_values = list(set(tag for doc in train_tags for tag in doc))
tag_values.append("PAD")
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
idx2tag = dict([(value, key) for key, value in tag2idx.items()])

MAX_LEN = 128


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_data = NERDataset(train_tokens, train_tags, tokenizer, MAX_LEN, tag2idx)

# Check the first item in the dataset
item = train_data[1]

# Print each word with its tag
print_decoded_words_with_tags(item, tokenizer, idx2tag)


[CLS]: O
further: O
information: O
will: O
also: O
be: O
found: O
in: O
the: O
selection: O
from: O
the: O
red: O
book: O
of: O
west: B-LOC
##mar: O
##ch: O
that: O
has: O
already: O
been: O
published: O
under: O
the: O
title: O
of: B-CHAR
the: I-CHAR
ho: O


In [31]:
from torch.utils.data import Dataset
import torch
from transformers import BertTokenizer

class NERDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, max_len, tag2idx):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word_labels = self.tags[idx]
        encoding = self.tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        labels = [self.tag2idx['O']] * self.max_len  # Initialize labels with "O"
        offsets = encoding['offset_mapping'].squeeze().tolist()  # Get the offsets
        encoding.pop('offset_mapping')  # Remove offsets, not needed for model input

        idx = 0
        previous_label = self.tag2idx['O']
        for i, (start, end) in enumerate(offsets):
            if start == end:  # Special tokens
                labels[i] = self.tag2idx['O']
            elif start == 0:  # Start of a new word
                if idx < len(word_labels):
                    labels[i] = self.tag2idx[word_labels[idx]]
                    previous_label = labels[i]
                else:
                    labels[i] = self.tag2idx['O']
                    previous_label = labels[i]
                idx += 1
            else:  # Subtoken of a word
                if previous_label == self.tag2idx['B-CHAR'] or previous_label == self.tag2idx['I-CHAR']:
                    labels[i] = self.tag2idx.get('I-CHAR', -100)
                else:
                    labels[i] = -100  # PyTorch's convention to ignore these tokens in loss computation

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(labels)
        return item

def print_decoded_words_with_tags(item, tokenizer, idx2tag):
    input_ids = item['input_ids']
    attention_mask = item['attention_mask']
    labels = item['labels']

    # Decode the input_ids
    decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Map label indices back to tag names
    decoded_labels = [idx2tag[label.item()] if label != -100 else 'IGN' for label in labels]

    for token, label, mask in zip(decoded_tokens, decoded_labels, attention_mask):
        if mask == 1:
            print(f"{token}: {label}")



In [39]:
train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
tag_values = list(set(tag for doc in train_tags for tag in doc))
tag_values.append("PAD")
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
idx2tag = dict([(value, key) for key, value in tag2idx.items()])
MAX_LEN = 128


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_data = NERDataset(train_tokens, train_tags, tokenizer, MAX_LEN, tag2idx)
# Check the first item in the dataset
item = train_data[1]

# Print each word with its tag
print_decoded_words_with_tags(item, tokenizer, idx2tag)


[CLS]: O
further: O
information: O
will: O
also: O
be: O
found: O
in: O
the: O
selection: O
from: O
the: O
red: O
book: O
of: O
west: B-LOC
##mar: IGN
##ch: IGN
that: O
has: O
already: O
been: O
published: O
under: O
the: O
title: O
of: O
the: O
ho: B-CHAR
##bb: I-CHAR
##it: I-CHAR
[SEP]: O


In [2]:
data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
        'word_labels': [",".join(tags) for tags in train_tags]}

df = pd.DataFrame(data)

NameError: name 'train_tokens' is not defined

In [1]:
df.iloc[0]

NameError: name 'df' is not defined

In [53]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_pretokenized=True,
                             return_offsets_mapping=True,
                             padding='max_length',
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [54]:
training_set = dataset(df, tokenizer, MAX_LEN)

In [55]:
training_set[0]

TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'is_pretokenized'

In [51]:
train_tokens[0]

['this',
 'book',
 'is',
 'largely',
 'concerned',
 'with',
 'hobbits',
 'and',
 'from',
 'its',
 'pages',
 'a',
 'reader',
 'may',
 'discover',
 'much',
 'of',
 'their',
 'character',
 'and',
 'a',
 'little',
 'of',
 'their',
 'history']

In [52]:
train_tags[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-CHAR',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [38]:
print(train_data[0])

{'input_ids': tensor([  101,  2023,  2338,  2003,  4321,  4986,  2007,  7570, 10322, 12762,
         1998,  2013,  2049,  5530,  1037,  8068,  2089,  7523,  2172,  1997,
         2037,  2839,  1998,  1037,  2210,  1997,  2037,  2381,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [None]:
reverted_sentence[2]

In [None]:
tag2idx

In [None]:
len(tag2idx)

In [None]:
tag_values

In [None]:
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(tag_values))

In [None]:
if num_added_toks > 0:
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# Step 1: Load the test data
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")

# Step 2: Create a DataLoader for the test data
test_data = NERDataset(test_tokens, test_tags, tokenizer, MAX_LEN, tag2idx)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

In [None]:
model.load_state_dict(torch.load('ner_model_from_final.pth'))

In [None]:
def predict(model, dataloader, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=2)

            predictions.extend(preds.cpu().numpy())

    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predictions = predict(model, test_loader, device)


In [None]:
idx2tag

In [None]:
predicted_tags = [[idx2tag[idx] for idx in pred] for pred in predictions]


In [None]:
# Write predictions to the file
def write_predictions_to_file(predictions, tokens, filename):
    with open(filename, 'w') as f:
        for i, sentence in enumerate(tokens):
            for j, word in enumerate(sentence):
                pred_tag = predictions[i][j]
                f.write(f"{j+1}\t{word}\t{pred_tag}\n")
            f.write("\n")

In [None]:
write_predictions_to_file(predicted_tags, test_tokens, 'predictions.iob2')

In [None]:
avg_loss, f1, accuracy, true_labels_tags, true_preds_tags = evaluate(model, test_loader, device, tag2idx, idx2tag)

In [None]:
print(f"Final Test Loss: {avg_loss}")
print(f"Final Test F1 Score: {f1}")
print(f"Final Test Accuracy: {accuracy}")

In [None]:
model.load_state_dict(torch.load('ner_model_from_final.pth'))

In [None]:
model.eval()
total_loss = 0
all_preds = []
all_labels = []

In [None]:
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)

        # Collect the predictions and true labels for calculating F1 score and accuracy
        all_preds.extend(predictions.cpu().numpy().tolist())
        all_labels.extend(batch['labels'].cpu().numpy().tolist())

In [None]:
all_labels_flat = [l for labels in all_labels for l in labels]

In [None]:
for batch in test_loader:
    print(batch)