In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.metrics import classification_report
import numpy as np
from sklearn.model_selection import train_test_split

In [10]:
MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 1
MODEL_NAME = 'bert-base-uncased'

In [11]:
def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(parts[2])
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels


In [12]:
def tokenize_and_preserve_labels(token_docs, tag_docs, tokenizer):
    tokenized_texts, labels = [], []
    for word_list, label_list in zip(token_docs, tag_docs):
        tmp_tokens, tmp_labels = [], []
        for word, label in zip(word_list, label_list):
            tokenized_word = tokenizer.tokenize(word)
            n_subwords = len(tokenized_word)
            tmp_tokens.extend(tokenized_word)
            tmp_labels.extend([label] * n_subwords)
        tokenized_texts.append(tmp_tokens)
        labels.append(tmp_labels)
    for tokens, lbls in zip(tokenized_texts, labels):
        assert len(tokens) == len(lbls), f"Token and label length mismatch: {len(tokens)} tokens, {len(lbls)} labels"
    return tokenized_texts, labels

In [13]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, max_len, tag2idx):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word_labels = self.tags[idx]
        encoding = self.tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        labels = [self.tag2idx['O']] * self.max_len  # Initialize labels with "O"
        offsets = encoding['offset_mapping'].squeeze().tolist()  # Get the offsets
        encoding.pop('offset_mapping')  # Remove offsets, not needed for model input

        idx = 0
        for i, (start, end) in enumerate(offsets):
            if start == end:  # Special tokens
                labels[i] = self.tag2idx['O']
            elif start == 0:  # Start of a new word
                if idx < len(word_labels):
                    labels[i] = self.tag2idx[word_labels[idx]]
                else:
                    labels[i] = self.tag2idx['O']
                idx += 1
            else:  # Subtoken of a word
                labels[i] = -100  # PyTorch's convention to ignore these tokens in loss computation

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(labels)
        return item

In [14]:
def read_names(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        names = [name.strip().lower() for name in file.readlines()]
    return names

In [15]:
# Read the names for additional tokens
character_names = read_names('./scraping_res/character_names.txt')
location_names = read_names('./scraping_res/location_names.txt')
organization_names = read_names('./scraping_res/organization_names.txt')
all_names = character_names + location_names + organization_names


In [16]:
# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
num_added_toks = tokenizer.add_tokens(all_names)




In [17]:
num_added_toks

1970

In [18]:
# Initialize the model
train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
tag_values = list(set(tag for doc in train_tags for tag in doc))
tag_values.append("PAD")
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(tag2idx))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
idx2tag

{0: 'B-ORG',
 1: 'O',
 2: 'I-LOC',
 3: 'B-LOC',
 4: 'B-CHAR',
 5: 'I-CHAR',
 6: 'PAD'}

In [24]:
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")

# Combine train and test tags to create a comprehensive tag set
all_tags = set(tag for doc in train_tags for tag in doc).union(set(tag for doc in test_tags for tag in doc))
all_tags.add("PAD")  # Add the PAD tag

In [25]:
print(f"All tags: {all_tags}")

All tags: {'B-ORG', 'O', 'I-LOC', 'I--LOC', 'B--ORG', 'B-LOC', 'B--LOC', 'B-CHAR', 'PAD', 'I-CHAR'}


In [11]:
if num_added_toks > 0:
    model.resize_token_embeddings(len(tokenizer))


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [13]:
print(device)

cuda


In [14]:
def train_and_evaluate(dataset_class, tokenizer, tag2idx, idx2tag, train_tokens, train_tags, size):
    train_tokens_subset, _, train_tags_subset, _ = train_test_split(train_tokens, train_tags, train_size=size, random_state=42)
    train_data = dataset_class(train_tokens_subset, train_tags_subset, tokenizer, MAX_LEN, tag2idx)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE)
    
    model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(tag2idx))
    if num_added_toks > 0:
        model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=3e-5)

    model.train()
    for epoch in range(EPOCHS):
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if step % 10 == 0:
                print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.item()}")

    test_sentences, test_labels = read_data("./tagged_sentences_test.iob2")
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for sentence, label in zip(test_sentences, test_labels):
            inputs = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LEN)
            labels = [tag2idx[lbl] for lbl in label]
            labels = labels[:MAX_LEN] + [tag2idx['O']] * (MAX_LEN - len(labels))
            labels = torch.tensor(labels).unsqueeze(0).to(device)

            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()

    average_loss = total_loss / len(test_sentences)
    return average_loss


In [15]:
def generate_learning_curve(dataset_class, tokenizer, tag2idx, idx2tag):
    train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
    sizes = [0.1, 0.25, 0.5, 0.75, 1.0]
    losses = []
    for size in sizes:
        print(f"Training with {int(size*100)}% of the data")
        loss = train_and_evaluate(dataset_class, tokenizer, tag2idx, idx2tag, train_tokens, train_tags, size)
        losses.append(loss)
    plt.figure(figsize=(10, 6))
    plt.plot([size*100 for size in sizes], losses, marker='o')
    plt.title('Learning Curve')
    plt.xlabel('Training Data Size (%)')
    plt.ylabel('Average Loss')
    plt.grid(True)
    plt.show()


In [16]:
generate_learning_curve(NERDataset, tokenizer, tag2idx, idx2tag)

Training with 10% of the data


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0, Loss: 1.909500002861023
Epoch 1, Step 10, Loss: 0.06149842590093613
Epoch 1, Step 20, Loss: 0.0300990492105484
Epoch 1, Step 30, Loss: 0.03399265557527542


ValueError: Expected input batch_size (5916) to match target batch_size (174).