In [5]:
import pytorch_lightning as pl
import torch
import wandb
from datasets import load_dataset
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from torch.optim import Adam
from transformers import BertForSequenceClassification, BertForTokenClassification
from transformers import BertTokenizer

from thegreatknowledgeheist.models.bert import SentimentBert


In [7]:
dataset = load_dataset("acronym_identification")

Using custom data configuration default
Reusing dataset acronym_identification (/home/maria/.cache/huggingface/datasets/acronym_identification/default/1.0.0/e84facf8db848a4c7aa58addbebaf8a161c4146ca367e923ca972673cc915425)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
def tokenize_and_preserve_labels(row, tokenizer):

    tokenized_sentence = []
    labels = []

    for word, label in zip(row['tokens'], row['labels']):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word.lower())
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)
    row['labels'] = labels
    return row

In [10]:
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    return_tensors="pt",
    do_lower_case=True
)

In [11]:
dataloaders = {}
# test jest kijowy
for type in ["train", "validation"]:
    data = dataset[type]
    data = data.map(lambda x: tokenize_and_preserve_labels(x, tokenizer))
    encoded_data = data.map(
        lambda row: tokenizer(" ".join(row["tokens"]), padding="max_length", max_length=50, truncation=True)
    )
    encoded_data = encoded_data.map(
        lambda row: {"labels": row["labels"][: 50]
            + [0 for _ in range(50 - len(row["labels"]))]}
    )
    encoded_data.set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
    )
    dataloaders[type] = torch.utils.data.DataLoader(
        encoded_data, batch_size=16, num_workers=8
    )

Loading cached processed dataset at /home/maria/.cache/huggingface/datasets/acronym_identification/default/1.0.0/e84facf8db848a4c7aa58addbebaf8a161c4146ca367e923ca972673cc915425/cache-5a5f1ed11d73e260.arrow
Loading cached processed dataset at /home/maria/.cache/huggingface/datasets/acronym_identification/default/1.0.0/e84facf8db848a4c7aa58addbebaf8a161c4146ca367e923ca972673cc915425/cache-a438d046ddaa5cc1.arrow
Loading cached processed dataset at /home/maria/.cache/huggingface/datasets/acronym_identification/default/1.0.0/e84facf8db848a4c7aa58addbebaf8a161c4146ca367e923ca972673cc915425/cache-f3ab26e3078ccb76.arrow
Loading cached processed dataset at /home/maria/.cache/huggingface/datasets/acronym_identification/default/1.0.0/e84facf8db848a4c7aa58addbebaf8a161c4146ca367e923ca972673cc915425/cache-543ecd6df9c45d59.arrow
Loading cached processed dataset at /home/maria/.cache/huggingface/datasets/acronym_identification/default/1.0.0/e84facf8db848a4c7aa58addbebaf8a161c4146ca367e923ca972673cc9

In [12]:
model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased", num_labels=5
        )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [15]:
for b in dataloaders['validation']:
    out = model(**b)
    loss, logits = out[:2]
    predicted_token_class_ids = logits.argmax(-1)
    print()
    print(loss)
    print(predicted_token_class_ids == b['labels'])
    break


tensor(1.6214, grad_fn=<NllLossBackward0>)
tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False,  True,  True,  True,  True,
         False,  True, False,  True, False, False, False,  True, False, False],
        [False, False, False, False, False, False, False, False, False, False,
          True,  True, False, False,  True, False, False, False, False,  True,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True, False,  True, False, False,  True, False],
        [ True,  True, False,  True, False, False, False, False, False,  True,
         False, False, False, False, False, False, False, False, False, False,
      

In [None]:
# https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=dyAL9FIRRcTg

In [19]:
from pathlib import Path
from thegreatknowledgeheist.data import get_dataloaders
GPUS = 1
NUM_WORKERS = 8
BATCH_SIZE = 8
LR = 0.001
EPS = 1e-8
MAX_EPOCHS = 50
PROJECT_ROOT = Path(
    "/home/maria/Documents/TheGreatKnowledgeHeist/thegreatknowledgeheist"
)
dataloaders = get_dataloaders(
    dataset_name="acronym_identification",
    path_to_dataset=str(PROJECT_ROOT / "data"),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

In [24]:
for b in dataloaders['train']:
    out = model(**b)
    loss, logits = out[:2]
    predicted_token_class_ids = logits.argmax(-1)
    print()
    print(loss)
    print(predicted_token_class_ids == b['labels'])
    break


tensor(1.5436, grad_fn=<NllLossBackward0>)
tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False,  True],
        [False, False, False,  ...,  True,  True, False],
        [False,  True, False,  ...,  True,  True,  True]])
