In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install torch
!pip install seqeval  # for evaluation metrics
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets

Mounted at /content/drive
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_

In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

def iob_tagging(text, annotations):
    sentences = sent_tokenize(text)
    all_tokens = []
    all_tags = []

    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tags = ['O'] * len(tokens)
        sentence_start = text.index(sentence)
        token_positions = []
        position = sentence_start
        for token in tokens:
            position = text.find(token, position)
            token_positions.append((position, position + len(token)))
            position += len(token)

        for annotation in annotations:
            start, end = annotation['start'], annotation['end']
            label = annotation['tag']
            start_token = next((i for i, pos in enumerate(token_positions) if pos[0] <= start < pos[1]), None)
            end_token = next((i for i, pos in enumerate(token_positions) if pos[0] < end <= pos[1]), None)

            if start_token is not None and end_token is not None and start_token < len(tags) and end_token < len(tags):
                tags[start_token] = f'B-{label}'
                for i in range(start_token + 1, end_token + 1):
                    tags[i] = f'I-{label}'

        all_tokens.append(tokens)
        all_tags.append(tags)

    return all_tokens, all_tags

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
with open('/content/drive/MyDrive/just-citation-aufl_annotations.json', 'r') as file:
    data_aufl = json.load(file)

with open('/content/drive/MyDrive/just-citation-checking_annotations.json', 'r') as file:
    data_sentences = json.load(file)

with open('/content/drive/MyDrive/new_just-citation-aufl_annotations.json', 'r') as file:
    new_data_aufl = json.load(file)

with open('/content/drive/MyDrive/new_just-citation-checking_annotations.json', 'r') as file:
    new_data_sentences = json.load(file)

texts = []
sentences = []
entities = []

label_list = ['O', 'B-citation', 'I-citation']
label_map = {label: i for i, label in enumerate(label_list)}

tag2id = {"O": 0, "B-citation": 1, "I-citation": 2}
id2tag = {0: "O", 1: "B-citation", 2: "I-citation"}

loopz = [data_aufl, data_sentences, new_data_aufl, new_data_sentences]

for i in loopz:
  for document in i['examples']:
    if document['annotations'] != []:
      text = document['content']
      annotations = document['annotations']
      if annotations != []:
        token_lists, tag_lists = iob_tagging(text, annotations)
        flattened_token_lists = [item for row in token_lists for item in row]
        flattened_tag_lists = [tagz for columnz in tag_lists for tagz in columnz]
        sentences.append(flattened_token_lists)
        entities.append(flattened_tag_lists)
        texts.append(text)

tokenized_sentences = sentences
iob_tags = entities

tags_flattened = [item for row in iob_tags for item in row]

In [None]:
# post-labeling-program data

with open('/content/drive/MyDrive/aufl_just_citation_data.json', 'r') as file:
  post_labeling_aufl = json.load(file)

with open('/content/drive/MyDrive/sentences_just_citation_data.json', 'r') as file:
  post_labeling_sentences = json.load(file)

for i in post_labeling_aufl['sentences']:
  texts.append(i)

for i in post_labeling_aufl['tokenized_sentence']:
  tokenized_sentences.append(i)

for i in post_labeling_aufl['predicted_labels']:
  iob_tags.append(i)

print(len(texts))
print(len(tokenized_sentences))
print(len(iob_tags))

for i in post_labeling_sentences['sentences']:
  texts.append(i)

for i in post_labeling_sentences['tokenized_sentence']:
  tokenized_sentences.append(i)

for i in post_labeling_sentences['predicted_labels']:
  iob_tags.append(i)

print(len(texts))
print(len(tokenized_sentences))
print(len(iob_tags))

466
466
466
779
779
779


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report, accuracy_score
!pip install psutil transformers datasets
import psutil
import time
from datasets import load_metric

# custom callback for memory usage
class MemoryUsageCallback:
    def __init__(self):
        self.process = psutil.Process()
        self.mem_usage = []

    def on_epoch_end(self, args, state, control, **kwargs):
        mem_info = self.process.memory_info().rss / 1024 ** 2  # in MB
        self.mem_usage.append(mem_info)
        print(f'Epoch {state.epoch} - Memory Usage: {mem_info:.2f} MB')

    def on_train_end(self, args, state, control, **kwargs):
        print(f'Max Memory Usage: {max(self.mem_usage):.2f} MB')

memory_callback = MemoryUsageCallback()



In [None]:
class NERDataset(Dataset):
    def __init__(self, texts, tags, tokenizer, max_len):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tags = self.tags[idx]
        encoding = self.tokenizer(text, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len)
        labels = [tag2id[tag] if tag in tag2id else -100 for tag in tags]
        encoded_labels = [-100] * len(encoding['input_ids'])

        i = 0
        for idx, offset in enumerate(encoding['offset_mapping']):
            if offset[0] == 0 and i < len(tags):
                encoded_labels[idx] = labels[i]
                i += 1

        encoding['labels'] = encoded_labels
        del encoding['offset_mapping']
        return {key: torch.tensor(val) for key, val in encoding.items()}

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(tag2id))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.model_selection import train_test_split

train_texts, remaining_texts, train_tags, remaining_tags = train_test_split(tokenized_sentences, iob_tags, test_size=0.2, random_state=42)

print(train_tags)
print(remaining_tags)

val_texts, test_texts, val_tags, test_tags = train_test_split(remaining_texts, remaining_tags, test_size=0.5, random_state=42)

train_dataset = NERDataset(train_texts, train_tags, tokenizer, max_len=128)
val_dataset = NERDataset(val_texts, val_tags, tokenizer, max_len=128)
test_dataset = NERDataset(test_texts, test_tags, tokenizer, max_len=128)
eval_dataset = test_dataset

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

[['B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
from datasets import load_metric

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir='./logs',
    logging_steps=10,
)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

metric = load_metric("seqeval")

class CustomTrainer(Trainer):
    def __init__(self, memory_callback, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.memory_callback = memory_callback

    def train(self, resume_from_checkpoint=None, trial=None, **kwargs):
        super().train(resume_from_checkpoint, trial, **kwargs)
        self.memory_callback.on_train_end(self.args, self.state, self.control)

    def evaluation_loop(self, *args, **kwargs):
        eval_dataloader = self.get_eval_dataloader()
        self.memory_callback.on_epoch_end(self.args, self.state, self.control)
        return super().evaluation_loop(*args, **kwargs)

trainer = CustomTrainer(
    memory_callback=memory_callback,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
trainer.train()

Step,Training Loss
10,1.2133
20,1.1376
30,1.025
40,0.8909
50,0.839
60,0.758
70,0.6944
80,0.7201
90,0.5731
100,0.5598


ValueError: max() arg is an empty sequence

In [None]:
from sklearn.metrics import classification_report

results = trainer.evaluate()
print(f"Testing accuracy: {results['eval_accuracy']}")

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    out_pred_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != -100:
                out_label_list[i].append(label_list[label_ids[i][j]])
                out_pred_list[i].append(label_list[preds[i][j]])

    return out_pred_list, out_label_list

predictions, labels, _ = trainer.predict(eval_dataset)
pred_tags, true_tags = align_predictions(predictions, labels)

flat_pred_tags = [tag for pred in pred_tags for tag in pred]
flat_true_tags = [tag for true in true_tags for tag in true]

print(classification_report(flat_true_tags, flat_pred_tags))

Testing accuracy: 0.9053826745164003
              precision    recall  f1-score   support

  B-citation       0.74      0.56      0.63        90
  I-citation       0.92      0.94      0.93      1264
           O       0.90      0.90      0.90      1024

    accuracy                           0.91      2378
   macro avg       0.85      0.80      0.82      2378
weighted avg       0.90      0.91      0.90      2378



In [None]:
def combine_bio_tags(labels):
    combined_labels = []
    for label in labels:
        if label.startswith('B-') or label.startswith('I-'):
            combined_labels.append(label[2:])
        else:
            combined_labels.append(label)
    return combined_labels

true_labels = flat_true_tags
true_labels_combined = combine_bio_tags(true_labels)
predicted_labels_combined = combine_bio_tags(flat_pred_tags)

report = classification_report(true_labels_combined, predicted_labels_combined)
print(report)

              precision    recall  f1-score   support

           O       0.90      0.90      0.90      1024
    citation       0.92      0.92      0.92      1354

    accuracy                           0.91      2378
   macro avg       0.91      0.91      0.91      2378
weighted avg       0.91      0.91      0.91      2378

