In [None]:
# !pip3 install datasets
# !pip3 install rouge_score
# !pip3 install git+https://github.com/huggingface/transformers
# !pip3 install sentencepiece
# !pip3 install torch
# !pip3 install transformers
# !pip install --upgrade transformers
# !pip install --upgrade datasets
# !pip install tensorflow
# !pip install ipywidgets

In [None]:
# Model card for the checkpoint used in this notebook: https://huggingface.co/yikuan8/Clinical-Longformer

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments
from torch.utils.data import DataLoader
import torch
import os
import numpy as np
import pandas as pd

In [None]:
# Training hyper-parameters and sequence-length limits shared by the cells below.
# Learning rate handed to TrainingArguments(learning_rate=...).
lr = 3e-5  # from paper
# Number of examples passed to the tokenisation function per Dataset.map() call.
batch_size = 32
# Length (in tokens) that articles AND lay summaries are padded/truncated to
# inside process_data_to_model_inputs.
max_input_length = 4096
# NOTE(review): not referenced by any other cell visible in this notebook.
max_output_length = 1024

In [None]:
# Hugging Face Hub identifier; its last path component is reused later as the run name.
model_checkpoint = "yikuan8/Clinical-Longformer"
# Tokenizer used for both the articles and the lay summaries.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Load the model with a masked-language-modelling head from a local directory.
# NOTE(review): the tokenizer is loaded from the hub id in `model_checkpoint`, while the
# weights come from "../Clinical-Longformer" — confirm the local copy is the same checkpoint.
model = AutoModelForMaskedLM.from_pretrained("../Clinical-Longformer")

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this notebook.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of articles and lay summaries into model features.

    Reads ``batch["article"]`` and ``batch["lay_summary"]`` (lists of strings)
    and adds four keys to ``batch``:

    * ``input_ids`` / ``attention_mask`` -- tokenized articles, padded and
      truncated to ``max_input_length``.
    * ``global_attention_mask`` -- one row per example, ``1`` at position 0
      and ``0`` everywhere else.
    * ``labels`` -- tokenized lay summaries with every pad token id replaced
      by ``-100``.

    The (mutated) ``batch`` is returned so the function can be used with
    ``Dataset.map(..., batched=True)``.

    NOTE(review): the summaries are padded to ``max_input_length`` rather
    than ``max_output_length``; presumably this keeps labels the same length
    as the inputs for the masked-LM head -- confirm before changing.
    """
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["lay_summary"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every example only.  Each row is
    # built as its own list (no shared references), so a later per-row edit
    # cannot leak into other rows, and an empty batch simply yields [].
    batch["global_attention_mask"] = [
        [1] + [0] * (len(ids) - 1) if ids else []
        for ids in batch["input_ids"]
    ]

    # We have to make sure that the PAD token is ignored
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in row]
        for row in outputs.input_ids
    ]

    return batch

In [None]:
def load_article_dataset(dtype, filename, directory, nrows=100):
    """Load one split of the article corpus from a JSON-lines file.

    The file is expected at ``<directory>/<dtype>/<filename>_<dtype>.jsonl``.

    Args:
        dtype: Split name used both as sub-directory and file suffix
            (e.g. ``'train'`` or ``'val'``).
        filename: Corpus prefix of the file name (e.g. ``'eLife'``).
        directory: Root directory that contains the per-split folders.
        nrows: Maximum number of lines to read.  Defaults to 100, which is
            the cap this notebook has always applied; pass ``None`` to read
            the whole file.

    Returns:
        A ``datasets.Dataset`` built from the loaded DataFrame.
    """
    path = os.path.join(directory, dtype, f'{filename}_{dtype}.jsonl')
    df = pd.read_json(path, lines=True, nrows=nrows)
    return Dataset.from_pandas(df)

def create_article_dataset_dict(filename, directory, dataset_types=('train', 'val')):
    """Load, tokenize and format every requested split of the corpus.

    For each split the raw dataset is loaded with ``load_article_dataset``,
    run through ``process_data_to_model_inputs`` in batches of ``batch_size``
    and then restricted to torch tensors for the four model feature columns.

    Args:
        filename: Corpus prefix of the JSON-lines files (e.g. ``'eLife'``).
        directory: Root directory that contains the per-split folders.
        dataset_types: Split names to load.  Defaults to ``('train', 'val')``.

    Returns:
        A ``DatasetDict`` keyed by split name.
    """
    raw_text_columns = ("article", "lay_summary", "headings")
    feature_columns = ['input_ids', 'attention_mask', 'global_attention_mask', 'labels']

    datasets = {}
    for dtype in dataset_types:
        dataset = load_article_dataset(dtype, filename, directory)
        # Only ask map() to drop columns that are really present: naming a
        # missing column (e.g. a corpus without "headings") makes it raise.
        removable = [col for col in raw_text_columns if col in dataset.column_names]
        dataset = dataset.map(
            process_data_to_model_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=removable,
        )
        # Expose only the model features, as torch tensors.
        dataset.set_format(type='torch', columns=feature_columns)
        datasets[dtype] = dataset
    return DatasetDict(datasets)

In [None]:
# Corpus prefix and data root; the loader reads
# <directory>/<split>/<filename>_<split>.jsonl for each split.
filename = "eLife"
directory = "../data/task1_development/"
# Tokenized DatasetDict holding the splits used for training and evaluation.
article_dataset = create_article_dataset_dict(filename, directory)

In [None]:
# NOTE(review): the TrainingArguments cell passes the literal num_train_epochs=3,
# so this value is not what training uses — confirm which one is intended.
num_train_epochs = 8
# Last path component of the hub id, i.e. "Clinical-Longformer"; used as the run name.
model_name = model_checkpoint.split("/")[-1]

In [None]:
from random import seed

def set_seed(seed_v: int = 42) -> None:
    """Seed NumPy, Python's ``random``, and PyTorch (CPU and CUDA) with one value.

    Also exports the value as ``PYTHONHASHSEED`` and prints a confirmation.
    The environment variable is read by Python at interpreter start-up, so
    it influences processes launched afterwards rather than the running one.

    Args:
        seed_v: Seed applied to every generator. Defaults to 42.
    """
    # Same order as always: NumPy, stdlib random, torch CPU, torch CUDA.
    seeders = (np.random.seed, seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed_v)

    # Optional CuDNN determinism switches, intentionally left disabled:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    # Fixed hash seed for child processes.
    os.environ["PYTHONHASHSEED"] = str(seed_v)
    print(f"Random seed set as {seed_v}")

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [None]:
# Seed every random number generator before the Trainer is built and run.
set_seed(42)

In [None]:
# ! pip install rouge

In [None]:
from rouge import Rouge

In [None]:
def compute_metrics(eval_pred):
    """Compute mean ROUGE-1/2/L F-scores for one evaluation pass.

    The Trainer hands over numeric arrays, whereas ``Rouge.get_scores`` works
    on strings, so predictions and labels are decoded with the notebook's
    ``tokenizer`` before scoring.

    Args:
        eval_pred: ``(predictions, labels)`` pair.  ``predictions`` may be
            per-token logits (one more dimension than ``labels``), already
            arg-maxed token ids, or a tuple whose first item is either of
            those.  ``labels`` are token ids with ``-100`` at ignored
            positions.

    Returns:
        Dict with keys ``rouge1_f``, ``rouge2_f`` and ``rougeL_f``: the
        F-scores averaged over all examples.  An example whose decoded
        prediction or reference has no scorable text contributes 0.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits carry an extra vocabulary axis: reduce them to token ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # -100 (ignore index / gather padding) is not a real token id and cannot
    # be decoded; map it back to the pad token, which decoding then skips.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def _scorable(text):
        # The rouge package splits on "." and raises on empty hypotheses or
        # references, so text made only of dots/whitespace must be skipped.
        return bool(text.replace(".", "").strip())

    rouge = Rouge()
    totals = {"rouge-1": 0.0, "rouge-2": 0.0, "rouge-l": 0.0}
    for hyp, ref in zip(decoded_preds, decoded_labels):
        if not (_scorable(hyp) and _scorable(ref)):
            continue  # counts as a zero score in the average
        pair_scores = rouge.get_scores(hyp, ref)[0]
        for metric in totals:
            totals[metric] += pair_scores[metric]["f"]

    n_examples = max(len(decoded_labels), 1)
    return {
        "rouge1_f": totals["rouge-1"] / n_examples,
        "rouge2_f": totals["rouge-2"] / n_examples,
        "rougeL_f": totals["rouge-l"] / n_examples,
    }

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best ROUGE-2 F-score when training finishes.
args = TrainingArguments(
    output_dir='../tmp/',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=lr,
    # Reuse the notebook-level constant so the batch size is set in one place.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): an earlier cell defines num_train_epochs = 8, but the
    # literal 3 is what has always been used here — confirm which is intended.
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Must match a key returned by compute_metrics.
    metric_for_best_model='rouge2_f',
    run_name=model_name,
)

In [None]:
def _logits_to_token_ids(logits, labels):
    """Reduce model logits to predicted token ids before they are cached.

    Passed to the Trainer as ``preprocess_logits_for_metrics`` so that the
    evaluation loop accumulates one id per position instead of a score for
    every vocabulary entry.  ``labels`` is part of the callback signature
    and is not used.
    """
    if isinstance(logits, tuple):
        # presumably the first element holds the prediction scores — TODO confirm
        logits = logits[0]
    return logits.argmax(dim=-1)


# Wire model, data, metrics and training configuration together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=article_dataset['train'],
    eval_dataset=article_dataset['val'],
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()