In [None]:
# !pip3 install datasets
# !pip3 install rouge_score
# !pip3 install git+https://github.com/huggingface/transformers
# !pip3 install sentencepiece
# !pip3 install torch
# !pip3 install transformers
# !pip install --upgrade transformers
# !pip install --upgrade datasets
# !pip install tensorflow
# !pip install ipywidgets

In [None]:
# Model card: https://huggingface.co/yikuan8/Clinical-Longformer

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset, load_metric, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
import torch
import os
import numpy as np
import pandas as pd

In [None]:
# Hub identifier of the checkpoint; the tokenizer is loaded from this ID.
model_checkpoint = "yikuan8/Clinical-Longformer"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# The model weights come from a local directory, not from `model_checkpoint`.
# NOTE(review): later cells build TF datasets and call `model.compile(...)` with the
# optimizer returned by `create_optimizer`; AutoModelForMaskedLM is presumably the
# PyTorch auto-class, so confirm whether a TF model class was intended here.
model = AutoModelForMaskedLM.from_pretrained("../Clinical-Longformer")

In [None]:
# Prefer the GPU when PyTorch reports that CUDA is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def preprocess_function(sample, max_input_length: int = 4096, max_target_length: int = 4096):
    """
    Build tokenized model inputs, including labels, from article / lay-summary text.

    Both text fields are run through the notebook-level `tokenizer` with truncation
    enabled. The token ids produced for the summary are stored on the article
    encoding under the key "labels".

    Args:
        sample (dict): Mapping with an "article" entry and a "lay_summary" entry, in
            whatever form the tokenizer accepts as text input.
        max_input_length (int, optional): `max_length` used when tokenizing the article.
            Defaults to 4096.
        max_target_length (int, optional): `max_length` used when tokenizing the summary.
            Defaults to 4096.

    Returns:
        The tokenizer output for the article, extended with a "labels" entry holding
        the summary's "input_ids".
    """
    # Encode the source text first; this object is what gets returned.
    encoded = tokenizer(sample["article"], truncation=True, max_length=max_input_length)

    # Encode the target text and keep only its token ids.
    target_ids = tokenizer(
        sample["lay_summary"], truncation=True, max_length=max_target_length
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [None]:
class Clinical(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over one split of a lay-summarisation JSONL file.

    The class implements `__getitem__` / `__len__` on top of a pandas DataFrame, which
    is the `torch.utils.data.Dataset` protocol. It must not derive from the
    `Dataset` name imported from `datasets`: that class's own constructor would never
    run here, leaving its inherited methods without the internal state they rely on.

    Args:
        dtype: Split to load, either 'train' or 'val'.
        filename: Corpus name, either 'PLOS' or 'eLife'.
        dir: Root data directory; the file is read from
            `<dir>/<dtype>/<filename>_<dtype>.jsonl`. (The name shadows the `dir`
            builtin but is kept so existing keyword callers keep working.)

    Raises:
        AssertionError: If `filename` or `dtype` is not one of the accepted values.
    """

    def __init__(self, dtype='train', filename="eLife", dir="../data/task1_development/"):
        assert filename in ['PLOS', 'eLife'], f"unknown corpus: {filename!r}"
        assert dtype in ['train', 'val'], f"unknown split: {dtype!r}"
        # Join the components separately so the platform's separator is used throughout.
        path = os.path.join(dir, dtype, f'{filename}_{dtype}.jsonl')
        # One JSON object per line.
        self.df = pd.read_json(path, lines=True)

    def __getitem__(self, index):
        """Return the `(article, lay_summary)` pair stored at positional row `index`."""
        row = self.df.iloc[index]
        return row['article'], row['lay_summary']

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.df)

In [None]:
# Instantiate the eLife training split first, then the validation split.
train_dataset, val_dataset = (
    Clinical(dtype=split, filename="eLife") for split in ("train", "val")
)

In [None]:
# Show the (article, lay_summary) pair at position 10 as the cell output.
train_dataset[10]

In [None]:
filename = "eLife"
dir = "../data/task1_development/"

# Read the train and val JSONL files (one JSON object per line) into DataFrames.
split_frames = {}
for dtype in ("train", "val"):
    path = os.path.join(dir, f'{dtype}/{filename}_{dtype}.jsonl')
    split_frames[dtype] = pd.read_json(path, lines=True)

df_train = split_frames["train"]
df_val = split_frames["val"]

# Wrap each DataFrame as a Dataset and collect both under their split names.
article_dataset = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in split_frames.items()}
)

In [None]:
# Run preprocess_function over every split; batched=True is passed to `map`.
tokenized_datasets = article_dataset.map(preprocess_function, batched=True)

In [None]:
# Collator built from the tokenizer and model, asked to return TensorFlow tensors.
# NOTE(review): this is the seq2seq collator, while `model` was loaded with a
# masked-LM auto-class — confirm the two are meant to be used together.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Drop every column that the raw (untokenized) training split carries.
raw_column_names = article_dataset["train"].column_names
tokenized_datasets = tokenized_datasets.remove_columns(raw_column_names)

In [None]:
# Collate the first two tokenized training examples; the result is the cell output.
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

In [None]:
from huggingface_hub import notebook_login

# Run the huggingface_hub login helper for notebooks (called with no arguments).
notebook_login()

In [None]:
# Display the tokenized training split as the cell output.
tokenized_datasets["train"]

In [None]:
def _to_tf(split, shuffle):
    """Convert one tokenized split with `to_tf_dataset`, using the shared collator and batch size 8."""
    return tokenized_datasets[split].to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["labels"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=8,
    )


# Only the training split is shuffled.
tf_train_dataset = _to_tf("train", shuffle=True)
tf_eval_dataset = _to_tf("val", shuffle=False)

In [None]:
from transformers import create_optimizer
import tensorflow as tf

In [None]:
# Short model name: the part of the checkpoint ID after the last "/".
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Total step count = length of the training dataset times the number of epochs.
num_train_epochs = 8
num_train_steps = num_train_epochs * len(tf_train_dataset)

In [None]:
# Build the optimizer and its schedule with transformers' create_optimizer:
# initial learning rate 5.6e-5, no warmup steps, weight-decay rate 0.01, sized to
# the total number of training steps computed above.
optimizer, schedule = create_optimizer(
    init_lr=5.6e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

In [None]:
# Attach the optimizer to the model; no loss argument is passed here.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): Keras documents that a global dtype policy applies to layers created
# after it is set, and `model` was already constructed in an earlier cell — confirm
# that setting the policy at this point has the intended effect.
tf.keras.mixed_precision.set_global_policy("mixed_float16")