In [10]:
import pandas as pd
from transformers import LEDForConditionalGeneration, LEDTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split
from transformers import LEDConfig
from rouge import Rouge
import glob
import os

In [12]:
import sys
sys.path.append("../laysummarisation")
import laysummarisation
from laysummarisation.utils import (
    compute_metrics,
    create_article_dataset_dict,
    set_seed,
    load_jsonl_pandas,
    load_multiple_df
)


In [13]:
seed = 42

In [14]:
set_seed(seed)

Random seed set as 42


In [15]:
all_files = glob.glob(os.path.join("../data/input/rouge", "*.jsonl"))
df = load_multiple_df(all_files)

# df.iloc[0].article
df.columns

Index(['lay_summary', 'article', 'headings', 'keywords', 'id'], dtype='object')

In [16]:
# Split the dataset into training and evaluation sets
train_df, eval_df = train_test_split(df.head(100), test_size=0.2, random_state=42)

# Load the tokenizer and the model
model_checkpoint = "yikuan8/Clinical-Longformer"
tokenizer = LEDTokenizer.from_pretrained(model_checkpoint)

# Create the Longformer configuration
lf_config = LEDConfig.from_pretrained(model_checkpoint)


# Update the attention_window parameter
lf_config.attention_window = [16] * lf_config.num_hidden_layers
model.config.num_beams = conf.nbeams
model.config.max_length = conf.max_encode
model.config.min_length = conf.min_encode
model.config.length_penalty = conf.length_penalty
model.config.early_stopping = conf.early_stopping
model.config.no_repeat_ngram_size = 3
model.to(device)


model = LEDForConditionalGeneration.from_pretrained(model_checkpoint, config=lf_config)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LongformerTokenizer'. 
The class this function is called from is 'LEDTokenizer'.
You are using a model of type longformer to instantiate a model of type led. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at yikuan8/Clinical-Longformer were not used when initializing LEDForConditionalGeneration: ['longformer.encoder.layer.7.attention.self.query_global.bias', 'longformer.encoder.layer.10.attention.self.value.weight', 'longformer.encoder.layer.2.output.dense.weight', 'longformer.encoder.layer.8.attention.output.LayerNorm.weight', 'longformer.encoder.layer.3.intermediate.dense.weight', 'longformer.encoder.layer.7.attention.self.value_global.bias', 'longformer.encoder.layer.2.attention.self.query_global.bias', 'longf

In [5]:
from torch.utils.data import Dataset

class eLifeDataset(Dataset):
    def __init__(self, df, tokenizer, max_input_length=1024, max_output_length=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        article, lay_summary = row['article'], row['lay_summary']

        input_tokenized = self.tokenizer(
            article,
            return_tensors="pt",
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
        )

        target_tokenized = self.tokenizer(
            lay_summary,
            return_tensors="pt",
            max_length=self.max_output_length,
            truncation=True,
            padding="max_length",
        )

        input_ids = input_tokenized["input_ids"].squeeze()
        target_ids = target_tokenized["input_ids"].squeeze()

        return {"input_ids": input_ids, "labels": target_ids}


In [7]:
# Set up the training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    output_dir="longformer_summary_model",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=100,
    save_steps=500,
    warmup_steps=500,
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Create the datasets
train_dataset = eLifeDataset(train_df, tokenizer)
eval_dataset = eLifeDataset(eval_df, tokenizer)

# Set up the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
  )

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("longformer_summary_model")
tokenizer.save_pretrained("longformer_summary_model")

Step,Training Loss,Validation Loss
100,7.8465,8.059049
200,7.4196,7.668071
300,6.7691,7.264815
400,6.0445,7.025229


('longformer_summary_model/tokenizer_config.json',
 'longformer_summary_model/special_tokens_map.json',
 'longformer_summary_model/vocab.json',
 'longformer_summary_model/merges.txt',
 'longformer_summary_model/added_tokens.json')