In [1]:
!pip install -q transformers datasets peft accelerate evaluate bitsandbytes rouge-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.0 MB/s[0m eta [36

In [2]:
import os
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    TrainingArguments,
    EarlyStoppingCallback,
    pipeline,
    logging
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import random

In [3]:
# Load and filter dataset
dataset = load_dataset("csv", data_files="138_papers.csv")["train"]
dataset = dataset.filter(
    lambda x: x['article'] is not None and bool(x['article'].strip()) and
              x['abstract'] is not None and bool(x['abstract'].strip())
)
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["article", "abstract"]])

split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = split_dataset['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = split_dataset['train']
eval_dataset = test_valid_split['train']
test_dataset = test_valid_split['test']

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/138 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['article', 'abstract'],
    num_rows: 138
})

In [4]:
lengths = [len(x.split()) for x in dataset["abstract"]]
print(f"Avg summary length: {np.mean(lengths):.1f} words")

Avg summary length: 122.6 words


In [None]:
# # bitsandbytes parameters
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype = "float16",
#     bnb_4bit_use_double_quant = False
# )

In [5]:
model_name = "allenai/led-large-16384-arxiv"

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0} #"auto"
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [6]:
max_input_length = 8192
max_target_length = 256

def preprocess(batch):
    inputs = tokenizer(
        batch["article"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length
    )

    outputs = tokenizer(
        batch["abstract"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length
    )

    # Replace pad token with -100 in labels
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in outputs["input_ids"]
    ]

    # Set global attention on the first token (commonly used strategy)
    global_attention_mask = []
    for input_ids in inputs["input_ids"]:
        mask = [0] * len(input_ids)
        mask[0] = 1  # First token gets global attention
        global_attention_mask.append(mask)

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "global_attention_mask": global_attention_mask,
        "labels": labels,
    }

# Now when batching (map):
train_dataset = train_dataset.map(preprocess, batched=True)
eval_dataset = eval_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [7]:
# LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
        target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj"
    ],
    lora_dropout=0.00,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 462,160,896 || trainable%: 0.5105


In [8]:
# Define a function to compute ROUGE scores
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # # Get the predicted token ids (by taking argmax on the logits)
    # predicted_ids = predictions.argmax(-1)

    # Decode the predictions and labels (removing padding tokens)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels as we ignored padding in loss
    labels = [[token if token != -100 else tokenizer.pad_token_id for token in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    result = {k: round(v * 100, 4) for k, v in result.items()}

    return result

In [9]:
# Evaluation metric
rouge = evaluate.load("rouge")

# Custom collator to preserve global_attention_mask
class LEDDataCollator(DataCollatorForSeq2Seq):
    def __call__(self, features, return_tensors=None):
        batch = super().__call__(features, return_tensors=return_tensors)
        if "global_attention_mask" in features[0]:
            batch["global_attention_mask"] = torch.tensor([f["global_attention_mask"] for f in features])
        return batch

data_collator = LEDDataCollator(tokenizer, model=model, padding=True)


training_args = Seq2SeqTrainingArguments(
    output_dir="./led-lora-ft",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    gradient_checkpointing=False,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    optim = "adamw_torch",
    num_train_epochs=5,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=1,
    logging_steps=100,
    save_steps=0,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="rougeLsum",
    fp16=True,
    report_to="none",
    overwrite_output_dir=True,
    group_by_length=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
trainer.train()
model.save_pretrained("./led-lora-arxiv-ft")
tokenizer.save_pretrained("./led-lora-arxiv-ft")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.692168,29.7425,7.3631,16.2577,20.4608
2,3.837300,3.479042,28.9885,6.6921,16.2408,19.9964
3,3.837300,3.481701,29.5655,6.5833,16.492,19.8802




('./led-lora-arxiv-ft/tokenizer_config.json',
 './led-lora-arxiv-ft/special_tokens_map.json',
 './led-lora-arxiv-ft/vocab.json',
 './led-lora-arxiv-ft/merges.txt',
 './led-lora-arxiv-ft/added_tokens.json',
 './led-lora-arxiv-ft/tokenizer.json')

In [11]:
def evaluate_model(model, dataset, tokenizer, name="Model"):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    preds = []
    labels = []
    model.eval()

    for sample in dataset:
        # Convert to tensors
        input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
        attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(model.device)

        label_ids = sample["labels"]
        if not isinstance(label_ids, list):
            label_ids = label_ids.tolist()  # Handle tensor input

        # Filter out invalid token IDs
        safe_label_ids = [id for id in label_ids if isinstance(id, int) and 0 <= id < tokenizer.vocab_size]

        label = tokenizer.decode(safe_label_ids, skip_special_tokens=True)

        # Create global attention mask
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, 0] = 1  # Apply global attention on the first token

        # Generate prediction
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,  # Use global attention mask
            max_new_tokens=256,
            num_beams=2,
            no_repeat_ngram_size=3,
            early_stopping=True
        )[0]

        pred = tokenizer.decode(output_ids, skip_special_tokens=True)
        preds.append(pred)
        labels.append(label)

    results = rouge.compute(predictions=preds, references=labels)
    print(f"\n{name} ROUGE Scores:")
    results = {k: round(v * 100, 2) for k, v in results.items()}
    for key, value in results.items():
        print(f"{key}: {value}")


In [12]:
def print_random_summary(model, dataset, tokenizer):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    random_sample = dataset

    # Convert input to tensors
    input_ids = torch.tensor(random_sample["input_ids"]).unsqueeze(0).to(model.device)
    attention_mask = torch.tensor(random_sample["attention_mask"]).unsqueeze(0).to(model.device)

    # Create global attention mask
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1  # Apply global attention on the first token

    # Generate prediction
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,  # Use global attention mask
        max_new_tokens=300,
        num_beams=2,
        no_repeat_ngram_size=3,
        early_stopping=True
    )[0]

    # Decode and print the generated summary
    generated_summary = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Print the generated summary
    print("\nGenerated Summary for Random Sample:")
    print(generated_summary)

    label_ids = random_sample["labels"]
    if not isinstance(label_ids, list):
        label_ids = label_ids.tolist()

    safe_label_ids = [id for id in label_ids if isinstance(id, int) and 0 <= id < tokenizer.vocab_size]
    reference_summary = tokenizer.decode(safe_label_ids, skip_special_tokens=True)

    print("\nReference Summary (Label):")
    print(reference_summary)

In [13]:
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0} #"auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

evaluate_model(model, test_dataset, tokenizer, "Pretrained")


Pretrained ROUGE Scores:
rouge1: 24.96
rouge2: 6.44
rougeL: 15.46
rougeLsum: 17.03


In [14]:
print_random_summary(model, test_dataset[0], tokenizer)


Generated Summary for Random Sample:
 this work establishes the chemputer as a universal chemical synthesis machine , capable of constructing any stable and isolable molecule through a finite ,  
 expressible process . 
 this process is governed by three key parameters : reagents , process  conditions , and catalysts .    the study introduces dynamic error correction  mechanisms integrated into each step of the synthesis pathway , ensuring real -time accuracy 
 and reliability .  the role of universally configurable hardwa re is also highlighted , with the introduction of a   ``chempiling '' function that translates synthesis pathways into executable  hardware configurations . 

Reference Summary (Label):
The paper explains that the "chemputer" is a universal machine capable of performing any feasible chemical synthesis. This means that as long as a chemical process can be carried out within the physical limitations of the available equipment, the chemputer can execute it. The key to 

In [15]:
ft_model = PeftModel.from_pretrained(model, "./led-lora-arxiv-ft")
ft_tokenizer = AutoTokenizer.from_pretrained("./led-lora-arxiv-ft")

evaluate_model(ft_model, test_dataset, ft_tokenizer, "Fine-tuned")


Fine-tuned ROUGE Scores:
rouge1: 26.04
rouge2: 6.79
rougeL: 16.27
rougeLsum: 18.13


In [16]:
print_random_summary(ft_model, test_dataset[0], ft_tokenizer)


Generated Summary for Random Sample:
 this work establishes the chemputer as a universal chemical synthesis machine , capable of constructing any stable and isolable molecule through a finite ,  
 expressible process . 
 this process is governed by three key parameters : reagents , process  conditions , and catalysts .    the study introduces dynamic error correction 
 mechanisms integrated into each step of the synthesis pathway , ensuring real -time accuracy 
 and reliability .  the role of universally configurable hardwa re is also highlighted , with the introduction of a   ``chempiling '' function that translates synthesis pathways into executable  hardware configurations . 

Reference Summary (Label):
The paper explains that the "chemputer" is a universal machine capable of performing any feasible chemical synthesis. This means that as long as a chemical process can be carried out within the physical limitations of the available equipment, the chemputer can execute it. The key to

## Hyperparameters and Run Summary

| Experiment                  | r   | lora_alpha | target_modules                         | lora_dropout | bias | ROUGE-1 | ROUGE-2 | ROUGE-L | ROUGE-Lsum |
|-----------------------------|-----|------------|----------------------------------------|--------------|------|---------|---------|---------|-------------|
| Pretrained                  | –   | –          | –                                      | –            | –    | 22.02   | 5.79    | 12.02   | 15.74       |
| Fine-tuned #1 (4bit)        | 8   | 16         | ["query", "value"]                     | 0.0          | none | 21.17   | 5.71    | 11.53   | 15.06       |
| Pretrained (FP16)           | –   | –          | –                                      | –            | –    | 26.42   | 5.92    | 13.32   | 18.32       |
| Fine-tuned #2 (FP16)        | 8   | 16         | ["query", "value"]                     | 0.0          | none | 26.30   | 5.59    | 13.65   | 18.64       |
| Pretrained                  | –   | –          | –                                      | –            | –    | 26.83   | 4.52    | 14.53   | 17.17       |
| Fine-tuned #3 Global Attn   | 8   | 16         | ["query", "value", "key", "output"]    | 0.0          | none | 26.79   | 4.61    | 14.56   | 17.21       |
| Pretrained                  | –   | –          | –                                      | –            | –    | 28.48   | 4.34    | 14.29   | 16.73       |
| Fine-tuned #4 Global Attn   | 8   | 16         | ["query", "value", "key", "output"]    | 0.0          | none | 27.86   | 4.05    | 14.29   | 16.54       |
| Pretrained                  | –   | –          | –                                      | –            | –    | 29.10   | 3.97    | 14.96   | 18.74       |
| Fine-tuned #5 Global Attn   | 16  | 16         | ["query", "value", "key", "output"]    | 0.0          | none | 28.69   | 4.87    | 15.09   | 18.82       |
| Pretrained                  | –   | –          | –                                      | –            | –    | 24.85   | 6.33    | 15.57   | 17.25       |
| FT #Final Global Attn       | 16  | 32         | ["query", "value", "key", "output"]    | 0.0          | none | 25.86   | 6.74    | 15.73   | 17.97       |


### Ignore

In [None]:
evaluate_model(model, test_dataset, tokenizer, "Pretrained")


Pretrained ROUGE Scores:
rouge1: 51.85
rouge2: 27.95
rougeL: 36.42
rougeLsum: 45.68


In [None]:
print_random_summary(model, test_dataset[0], tokenizer)


Generated Summary for Random Sample:
 this review focuses specifically on what we have learned about the progenitors of core - collapse supernovae ( cc  sne ) by examining images of the supernova ( sn ) sites taken prior to the explosion . by registering pre - sn and post - sn images , usually taken at high resolution using either space - based optical detectors , or ground - based infrared detectors equipped with laser guide star adaptive optics systems ( lgs - ao ) , about one dozen cc sn progenitor stars have now been directly detected ( i.e. , shown to be spatially coincident with the sn ) , with roughly two dozen upper limits derived from non - detections .  
 the science of seeking sn progensitors has made tremendous strides in the last decade , and promises to advance rapidly as more and more nearby galaxies  hosts of future cc sne  have high - resolution images added to the archive . 

Reference Summary (Label):
i summarize what we have learned about the nature of stars that u

In [None]:
ft_model = PeftModel.from_pretrained(model, "./led-lora-arxiv-ft")
ft_model = ft_model.merge_and_unload()
ft_tokenizer = AutoTokenizer.from_pretrained("./led-lora-arxiv-ft")

evaluate_model(ft_model, test_dataset, ft_tokenizer, "Fine-tuned")


Fine-tuned ROUGE Scores:
rouge1: 51.85
rouge2: 27.95
rougeL: 36.42
rougeLsum: 45.68


In [None]:
evaluate_model(ft_model, test_dataset, ft_tokenizer, "Fine-tuned")


Fine-tuned ROUGE Scores:
rouge1: 51.85
rouge2: 27.95
rougeL: 36.42
rougeLsum: 45.68


In [None]:
evaluate_model(ft_model, dataset1, ft_tokenizer, "Fine-tuned")


Fine-tuned ROUGE Scores:
rouge1: 46.89
rouge2: 20.58
rougeL: 28.69
rougeLsum: 39.92


In [None]:
print_random_summary(ft_model, test_dataset[0], ft_tokenizer)


Generated Summary for Random Sample:
 this review focuses specifically on what we have learned about the progenitors of core - collapse supernovae ( cc  sne ) by examining images of the supernova ( sn ) sites taken prior to the explosion . by registering pre - sn and post - sn images , usually taken at high resolution using either space - based optical detectors , or ground - based infrared detectors equipped with laser guide star adaptive optics systems ( lgs - ao ) , about one dozen cc sn progenitor stars have now been directly detected ( i.e. , shown to be spatially coincident with the sn ) , with roughly two dozen upper limits derived from non - detections .  
 the science of seeking sn progensitors has made tremendous strides in the last decade , and promises to advance rapidly as more and more nearby galaxies  hosts of future cc sne  have high - resolution images added to the archive . 

Reference Summary (Label):
i summarize what we have learned about the nature of stars that u