In [None]:
!pip install -q transformers datasets peft accelerate evaluate bitsandbytes rouge-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.6 MB/s[0m eta [3

In [None]:
import os
import random

import numpy as np
import torch
import evaluate
from datasets import load_dataset
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    TrainingArguments,
    EarlyStoppingCallback,
    pipeline,
    logging
)

In [None]:
# Read the CSV, drop rows without usable article text, and keep only the two
# columns needed for summarisation.
def _has_article_text(row):
    """Return True when the row's article is present and not just whitespace."""
    text = row["article"]
    return text is not None and bool(text.strip())


dataset = load_dataset("csv", data_files="502_rows.csv")["train"].filter(_has_article_text)
unused_columns = [name for name in dataset.column_names if name not in ("article", "abstract")]
dataset = dataset.remove_columns(unused_columns)

# 80% train; the held-out 20% is then halved into validation and test.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

train_dataset, eval_dataset, test_dataset = (
    split_dataset["train"],
    test_valid_split["train"],  # first half of the held-out rows is used for validation
    test_valid_split["test"],
)

In [None]:
dataset

Dataset({
    features: ['article', 'abstract'],
    num_rows: 502
})

In [None]:
len(train_dataset), len(eval_dataset), len(test_dataset)

(401, 50, 51)

In [None]:
# Whitespace-token length of every article. Print a summary of the distribution
# rather than all values so the cell output stays readable and outliers
# (very short or very long articles) are easy to spot.
lengths = [len(x.split()) for x in dataset["article"]]
print(
    f"articles: {len(lengths)} | min: {min(lengths)} | "
    f"median: {int(np.median(lengths))} | max: {max(lengths)}"
)

[2574, 2198, 1245, 2592, 2455, 2628, 2005, 2462, 2469, 1854, 2385, 2276, 1129, 2507, 1683, 2127, 1950, 2512, 2401, 2514, 1914, 2450, 2487, 1458, 2495, 1527, 1628, 2627, 2392, 2612, 2413, 2723, 2388, 2566, 2625, 2430, 2519, 2452, 2331, 2465, 2103, 2457, 2383, 2583, 2214, 2334, 1893, 2442, 2273, 2302, 2219, 2356, 2501, 1986, 2438, 1746, 45, 1330, 2823, 2671, 2713, 2232, 2500, 1757, 1527, 2582, 4349, 1450, 2746, 2435, 2643, 2025, 2441, 2308, 2772, 2525, 1870, 2279, 1932, 1326, 2273, 2489, 2511, 3085, 1180, 2471, 2530, 16, 2295, 2414, 2541, 1384, 1743, 2379, 2518, 2493, 1573, 2563, 2254, 2497, 1266, 2522, 2388, 2087, 2445, 2530, 1275, 2529, 2483, 1338, 2384, 2493, 2421, 2184, 2491, 2453, 2262, 2590, 2029, 1298, 2595, 2446, 2928, 2616, 2422, 1280, 2597, 2424, 1753, 2425, 2376, 2403, 1849, 2386, 2373, 2256, 2515, 2410, 2401, 2461, 2700, 2467, 1067, 1840, 1853, 2478, 1429, 2487, 2108, 2597, 2490, 2205, 2428, 2289, 2502, 2293, 2315, 847, 1422, 1114, 2329, 1553, 2276, 2375, 2494, 2352, 2317, 17

In [None]:
# # bitsandbytes parameters
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype = "float16",
#     bnb_4bit_use_double_quant = False
# )

In [None]:
model_name = "allenai/led-large-16384"

# Tokenizer for the checkpoint; padding is appended on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

# Seq2seq model placed entirely on device 0, then converted to half precision.
# (The 4-bit `quantization_config=bnb_config` option is left disabled.)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map={"": 0})
model = model.half()
# Turn off the generation cache flag in the model config.
model.config.use_cache = False

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
max_input_length = 4096
max_target_length = 256


def _first_token_global_mask(length):
    """Return a 0/1 list of `length` entries with only position 0 set."""
    mask = [0] * length
    mask[0] = 1
    return mask


def preprocess(batch):
    """Tokenise a batch of articles/abstracts into model inputs.

    Args:
        batch: mapping with "article" and "abstract" entries, each a list of
            strings of equal length.

    Returns:
        dict with "input_ids" and "attention_mask" (articles truncated/padded
        to ``max_input_length``), "global_attention_mask" (1 on the first
        token of every article, 0 elsewhere) and "labels" (abstract ids
        truncated/padded to ``max_target_length`` with pad positions set to -100).
    """
    article_enc = tokenizer(
        batch["article"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )
    abstract_enc = tokenizer(
        batch["abstract"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )

    # Pad positions in the targets are marked with -100.
    pad_id = tokenizer.pad_token_id
    labels = []
    for target_ids in abstract_enc["input_ids"]:
        labels.append([-100 if token == pad_id else token for token in target_ids])

    # Global attention only on the first token of each article.
    global_attention_mask = [
        _first_token_global_mask(len(ids)) for ids in article_enc["input_ids"]
    ]

    return {
        "input_ids": article_enc["input_ids"],
        "attention_mask": article_enc["attention_mask"],
        "global_attention_mask": global_attention_mask,
        "labels": labels,
    }

# Tokenise every split in batches with the same preprocessing function.
train_dataset, eval_dataset, test_dataset = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, eval_dataset, test_dataset)
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# LoRA config
# LED's decoder attention layers name their projections q_proj / k_proj /
# v_proj / out_proj. "o_proj" does not exist in LED, so listing it silently
# leaves the attention output projections without adapters; use "out_proj".
# NOTE(review): the encoder's Longformer attention uses different module names
# (e.g. "query", "key", "value"), so this list adapts the decoder only —
# confirm whether that is intended.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.00,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Wrap the base model with LoRA adapters and report how many weights will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 462,160,896 || trainable%: 0.5105


In [None]:
# Define a function to compute ROUGE scores
def _restore_padding(sequences, pad_id):
    """Return `sequences` as lists with every -100 entry replaced by `pad_id`."""
    return [[token if token != -100 else pad_id for token in seq] for seq in sequences]


def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: object exposing ``predictions`` (generated token ids, or a
            tuple whose first element holds them) and ``label_ids`` (reference
            token ids in which ignored positions are marked with -100).

    Returns:
        dict mapping each ROUGE metric name to its score scaled to 0-100 and
        rounded to 4 decimal places.
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids

    # Some trainers hand back a tuple; the token ids are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a masking sentinel, not a real token id, and cannot be decoded.
    # It is restored to the pad id in BOTH predictions and labels (predictions
    # can also carry -100 when batches of different lengths are concatenated).
    pad_id = tokenizer.pad_token_id
    predictions = _restore_padding(predictions, pad_id)
    labels = _restore_padding(labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Scale to percentages and round to 4 decimal places.
    return {k: round(v * 100, 4) for k, v in result.items()}

In [None]:
# Evaluation metric
rouge = evaluate.load("rouge")

class LEDDataCollator(DataCollatorForSeq2Seq):
    """Seq2seq collator that also batches each feature's ``global_attention_mask``."""

    def __call__(self, features, return_tensors=None):
        """Collate `features` with the parent collator, then (re)attach the global mask.

        When the first feature carries a "global_attention_mask", the masks of
        all features are stacked into one tensor and stored on the batch.
        """
        collated = super().__call__(features, return_tensors=return_tensors)
        if "global_attention_mask" not in features[0]:
            return collated
        global_masks = [feature["global_attention_mask"] for feature in features]
        collated["global_attention_mask"] = torch.tensor(global_masks)
        return collated


data_collator = LEDDataCollator(tokenizer, model=model, padding=True)


training_args = Seq2SeqTrainingArguments(
    output_dir="./led-lora-arxiv_lower_lr",
    # Batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimisation
    optim="adamw_torch",
    learning_rate=5e-6,
    lr_scheduler_type="linear",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    gradient_checkpointing=False,
    num_train_epochs=5,
    fp16=True,
    # Evaluation runs generation (beam search) so ROUGE is computed on real summaries.
    predict_with_generate=True,
    generation_num_beams=2,
    generation_max_length=256,
    # Logging / checkpointing: evaluate and save once per epoch, keep one
    # checkpoint, and reload the one with the best rougeLsum at the end.
    logging_steps=100,
    save_steps=0,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="rougeLsum",
    report_to="none",
    overwrite_output_dir=True,
    # The PEFT wrapper hides the base model's forward signature, so the Trainer
    # cannot infer the label column on its own and falls back to an empty list
    # (with a warning). Name it explicitly.
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the tracked metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Count total vs. trainable parameters before training starts.
param_counts = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
all_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)

print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / all_params:.2f}% of all parameters)")

# Fallback: if nothing is trainable, switch gradients back on for every
# parameter whose name mentions LoRA or an adapter.
if not trainable_params:
    print("ERROR: No parameters require gradients - nothing will be trained!")
    for name, param in model.named_parameters():
        if 'lora' in name or 'adapter' in name:
            print(f"Setting {name} to trainable")
            param.requires_grad = True

Trainable parameters: 2,359,296 (0.51% of all parameters)


In [None]:
# Run fine-tuning, then save the model and tokenizer into the same directory.
trainer.train()

adapter_dir = "./led-lora-arxiv-linear_with_wt_decay"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,4.0666,3.980075,29.6279,6.3888,16.1572,19.5461
2,4.0165,3.932779,29.7577,6.4342,16.4339,19.819
3,3.9614,3.87744,29.9186,6.3422,16.3699,19.6143
4,3.9229,3.82198,29.9485,6.5148,16.419,19.7581


('./led-lora-arxiv-linear_with_wt_decay/tokenizer_config.json',
 './led-lora-arxiv-linear_with_wt_decay/special_tokens_map.json',
 './led-lora-arxiv-linear_with_wt_decay/vocab.json',
 './led-lora-arxiv-linear_with_wt_decay/merges.txt',
 './led-lora-arxiv-linear_with_wt_decay/added_tokens.json',
 './led-lora-arxiv-linear_with_wt_decay/tokenizer.json')

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
def _decode_labels(label_ids, tokenizer):
    """Decode reference ids, dropping anything outside [0, vocab_size) such as -100."""
    if not isinstance(label_ids, list):
        label_ids = label_ids.tolist()  # e.g. tensor input
    valid_ids = [tok for tok in label_ids if isinstance(tok, int) and 0 <= tok < tokenizer.vocab_size]
    return tokenizer.decode(valid_ids, skip_special_tokens=True)


def _summarize_sample(model, tokenizer, sample):
    """Generate and decode a summary for one tokenised sample (batch of size 1)."""
    input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
    attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(model.device)

    # Global attention on the first token only.
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        max_new_tokens=256,
        num_beams=2,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def evaluate_model(model, dataset, tokenizer, name="Model"):
    """Generate a summary per sample and report ROUGE and (if installed) BERTScore.

    Args:
        model: model exposing ``to``, ``eval``, ``device`` and ``generate``.
        dataset: iterable of samples with "input_ids", "attention_mask" and "labels".
        tokenizer: tokenizer used to decode predictions and references.
        name: label used in the printed section headers.

    Returns:
        dict of ROUGE scores (0-100, 2 decimals); BERTScore precision, recall
        and F1 are merged in when ``bert_score`` can be imported.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    preds, labels = [], []
    for sample in dataset:
        labels.append(_decode_labels(sample["labels"], tokenizer))
        preds.append(_summarize_sample(model, tokenizer, sample))

    # ROUGE, scaled to percentages.
    raw_rouge = rouge.compute(predictions=preds, references=labels)
    print(f"\n{name} ROUGE Scores:")
    rouge_results = {metric: round(value * 100, 2) for metric, value in raw_rouge.items()}
    for metric, value in rouge_results.items():
        print(f"{metric}: {value}")

    # BERTScore is optional: fall back to ROUGE only when the package is missing.
    try:
        from bert_score import score

        precision, recall, f1 = score(preds, labels, lang="en", verbose=True)
        bert_results = {
            "BERTScore-P": round(precision.mean().item() * 100, 2),
            "BERTScore-R": round(recall.mean().item() * 100, 2),
            "BERTScore-F1": round(f1.mean().item() * 100, 2),
        }

        print(f"\n{name} BERTScore:")
        for metric, value in bert_results.items():
            print(f"{metric}: {value}")

        return {**rouge_results, **bert_results}

    except ImportError:
        print("\nBERTScore calculation failed. Please install with: pip install bert-score")
        return rouge_results

In [None]:
def print_random_summary(model, dataset, tokenizer):
    """Generate a summary for one tokenised sample and print it next to its reference.

    Despite the names, no random selection happens here: ``dataset`` is used
    directly as a single sample and must provide "input_ids",
    "attention_mask" and "labels".
    """
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    sample = dataset

    def _as_batch(values):
        # Turn a flat id list into a batch of one on the model's device.
        return torch.tensor(values).unsqueeze(0).to(model.device)

    input_ids = _as_batch(sample["input_ids"])
    attention_mask = _as_batch(sample["attention_mask"])

    # Global attention on the first token only.
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    generation_kwargs = dict(
        max_new_tokens=256,
        num_beams=2,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        **generation_kwargs,
    )
    generated_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    print("\nGenerated Summary for Random Sample:")
    print(generated_summary)

    # Reference summary: decode the labels, skipping ids outside the vocabulary (e.g. -100).
    label_ids = sample["labels"]
    if not isinstance(label_ids, list):
        label_ids = label_ids.tolist()
    kept_ids = [tok for tok in label_ids if isinstance(tok, int) and 0 <= tok < tokenizer.vocab_size]
    reference_summary = tokenizer.decode(kept_ids, skip_special_tokens=True)

    print("\nReference Summary (Label):")
    print(reference_summary)

In [None]:
# Reload the pretrained checkpoint (device 0, half precision) to measure a
# baseline on the held-out test split.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    #quantization_config=bnb_config,
    device_map={"": 0} #"auto"
).half()

# Loaded the same way as the training tokenizer; `trust_remote_code=True` is
# dropped because it only serves to allow executing code shipped with a Hub
# repository, which this tokenizer load does not need.
tokenizer = AutoTokenizer.from_pretrained(model_name)

evaluate_model(model, test_dataset, tokenizer, "Pretrained")


Pretrained ROUGE Scores:
rouge1: 25.82
rouge2: 5.6
rougeL: 14.68
rougeLsum: 17.87


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.02 seconds, 50.06 sentences/sec

Pretrained BERTScore:
BERTScore-P: 82.39
BERTScore-R: 83.94
BERTScore-F1: 83.15


{'rouge1': np.float64(25.82),
 'rouge2': np.float64(5.6),
 'rougeL': np.float64(14.68),
 'rougeLsum': np.float64(17.87),
 'BERTScore-P': 82.39,
 'BERTScore-R': 83.94,
 'BERTScore-F1': 83.15}

In [None]:
print_random_summary(model, test_dataset[0], tokenizer)


Generated Summary for Random Sample:
 we investigate how the cost and effort required to implement the GDPR is viewed by workers who have also experienced the regulations’ benefits as citizens . 
 we find that the very people who comply with and execute the regulation consider it to be positive for their company , positive for privacy and not a pointless, bureaucratic regulation . 

Reference Summary (Label):
The General Data Protection Regulation (GDPR) is a set of rules created by the European Union to protect people's personal information. Many companies had to change how they handled data to follow these rules. This study looked at how employees feel about the costs and benefits of implementing GDPR. The researchers surveyed people who worked at the same companies before, during, and after GDPR was put in place. They wanted to understand if the employees saw GDPR as a good thing or a waste of time and money. The survey found that the employees recognized their rights under GDPR, b

In [None]:
# Attach the saved adapter to the base model and score it on the test split.
adapter_dir = "./led-lora-arxiv-linear_with_wt_decay"
ft_model = PeftModel.from_pretrained(model, adapter_dir)
ft_tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

evaluate_model(ft_model, test_dataset, ft_tokenizer, "Fine-tuned")


Fine-tuned ROUGE Scores:
rouge1: 25.72
rouge2: 5.41
rougeL: 14.08
rougeLsum: 17.71


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.96 seconds, 52.96 sentences/sec

Fine-tuned BERTScore:
BERTScore-P: 82.53
BERTScore-R: 83.92
BERTScore-F1: 83.2


{'rouge1': np.float64(25.72),
 'rouge2': np.float64(5.41),
 'rougeL': np.float64(14.08),
 'rougeLsum': np.float64(17.71),
 'BERTScore-P': 82.53,
 'BERTScore-R': 83.92,
 'BERTScore-F1': 83.2}

In [None]:
print_random_summary(ft_model, test_dataset[0], ft_tokenizer)

## Hyperparameters and Run Summary

| Experiment       | r  | lora_alpha | target_modules     | lora_dropout | bias | ROUGE-1 | ROUGE-2 | ROUGE-L | ROUGE-Lsum |
|------------------|----|------------|--------------------|--------------|------|---------|---------|---------|-------------|
| Pretrained       | –  | –          | –                  | –            | –    | 26.42   | 5.92    | 13.32   | 18.32       |
| Fine-tuned #1.half()    | 8  | 16         | ["query", "value"] | 0.1          | none | 26.3    | 5.59    | 13.65   | 18.64       |
| Pretrained       | –  | –          | –                  | –            | –    | 22.02   | 5.79    | 12.02   | 15.74       |
| Fine-tuned #2 (4bit)   | 8  | 16         | ["query", "value", "key", "output"] | 0.0          | none | 21.17    | 5.71    | 11.53   | 15.06      |
| Pretrained       | –  | –          | –                  | –            | –    | 26.83   | 4.52     | 14.53   | 17.17       |
| Fine-tuned.half() #3 Global Attention    | 8  | 16         | ["query", "value", "key", "output"] | 0.0          | none | 26.79    | 4.61    | 14.56   | 17.21      |
| Pretrained       | –  | –          | –                  | –            | –    | 29.1   | 3.97     | 14.96   | 18.74       |
| Fine-tuned.half() #3 Global Attention    | 8  | 16         | ["query", "value", "key", "output"] | 0.0          | none | 28.69    | 4.87    | 15.09   | 18.82      |
| Pretrained       | –  | –          | –                  | –            | –    | 28.48   | 4.34     | 14.29   | 16.73       |
| Fine-tuned.half() #3 Global Attention    | 16  | 16         | ["query", "value", "key", "output"] | 0.0          | none | 27.86    | 4.05    | 14.29   | 16.54      |

# Rough Work - Ignore

## Zip/Save Fine-tuned Model

In [None]:
import shutil

# Pack the 'led-lora-arxiv' directory into a zip archive named led-lora-arxiv.zip.
shutil.make_archive(base_name='led-lora-arxiv', format='zip', root_dir='led-lora-arxiv')

'/content/led-lora-arxiv.zip'

In [None]:
test_dataset[1]['plain_english_explanation']

'The paper investigates whether a Transformer model, a type of artificial intelligence algorithm, can learn to perform the same tasks as a Kalman filter , a widely used algorithm for state estimation and filtering. Kalman filters are commonly used in applications like navigation, control systems, and signal processing to estimate the state of a system based on noisy measurements. The authors explore the connections between Transformers and Kalman filters, and whether Transformers can learn to represent the dynamics of linear systems in the same way that Kalman filters do. They provide both theoretical and empirical analyses to understand the representational power of Transformers and their ability to capture the same properties as Kalman filters. This research is important because it helps to understand the capabilities and limitations of Transformer models, and whether they can be used as a substitute for traditional algorithms like Kalman filters in certain applications. If Transform

In [None]:
# Scratch experiment: summarise one test article with the LED checkpoint that
# was already fine-tuned on arXiv.
from transformers import LEDForConditionalGeneration, LEDTokenizer
import torch

# NOTE(review): this rebinds the notebook-level `tokenizer` (and `model` below),
# so any cell run afterwards sees the arXiv checkpoint instead of the base one.
tokenizer = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

# NOTE(review): reads a "pdf_text" column, but the dataset-loading cell keeps
# only "article" and "abstract" — confirm which column is intended.
# No truncation is requested, so the whole text is tokenised.
input_ids = tokenizer(test_dataset[0]["pdf_text"], return_tensors="pt").input_ids.to("cuda")
global_attention_mask = torch.zeros_like(input_ids)
# set global_attention_mask on first token
global_attention_mask[:, 0] = 1

# return_dict_in_generate=True makes generate() return an object whose
# `.sequences` attribute holds the generated ids (used below).
model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv", return_dict_in_generate=True).to("cuda")

sequences = model.generate(input_ids, global_attention_mask=global_attention_mask).sequences

# Special tokens are not skipped here, so they appear in the printed text.
summary = tokenizer.batch_decode(sequences)
print(summary)

['</s> state-of-the-art ( sOTA ) deep learning models all share a common characteristic : they all have an extremely large number of parameters ( 10s if not 100 s of billions ) . \n most practitioners are interested in using such models for specific tasks and want to adapt these models to a new , generally smaller task . \n this procedure is known as finetuning , where one adjusts the weights of the pretrained model to improve performance on the new task . \n however , due to the size of sOTA models , adapting to down-stream tasks with full finetuning (finetuning all model parameters ) is computationally infeasible as it requires modifying the weights of the pretrained models using gradient methods , which is a costly process . \n a variety of resource-efficient finetuning methods have been proposed such as prompt tuning ( lester et al., 2021 ) where a soft prompt is learned and appended to the input , the adapters method ( houlsby et al., 2019 ) where lightweight lightweight layers ar

In [None]:
dataset1 = load_dataset("ccdv/arxiv-summarization", split="test[:10]")

README.md:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

train-00000-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00001-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00002-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00003-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00004-of-00015.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00005-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00006-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00007-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00008-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00009-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00010-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00011-of-00015.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

train-00012-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00013-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00014-of-00015.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [None]:
dataset1["article"][0]

'for about 20 years the problem of properties of short - term changes of solar activity has been considered extensively . \n many investigators studied the short - term periodicities of the various indices of solar activity . \n several periodicities were detected , but the periodicities about 155 days and from the interval of @xmath3 $ ] days ( @xmath4 $ ] years ) are mentioned most often . \n first of them was discovered by @xcite in the occurence rate of gamma - ray flares detected by the gamma - ray spectrometer aboard the _ solar maximum mission ( smm ) . \n this periodicity was confirmed for other solar flares data and for the same time period @xcite . \n it was also found in proton flares during solar cycles 19 and 20 @xcite , but it was not found in the solar flares data during solar cycles 22 @xcite . \n _    several autors confirmed above results for the daily sunspot area data . @xcite studied the sunspot data from 18741984 . \n she found the 155-day periodicity in data reco

In [None]:
dataset1["abstract"][0]

'the short - term periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are discussed . for these data \n the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . \n a new method of the diagnosis of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . \n it proves against the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity 

In [None]:
# Summarise the first arXiv article with the currently loaded model and tokenizer.
article_text = dataset1["article"][0]
input_ids = tokenizer(article_text, return_tensors="pt").input_ids.to("cuda")

# Global attention on the first token only.
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1

generation_output = model.generate(input_ids, global_attention_mask=global_attention_mask)
sequences = generation_output.sequences

summary = tokenizer.batch_decode(sequences)
print(summary)

Input ids are automatically padded from 8050 to 8192 to be a multiple of `config.attention_window`: 1024


['</s> the problem of the existence of the periodicity of about @xmath0 days of the time series of sunspot area fluctuations and sunspot area fluctuations from the northern hemisphere during the maximum activity period is considered .  \n the autocorrelation analysis of the time series of sunspot area fluctuations from the southern hemisphere indicates that periodicity of about 155 days exists during the maximum activity period .  \n the results obtained for the time series of sunspot area fluctuations from the maximum activity period are contradict with the conclusions of @xcite .  \n the autocorrelation analysis of the time series of sunspot area fluctuations from the southern hemisphere indicates that periodicity of about 155 days exists during the maximum activity period .  \n the periodogram of daily sunspot fluctuations contains peaks , which could be harmonics or subharmonics of the true periodicities .  \n this effect is not visible for sunspot data of the one rotation time int

In [None]:
# Evaluation metric
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Load the adapter saved under ./led-lora-arxiv, merge it into the base
# weights, and score the merged model on the test split.
ft_model = PeftModel.from_pretrained(model, "./led-lora-arxiv").merge_and_unload()
ft_tokenizer = AutoTokenizer.from_pretrained("./led-lora-arxiv")

evaluate_model(ft_model, test_dataset, ft_tokenizer, "Fine-tuned")


Fine-tuned ROUGE Scores:
rouge1: 27.04
rouge2: 4.95
rougeL: 14.44
rougeLsum: 17.13
