In [1]:
import os
import json
os.environ["WANDB_DISABLED"] = "true"
import sys
import logging
import transformers
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
import nltk
import numpy as np
transformers.logging.set_verbosity_error()

from tqdm import tqdm
from transformers import (HfArgumentParser, EvalPrediction, DataCollatorForSeq2Seq, set_seed)
from torch.nn.utils.rnn import pad_sequence
from deepspeed.profiling.flops_profiler import get_model_profile
from datasets import load_metric
from models.model_args import ModelArguments
from utils.utils import *
from utils.minus_utils import efficiency_testing, input_constructor, compare_parameters
from utils.analysis_utils import gen_run_report
from trainer.trainer_seq2seq_minus import MinusSeq2SeqTrainer
from args import MinusTrainingArguments, Seq2SeqDataTrainingArguments
from loralib.layers import LoRALayer
from models import build_model


  from .autonotebook import tqdm as notebook_tqdm


[2025-04-26 10:30:20,419] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/petrik/miniforge3/envs/apt/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
from datasets import DatasetDict, load_dataset


dataset_name = "Plasmoxy/gigatrue"
dataset = load_dataset(dataset_name)

# Strip the approximate-length helper columns from every split that carries them.
for split in dataset.keys():
    for _column in ("article_len_approx", "summary_len_approx"):
        if _column in dataset[split].column_names:
            dataset[split] = dataset[split].remove_columns(_column)

# Toggle: cut the hub 'validation' split in two. The first half stays
# 'validation', the second half becomes 'test', and the hub's own 'test'
# split is kept under the name 'test_original'.
if True:
    val_split = dataset['validation']
    half_index = len(val_split) // 2

    # Lower half -> validation, upper half -> test.
    validation_split = val_split.select(range(half_index))
    test_split = val_split.select(range(half_index, len(val_split)))

    # Rebuild the DatasetDict around the new split layout.
    dataset = DatasetDict({
        'train': dataset['train'],
        'validation': validation_split,
        'test': test_split,
        'test_original': dataset['test'],
    })

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'summary', 'orig_idx'],
        num_rows: 3783821
    })
    validation: Dataset({
        features: ['article', 'summary', 'orig_idx'],
        num_rows: 94405
    })
    test: Dataset({
        features: ['article', 'summary', 'orig_idx'],
        num_rows: 94406
    })
    test_original: Dataset({
        features: ['article', 'summary', 'orig_idx'],
        num_rows: 1822
    })
})


In [3]:
# Checkpoint directory of the model under evaluation. The cells below reuse it
# as the model path, as the trainer output_dir and as the reported model name.
model_path = 'output/bubi'

In [4]:
# Model-side options: point at the checkpoint in `model_path`, enable LoRA
# (rank 8 -- mirrors the training script's lora_r argument -- and alpha 16)
# and leave do_auto_pruning disabled.
model_args = ModelArguments(
    model_name_or_path=model_path,
    model_revision="main",
    use_fast_tokenizer=True,
    use_auth_token=False,
    apply_lora=True,
    lora_r=8,
    lora_alpha=16,
    do_auto_pruning=False,
)

# Task name plus the token budgets that later cells use for tokenization
# (max_input_length / max_target_length) and for generation (max_target_length).
data_args = Seq2SeqDataTrainingArguments(task_name="gigatrue", max_input_length=110, max_target_length=35)

# Training / pruning configuration. In the visible cells this object is only
# handed to build_model(); no training loop is started here.
# NOTE(review): the cell output shows a "Grafting mask learning rate ..." notice,
# presumably emitted while this object is constructed -- confirm.
# NOTE(review): do_train=True and bf16=True are set although the visible cells
# only run inference and the model later reports torch.float32 on CPU -- confirm
# these flags are intentional / have no effect on loading.
training_args = MinusTrainingArguments(
    # Generic trainer settings (output location, schedule, batch sizes, optimizer).
    output_dir=model_path,
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=1e-4,
    weight_decay=0.1,
    warmup_ratio=0.06,
    bf16=True,
    lr_scheduler_type="linear",
    # No checkpoints are saved; evaluation and logging are step-based.
    save_strategy="no",
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=5000,
    logging_steps=1000,
    
    # Minus specific arguments
    # NOTE(review): the semantics of the options below live in the project's
    # MinusTrainingArguments, which is not visible here; values are kept as-is.
    adapter_type="lora",
    minus_scheduler=True,
    mac_constraint=0.4,
    pruning_scheduler="cubic_gradual",
    pruning_scheduler_strategy="saliency",
    param_allocation_strategy="running_fisher",
    param_resizing_strategy="tophalf_limited",
    pruning_frequency=-1,
    num_prunings=10,
    pruning_batch_size=64,
    pruning_batches=8,
    pruning_start=-1,
    pruning_stop=2,
    # pre_pruning_tuning_steps=200,
    # sparsity_warmup_epochs=1,
    head_scorer_type="gradient_l2",
    intermediate_scorer_type="gradient_l2",
    pruner_type="running_fisher",
    distillation_type="self_momentum",
    distill_mapping_strategy="dynamic_block_teacher_dynamic_student",
    do_distill=True,
    do_virtual_prune=True,
    distill_start=-1,
    distill_epoch=5,
    mask_lr=0.01,
    grafting_top_k=-1,
    collect_salience=True,
    salience_collecting_start=200,
    salience_collecting_end=-1,
    # Teacher and student use the same tuning-config string.
    teacher_param_tuning_config="eq:0-5,ev:0-5,dq:0-5,dv:0-5,cq:0-5,cv:0-5,ei:0-5,di:0-5",
    student_param_tuning_config="eq:0-5,ev:0-5,dq:0-5,dv:0-5,cq:0-5,cv:0-5,ei:0-5,di:0-5",
    # warmup_param_tuning_config="eq:0-5,ev:0-5,dq:0-5,dv:0-5,cq:0-5,cv:0-5,ei:0-5,di:0-5",
    tuning_expanding_ratio=4.0,
    max_lora_r=64,  # lora_r * 8
    # External experiment reporting is turned off.
    report_to="none",
    seed=128
)


Grafting mask learning rate is set to be the same as mask learning rate.


In [5]:
# Build everything from the checkpoint directory; the three return values are
# unpacked in the order config, tokenizer, model.
# NOTE(review): build_model is project code not shown here -- judging by the
# cell output it also prints the resolved config.
config, tokenizer, model = build_model(model_args, data_args, training_args, determined_model_path=model_path)

Config:  T5Config {
  "_name_or_path": "output/bubi",
  "adap_pruned_heads": {
    "cross": {
      "0": [],
      "1": [],
      "2": [],
      "3": [],
      "4": [],
      "5": []
    },
    "decoder": {
      "0": [],
      "1": [],
      "2": [],
      "3": [],
      "4": [],
      "5": []
    },
    "encoder": {
      "0": [],
      "1": [],
      "2": [],
      "3": [],
      "4": [],
      "5": []
    }
  },
  "apply_lora": true,
  "architectures": [
    "AdaPT5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "do_distill": true,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "lora_alpha": 16,
  "lora_r": 8,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "out

In [6]:
# Switch the model to evaluation mode. Being the last expression of the cell,
# the returned module is also displayed (the full layer tree in the output).
model.eval()

AdaPT5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): AdaPT5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): AdaPT5Block(
        (layer): ModuleList(
          (0): AdaPT5LayerSelfAttention(
            (SelfAttention): AdaPT5Attention(
              (q): DistillLinear(in_features=512, out_features=512, bias=False)
                lora_A(in_features=512, out_features=55)
                lora_B(in_features=55, out_features=512)
              )
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): DistillLinear(in_features=512, out_features=512, bias=False)
                lora_A(in_features=512, out_features=29)
                lora_B(in_features=29, out_features=512)
              )
              (o): SelectLinear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): AdaPT5LayerNorm()
            (

In [49]:
# Gather size and placement facts about the loaded model into a single dict:
# parameter counts (total, trainable, per encoder/decoder stack, without
# embeddings), memory footprint, dtype, device and the concrete model class.
model_spec_dict = dict(
    name=model_path,
    params=model.num_parameters(),
    params_trainable=model.num_parameters(only_trainable=True),
    params_encoder=sum(weight.numel() for weight in model.encoder.parameters()),
    params_decoder=sum(weight.numel() for weight in model.decoder.parameters()),
    params_noembed=model.num_parameters(exclude_embeddings=True),
    mem_footprint=model.get_memory_footprint(),
    dtype=model.dtype,
    device=model.device,
    type=str(type(model)),
)
model_spec_dict

{'name': 'output/bubi',
 'params': 68489548,
 'params_trainable': 36136268,
 'params_encoder': 30690456,
 'params_decoder': 36900020,
 'params_noembed': 52039500,
 'mem_footprint': 273958192,
 'dtype': torch.float32,
 'device': device(type='cpu'),
 'type': "<class 'models.modeling_t5.AdaPT5ForConditionalGeneration'>"}

In [16]:
def print_number_of_trainable_model_parameters(model):
    """Build a short report on how many of ``model``'s parameters are trainable.

    Despite its name, this function prints nothing: it returns the report as a
    string and leaves printing to the caller.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count and the trainable share in percent (two decimals).
        For a model without any parameters the share is reported as ``0.00%``
        instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a parameter-free model has no meaningful ratio.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper returns its report as a string, so it is printed explicitly here.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 36136268
all model parameters: 68489548
percentage of trainable model parameters: 52.76%


In [20]:
# Device that generate_summaries() moves the input tensors to; it matches the
# CPU placement the model reports in the model summary cell.
device = 'cpu'

In [37]:
def gigatrue_preprocess_t5(batch, tokenizer, max_input_length, max_target_length, padding="max_length", lang="en"):
    """Tokenize a batch of article/summary pairs for T5-style summarization.

    Each article is prefixed with a language-specific task prompt before
    tokenization; the summaries are tokenized as targets and stored under
    ``"labels"``.

    Args:
        batch: Mapping with the keys ``"article"`` and ``"summary"``, each
            holding a list of strings of equal length.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and the ``text_target`` keyword, and exposing
            ``pad_token_id``.
        max_input_length: Maximum number of tokens per (prefixed) article.
        max_target_length: Maximum number of tokens per summary.
        padding: Padding strategy forwarded to the tokenizer. Only when it is
            ``"max_length"`` are padded label positions replaced by ``-100``.
        lang: Prompt language, ``"en"`` or ``"sk"``.

    Returns:
        The tokenizer output for the inputs, with the target token ids added
        under the key ``"labels"``.

    Raises:
        ValueError: If ``lang`` is not one of the supported languages
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # Task prefixes per language; an unknown language is rejected up front
    # instead of failing later on an undefined local variable.
    prefixes = {
        "en": "summarize: ",
        "sk": "[SK] sumarizuj: ",
    }
    if lang not in prefixes:
        raise ValueError(f"Unsupported lang {lang!r}; expected one of {sorted(prefixes)}")
    prefix = prefixes[lang]

    # add prefix to the input for t5
    inputs = [prefix + item for item in batch["article"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=batch["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # so that padded positions are ignored in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token if token != pad_id else -100) for token in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [41]:
print("Tokenizing test dataset...")

# Tokenize the test split in batches; 'orig_idx' is dropped, while the raw
# 'article' / 'summary' text is kept next to the token ids for later comparison.
ds_tokenized = dataset["test"].map(
    gigatrue_preprocess_t5,
    batched=True,
    remove_columns=["orig_idx"],
    fn_kwargs={
        "tokenizer": tokenizer,
        "padding": "max_length",
        "lang": "en",
        "max_input_length": data_args.max_input_length,
        "max_target_length": data_args.max_target_length,
    },
)

# Return torch tensors for the listed columns when indexing the dataset.
ds_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels", "article", "summary"],
)

# Sanity check on one example: shapes should equal the configured token budgets.
sample = ds_tokenized[0]
print("Input shape:", sample["input_ids"].shape)
print("Labels shape:", sample["labels"].shape)

ds_tokenized

Tokenizing test dataset...
Input shape: torch.Size([110])
Labels shape: torch.Size([35])


Dataset({
    features: ['article', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 94406
})

In [51]:
def generate_summaries(model, tokenizer, dataset, device, batch_size=1, max_length=None):
    """Generate and decode a summary for every example in ``dataset``.

    Args:
        model: Model exposing ``generate(input_ids=..., attention_mask=...,
            max_length=...)``.
        tokenizer: Tokenizer exposing ``batch_decode``.
        dataset: Map-style dataset whose items provide tensor-valued
            ``'input_ids'`` and ``'attention_mask'`` entries.
        device: Device the input tensors are moved to before generation. The
            model itself is NOT moved; the caller is responsible for placing it.
        batch_size: Number of examples per generation batch.
        max_length: Maximum length passed to ``model.generate``. When ``None``
            (the default, matching the previous behavior) the notebook-level
            ``data_args.max_target_length`` is used.

    Returns:
        List of decoded summaries (special tokens stripped), in dataset order.
    """
    if max_length is None:
        # Backward-compatible fallback to the notebook-global configuration.
        max_length = data_args.max_target_length

    # Collects the decoded summaries of all batches.
    all_summaries = []
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    print("Generating summaries...")

    # Inference only: make sure no autograd state is recorded, regardless of
    # how the model's generate() is implemented.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            summaries = model.generate(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                max_length=max_length,
            )

            # Decode token ids to text and append them in order.
            all_summaries.extend(tokenizer.batch_decode(summaries, skip_special_tokens=True))

    return all_summaries


# Generate summaries for test set
print("Generating summaries for test set...")
generated_summaries = generate_summaries(
    model, 
    tokenizer,
    ds_tokenized,
    device,
    batch_size=256,
)

# Print a sample comparison
print("\nSample comparison:")
print("Original:", ds_tokenized[0]['summary'])
print("Generated:", generated_summaries[0])

Generating summaries for test set...
Generating summaries...


  0%|          | 1/369 [00:07<46:20,  7.55s/it]


KeyboardInterrupt: 