In [None]:
!pip install transformers datasets torch accelerate evaluate



In [None]:
import inspect

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer

In [None]:
# Share of each split kept for this run, and the seed that makes the subset reproducible.
SUBSET_FRACTION = 0.05
SHUFFLE_SEED = 42


def take_fraction(split, fraction=SUBSET_FRACTION, seed=SHUFFLE_SEED):
    """Return a seeded random subset containing ``fraction`` of the rows of ``split``.

    Args:
        split: A dataset split supporting ``len``, ``shuffle(seed=...)`` and ``select``.
        fraction: Portion of the rows to keep; the row count is truncated with ``int``.
        seed: Seed forwarded to ``shuffle``.

    Returns:
        The shuffled split restricted to its first ``int(len(split) * fraction)`` rows.
    """
    subset_size = int(len(split) * fraction)
    return split.shuffle(seed=seed).select(range(subset_size))


dataset = load_dataset("multi_news")

train_ds = take_fraction(dataset["train"])
eval_ds = take_fraction(dataset["validation"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Hub id of the pretrained checkpoint used as the starting point for fine-tuning
model_name = "sshleifer/distilbart-cnn-12-6"

# Fetch the seq2seq model and the tokenizer published under that same id
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def preprocess_function(examples, max_input_length=1024, max_target_length=150):
    """Tokenize a batch of rows into fixed-length model inputs and labels.

    Args:
        examples: Batch mapping with "document" and "summary" entries, as passed
            by ``Dataset.map(..., batched=True)``.
        max_input_length: Length every tokenized document is truncated/padded to.
        max_target_length: Length every tokenized summary is truncated/padded to.

    Returns:
        The tokenizer output for the documents, with an added "labels" entry
        holding the token ids of the summaries.
    """
    # Pad to a fixed length instead of "longest": under a batched map, "longest" is
    # the longest row of each *map* batch, not of a training batch, so rows coming
    # from different map batches could differ in length and fail to stack later.
    inputs = tokenizer(
        examples["document"],
        padding="max_length",
        max_length=max_input_length,
        truncation=True,
    )
    targets = tokenizer(
        examples["summary"],
        padding="max_length",
        max_length=max_target_length,
        truncation=True,
    )
    # NOTE(review): padded label positions keep the pad token id, so they presumably
    # still contribute to the training loss - confirm whether they should be masked.
    inputs["labels"] = targets["input_ids"]
    return inputs

# Run the batched tokenization over the training subset first, then the evaluation subset
tokenized_train, tokenized_eval = (
    split.map(preprocess_function, batched=True) for split in (train_ds, eval_ds)
)

In [None]:
# Raw text columns that are not inputs for the model
RAW_TEXT_COLUMNS = ("summary", "document")


def drop_present_columns(ds, names=RAW_TEXT_COLUMNS):
    """Return ``ds`` without those columns of ``names`` that it still contains.

    Checking against ``ds.column_names`` first makes the cell safe to re-run:
    a second execution finds nothing left to drop instead of raising.
    """
    present = [name for name in names if name in ds.column_names]
    return ds.remove_columns(present) if present else ds


tokenized_train = drop_present_columns(tokenized_train)
tokenized_eval = drop_present_columns(tokenized_eval)

In [None]:
# BLEU scorer from the `evaluate` library, loaded once and reused on every evaluation pass
bleu = evaluate.load("bleu")

def compute_bleu(pred):
    """Compute corpus BLEU between model predictions and reference summaries.

    Args:
        pred: A ``(predictions, labels)`` pair. ``predictions`` may be a tuple whose
            first element holds the model output, a 3-D array of logits, or an
            array/list of token ids. ``labels`` is an array/list of token ids.

    Returns:
        ``{"bleu": <score>}`` as reported by the loaded BLEU metric.
    """
    predictions, labels = pred
    pad_id = tokenizer.pad_token_id

    # Some models return a tuple of outputs; the first element is the one scored here
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Negative ids (e.g. a -100 ignore index) cannot be decoded; map them to padding
    if isinstance(labels, np.ndarray):
        labels = np.where(labels < 0, pad_id, labels)

    if isinstance(predictions, np.ndarray):
        if predictions.ndim == 3:
            # Logits: pick the highest-scoring token id along the last axis
            predictions = np.argmax(predictions, axis=-1)
            # Where the label is padding there is no reference text, so blank the
            # prediction too; decoding then drops it together with the label padding
            # (assumes the pad token is registered as a special token).
            if isinstance(labels, np.ndarray) and labels.shape == predictions.shape:
                predictions = np.where(labels == pad_id, pad_id, predictions)
        else:
            # Already token ids: only replace undecodable negative filler values
            predictions = np.where(predictions < 0, pad_id, predictions)
        predictions = predictions.tolist()

    # Convert labels to plain lists for decoding
    labels = labels.tolist() if isinstance(labels, np.ndarray) else labels

    # Decode into text; BLEU expects a list of references per prediction
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    references = [[ref] for ref in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    result = bleu.compute(predictions=decoded_preds, references=references)

    return {"bleu": result["bleu"]}

In [None]:
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install above is unpinned,
# so pick whichever keyword the installed TrainingArguments actually accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="distilbart-finetuned-multinews",
    run_name="distilbart-multi-news-run",
    report_to="none",  # Disable wandb
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    push_to_hub=False,
    eval_accumulation_steps=4,  # Reduce memory usage
    **{eval_strategy_arg: "epoch"},  # Evaluate at the end of every epoch
)



In [None]:
# Use the GPU when one is available; fall back to CPU so the cell still runs without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Preview one tokenized example. The dataset holds plain Python lists (nothing lives
# on the GPU), and only the first ids of each field are shown to keep the output short.
sample = tokenized_train[0]
print("Debug: Sample tokenized train example (field lengths and first 20 values)")
print({key: (len(value), value[:20]) for key, value in sample.items()})

# Test a generation pass before training
inputs = tokenizer("This is a test input.", return_tensors="pt").to(device)
outputs = model.generate(**inputs)
print("Debug: Model test passed. Output:", tokenizer.batch_decode(outputs, skip_special_tokens=True))

Debug: Checking dataset tensors on GPU
Sample Tokenized Train Data: {'input_ids': [0, 45717, 37053, 34080, 14452, 18, 1354, 34, 555, 10, 299, 6794, 29322, 1741, 18, 665, 3653, 4, 1437, 50118, 1437, 50118, 16083, 1729, 6, 365, 6, 3790, 4248, 975, 226, 3586, 3765, 510, 6433, 18, 5168, 21425, 94, 186, 6, 511, 11, 5, 18424, 9, 27557, 2191, 4444, 6, 6823, 2068, 17251, 6178, 27757, 104, 6, 344, 22722, 7025, 2076, 226, 23075, 1301, 8, 26204, 975, 13548, 725, 28889, 6997, 4581, 4, 1437, 50118, 1437, 50118, 1405, 80, 5396, 6, 4729, 2444, 8041, 29615, 37053, 8, 12413, 10227, 3935, 6, 4005, 19, 49, 809, 26484, 150, 2201, 56, 69, 2549, 15158, 6, 847, 8, 25845, 71, 69, 20941, 58, 10122, 909, 4, 1437, 50118, 1437, 50118, 38659, 1437, 50118, 1437, 50118, 20, 21425, 6, 2034, 15, 4448, 14170, 2666, 6, 34, 10, 7397, 7266, 8, 16, 98, 5451, 24, 630, 75, 240, 10, 1203, 4, 1437, 50118, 1437, 50118, 14474, 6218, 8289, 23, 5, 276, 86, 25, 2201, 1602, 5, 13692, 25, 22, 1694, 8602, 845, 1437, 50118, 1437, 50118

In [None]:
# Gather the Trainer configuration in one place, then build the trainer from it
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_bleu,
)
trainer = Trainer(**trainer_config)

# Run fine-tuning; the value returned by train() is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,2.7093,2.522746,0.208086
2,2.3245,2.503051,0.210742
3,2.1291,2.524327,0.212036




TrainOutput(global_step=1686, training_loss=2.3493362372456073, metrics={'train_runtime': 792.3731, 'train_samples_per_second': 8.511, 'train_steps_per_second': 2.128, 'total_flos': 1.0439119104638976e+16, 'train_loss': 2.3493362372456073, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model and its tokenizer into the same export folder
export_dir = "distilbart-finetuned-multinews"

trainer.save_model(export_dir)
# Last expression: the tuple of written tokenizer files is shown as the cell result
tokenizer.save_pretrained(export_dir)

('distilbart-finetuned-multinews/tokenizer_config.json',
 'distilbart-finetuned-multinews/special_tokens_map.json',
 'distilbart-finetuned-multinews/vocab.json',
 'distilbart-finetuned-multinews/merges.txt',
 'distilbart-finetuned-multinews/added_tokens.json',
 'distilbart-finetuned-multinews/tokenizer.json')

In [None]:
from google.colab import files
!zip -r distilbart-finetuned-multinews.zip distilbart-finetuned-multinews
files.download("distilbart-finetuned-multinews.zip")

  adding: distilbart-finetuned-multinews/ (stored 0%)
  adding: distilbart-finetuned-multinews/merges.txt (deflated 53%)
  adding: distilbart-finetuned-multinews/generation_config.json (deflated 47%)
  adding: distilbart-finetuned-multinews/config.json (deflated 62%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/ (stored 0%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/generation_config.json (deflated 47%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/config.json (deflated 62%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/rng_state.pth (deflated 25%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/scheduler.pt (deflated 56%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/training_args.bin (deflated 52%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/model.safetensors (deflated 7%)
  adding: distilbart-finetuned-multinews/checkpoint-1686/optimizer.pt (deflated 9%)
  adding: distilbart-finetuned-multinews/checkpoi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!ls -lh distilbart-finetuned-multinews.zip

-rw-r--r-- 1 root root 11G Mar 14 07:33 distilbart-finetuned-multinews.zip


sample_data


In [None]:
# Colab-only helper: attach Google Drive to the runtime's filesystem
from google.colab import drive

drive.mount(mountpoint='/content/drive')