<a href="https://colab.research.google.com/github/NjagiKevin/Airflow-Astro-ETL-Pipeline/blob/main/abstractive_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# RESUME WORK QUICKLY: Setup environment and reload everything
# Reloads the saved model/tokenizer and the tokenized splits from Google Drive.

from google.colab import drive
drive.mount('/content/drive')

from transformers import PegasusTokenizer, AutoModelForSeq2SeqLM
from datasets import load_from_disk
import torch
from nltk.tokenize import sent_tokenize

# Tokenizer and model are both read from the same saved directory on Drive
tokenizer = PegasusTokenizer.from_pretrained("/content/drive/MyDrive/pegasus-wikilingua-best")
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/pegasus-wikilingua-best")

# Prefer the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Tokenized splits, one sub-directory per split
train, val, test = (
    load_from_disk(f"/content/drive/MyDrive/tokenized_data/{split_name}")
    for split_name in ("train", "validation", "test")
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install evaluate datasets --upgrade rouge_score



In [None]:
# import os
# import shutil

# # Loop through all files/folders in /content
# for item in os.listdir("/content"):
#     item_path = os.path.join("/content", item)

#     # Skip Google Drive mount
#     if item == "drive":
#         continue

#     try:
#         if os.path.isfile(item_path):
#             os.remove(item_path)
#         else:
#             shutil.rmtree(item_path)
#     except Exception as e:
#         print(f"❌ Could not delete {item_path}: {e}")


## Imports and Setup

In [None]:
import warnings
from datasets import load_dataset, Dataset,load_from_disk, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM,
    PegasusTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    TrainerCallback,
)
from nltk.tokenize import sent_tokenize
import nltk
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import islice
from evaluate import load as load_metric
from google.colab import drive
from huggingface_hub import login
import getpass
import wandb
import torch
# Suppress tqdm warnings: ignore UserWarning-category warnings whose module matches "tqdm"
warnings.filterwarnings("ignore", category=UserWarning, module="tqdm")
# Download NLTK data packages; presumably these back sent_tokenize (imported above) —
# TODO confirm which of the two the installed NLTK version actually needs.
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Set the PYTORCH_CUDA_ALLOC_CONF environment variable to "max_split_size_mb:64".
# NOTE(review): presumably this only takes effect if set before PyTorch first
# allocates CUDA memory; the resume cell at the top of the notebook already moves
# a model to the GPU — confirm the intended execution order.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

## Login and Download

In [None]:
# NOTE(review): "punkt" is also downloaded in the imports cell; this repeat looks redundant.
nltk.download("punkt")

# Hugging Face login: the token is read interactively (not stored in the notebook)
hf_token = getpass.getpass("Enter your Hugging Face token: ")
login(token=hf_token)

# Mount Google Drive for persistent storage
drive.mount('/content/drive')
# Drive directory under which the tokenized train/validation/test splits are cached
CACHE_DIR = "/content/drive/MyDrive/tokenized_data"


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter your Hugging Face token: ··········
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load and Subset Dataset

In [None]:
# English configuration of the GEM WikiLingua dataset
dataset = load_dataset("GEM/wiki_lingua", "en")

# Keep only the leading rows of each split: 1000 train, 500 validation, 500 test
train_dataset, val_dataset, test_dataset = (
    dataset[split_name].select(range(num_rows))
    for split_name, num_rows in (("train", 1000), ("validation", 500), ("test", 500))
)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train: 1000, Val: 500, Test: 500


## Tokenizer

In [None]:
model_checkpoint = "google/pegasus-large"
tokenizer_path = "./cached_tokenizers/pegasus-large"

# Download and cache the tokenizer on first use; afterwards read the local copy
if not os.path.exists(tokenizer_path):
    print("⬇️ Downloading tokenizer...")
    tokenizer = PegasusTokenizer.from_pretrained(model_checkpoint)
    tokenizer.save_pretrained(tokenizer_path)
else:
    print("✅ Loading tokenizer from local cache...")
    tokenizer = PegasusTokenizer.from_pretrained(tokenizer_path)

# Token limits applied when tokenizing sources (inputs) and targets (labels)
max_input_length, max_target_length = 512, 30

⬇️ Downloading tokenizer...


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: A batch mapping with "source" and "target" columns; each
            value is converted to ``str`` before tokenization.

    Returns:
        The tokenizer output for the sources (truncated/padded to
        ``max_input_length``) with an added "labels" entry holding the target
        token ids (truncated/padded to ``max_target_length``). Padding
        positions in the labels are set to -100 so that they are ignored by
        the loss; ``compute_metrics`` maps -100 back to the pad id before
        decoding.

    Note:
        The notebook caches tokenized splits on disk. Delete that cache to
        re-tokenize after changing this function.
    """
    sources = [str(x) for x in examples["source"]]
    targets = [str(x) for x in examples["target"]]

    model_inputs = tokenizer(
        sources,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Labels are padded to a fixed length above; without masking, every pad
    # token would be scored as a real target token during training.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

## Tokenize and Cache

In [None]:
# Reuse the tokenized splits cached under CACHE_DIR when all three are present;
# otherwise tokenize the raw subsets and write them to the cache.
if all(
    os.path.exists(f"{CACHE_DIR}/{split_name}")
    for split_name in ("train", "validation", "test")
):
    print("✅ Loading tokenized datasets from disk...")
    tokenized_datasets = {
        split_name: load_from_disk(f"{CACHE_DIR}/{split_name}")
        for split_name in ("train", "validation", "test")
    }
else:
    print("🔄 Tokenizing datasets and saving to disk...")

    # Tokenize train, validation and test in that order, dropping the raw text columns
    tokenized_train, tokenized_val, tokenized_test = (
        raw_split.map(preprocess_function, batched=True, remove_columns=["source", "target"])
        for raw_split in (train_dataset, val_dataset, test_dataset)
    )

    # Persist each split in its own sub-directory of the cache
    tokenized_train.save_to_disk(f"{CACHE_DIR}/train")
    tokenized_val.save_to_disk(f"{CACHE_DIR}/validation")
    tokenized_test.save_to_disk(f"{CACHE_DIR}/test")

    tokenized_datasets = dict(
        train=tokenized_train,
        validation=tokenized_val,
        test=tokenized_test,
    )

print("✅ Tokenized datasets are ready.")

✅ Loading tokenized datasets from disk...
✅ Tokenized datasets are ready.


## Lead-3 Baseline ROUGE Evaluation

In [None]:
from evaluate import load
from nltk.tokenize import sent_tokenize
import pandas as pd

# Load the ROUGE metric via evaluate.load; this module-level object is the one
# evaluate_baseline (defined below) calls .compute() on.
rouge = load("rouge")

# Helper: extract first 3 sentences
def three_sentence_summary(text):
    """Return at most the first three sentences of ``text``, one per line.

    The text is stripped of surrounding whitespace, split with
    ``sent_tokenize``, and the leading sentences are joined with newlines.
    """
    sentences = sent_tokenize(text.strip())
    leading = sentences[:3]
    return "\n".join(leading)

# Evaluation using simple Lead-3 baseline
def evaluate_baseline(data, use_stemmer=True):
    """Score the Lead-3 baseline with ROUGE.

    Predictions are the first three sentences of each source. Both predictions
    and references are formatted as newline-separated sentences and scored
    with the stemmer enabled by default, mirroring how ``compute_metrics``
    scores the fine-tuned model, so the two sets of ROUGE numbers are
    comparable.

    Args:
        data: Either a plain ``dict`` of columns (wrapped in a DataFrame) or
            any object that supports ``data["source"]`` / ``data["target"]``
            column access, such as a pandas DataFrame.
        use_stemmer: Forwarded to the ROUGE metric's ``compute`` call.

    Returns:
        Dict mapping "rouge1", "rouge2", "rougeL" and "rougeLsum" to the
        metric value scaled to a percentage and rounded to two decimals.
    """
    # Only plain dicts are converted; anything else is used as-is
    df = pd.DataFrame(data) if isinstance(data, dict) else data

    # Apply baseline summarizer (already newline-separated sentences)
    preds = [three_sentence_summary(str(src)) for src in df["source"]]
    # rougeLsum treats newlines as sentence boundaries, so references get the
    # same sentence-per-line formatting as the predictions.
    refs = ["\n".join(sent_tokenize(str(tgt).strip())) for tgt in df["target"]]

    # Compute ROUGE
    results = rouge.compute(
        predictions=preds,
        references=refs,
        use_stemmer=use_stemmer,
    )

    return {
        k: round(results[k] * 100, 2)
        for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    }

# Run baseline on the validation subset selected in the "Load and Subset Dataset"
# cell. Reusing val_dataset (instead of re-selecting range(500) here) keeps the
# baseline on the same rows if the subset size is ever changed.
val_df = val_dataset.to_pandas()
baseline_scores = evaluate_baseline(val_df)

# Display nicely: one row per ROUGE variant
rouge_df = pd.DataFrame.from_dict(baseline_scores, orient='index', columns=["Lead-3 F1 Score (%)"])
rouge_df.index.name = "ROUGE Type"

print("\n📊 Lead-3 Baseline ROUGE Scores:")
print(rouge_df)



📊 Lead-3 Baseline ROUGE Scores:
            Lead-3 F1 Score (%)
ROUGE Type                     
rouge1                    22.66
rouge2                     5.59
rougeL                    15.22
rougeLsum                 17.13


## Load Pegasus Model (cached)

In [None]:
def load_or_cache_model_and_tokenizer(model_checkpoint, cache_base_dir="/content/drive/MyDrive"):
    """Return ``(tokenizer, model)`` for a checkpoint, using per-model cache folders.

    The cache folders are ``<cache_base_dir>/cached_tokenizers/<name>`` and
    ``<cache_base_dir>/cached_models/<name>``, where ``<name>`` is the part of
    ``model_checkpoint`` after the last "/". Both folders are created if
    missing. A non-empty folder is loaded from; an empty one triggers a
    download from ``model_checkpoint`` followed by a save into that folder.

    Args:
        model_checkpoint: Checkpoint identifier, e.g. "google/pegasus-large".
        cache_base_dir: Directory that holds the two cache trees.

    Returns:
        Tuple of ``(tokenizer, model)``.
    """
    # Short name for model (e.g. "pegasus-large")
    short_name = model_checkpoint.split("/")[-1]

    model_dir = os.path.join(cache_base_dir, "cached_models", short_name)
    tokenizer_dir = os.path.join(cache_base_dir, "cached_tokenizers", short_name)
    for cache_dir in (model_dir, tokenizer_dir):
        os.makedirs(cache_dir, exist_ok=True)

    # Tokenizer: an empty folder means nothing has been cached yet
    if not os.listdir(tokenizer_dir):
        print("⬇️ Downloading tokenizer...")
        tokenizer = PegasusTokenizer.from_pretrained(model_checkpoint)
        tokenizer.save_pretrained(tokenizer_dir)
    else:
        print("✅ Loading tokenizer from cache...")
        tokenizer = PegasusTokenizer.from_pretrained(tokenizer_dir)

    # Model: same rule as for the tokenizer
    if not os.listdir(model_dir):
        print("⬇️ Downloading model...")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        model.save_pretrained(model_dir)
    else:
        print("✅ Loading model from cache...")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

    return tokenizer, model

In [None]:
# Use the function: rebinds model_checkpoint, tokenizer and model for fine-tuning.
# NOTE(review): the datasets were tokenized earlier with the tokenizer loaded for
# "google/pegasus-large"; presumably both checkpoints share one vocabulary — confirm.
model_checkpoint = "google/pegasus-xsum"
tokenizer, model = load_or_cache_model_and_tokenizer(model_checkpoint)

⬇️ Downloading tokenizer...


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

⬇️ Downloading model...


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]



## Training Setup

In [None]:
# Ensure logging directory exists
os.makedirs("./logs", exist_ok=True)

batch_size = 1
num_train_epochs = 5  # Reduced for quicker testing
gradient_accumulation_steps = 4
model_name = model_checkpoint.split("/")[-1]
# Log once per epoch. logging_steps is counted in optimizer updates, and one
# update consumes batch_size * gradient_accumulation_steps examples, so the
# accumulation factor has to be part of the divisor. (Dividing by batch_size
# alone gave 1000 here while an epoch is only 250 updates, which is why the
# training loss showed "No log" until epoch 4.)
logging_steps = max(
    1,
    len(tokenized_datasets["train"]) // (batch_size * gradient_accumulation_steps),
)


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, generate
# summaries during evaluation, and reload the checkpoint with the best rougeL
# at the end of training.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-wikilingua",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    gradient_accumulation_steps=gradient_accumulation_steps,     # Simulates a batch size of 4
    logging_steps=logging_steps,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_dir="./logs",
    # Run identifier; spelling corrected from "megasus" so it matches the model name
    run_name="pegasus-xsum-finetuned-wikilingua-v1",
)

In [None]:
# Load the ROUGE metric (load_metric is evaluate.load, aliased in the imports cell);
# compute_metrics calls rouge_score.compute() on this object.
rouge_score = load_metric("rouge")

In [None]:
# Set up early stopping with a patience of 2; this list is passed to the trainer
# through its callbacks argument.
callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

In [None]:
# Define compute_metrics function with W&B logging
def compute_metrics(eval_pred):
    """Compute ROUGE for generated summaries during evaluation.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key returned by the metric to its value scaled
        to a percentage and rounded to four decimals. The same dict is logged
        to Weights & Biases when a run is active.
    """
    predictions, labels = eval_pred

    # Some model outputs arrive as a tuple; the token ids are the first element
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is not a valid token id, so map it to the pad id before decoding.
    # Labels use -100 for ignored positions; predictions are sanitized the
    # same way in case they were padded with -100 when batches were gathered.
    predictions = np.where(predictions != -100, predictions, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Sentence formatting for ROUGE: one sentence per line (used by rougeLsum)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Handle different return types (either a float or an object exposing .mid.fmeasure)
    result = {k: (v.mid.fmeasure * 100 if hasattr(v, "mid") else v * 100)
              for k, v in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}

    # Log to Weights & Biases only when a run is active, so evaluation still
    # works when W&B has not been initialised.
    if wandb.run is not None:
        wandb.log(result)

    return result

## Trainer & Training

In [None]:
# Ask PyTorch to release its cached CUDA memory before the trainer is built
torch.cuda.empty_cache()

In [None]:
# Seq2seq batch collator; it is given the model in addition to the tokenizer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire together the model, training arguments, tokenized train/validation splits,
# collator, ROUGE metric function and early-stopping callbacks.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

## Start Training

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell's output
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkevinnjagi83[0m ([33mnjagi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.367954,30.4553,11.3834,25.2269,28.5632
2,No log,3.237756,33.491,12.8734,28.0501,31.7266
3,No log,3.184441,35.4595,13.9386,29.6858,33.7378
4,3.302200,3.150419,36.0957,14.598,30.1525,34.3278
5,3.302200,3.140076,35.8655,14.3847,30.197,34.2378


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1250, training_loss=3.236039599609375, metrics={'train_runtime': 6598.3351, 'train_samples_per_second': 0.758, 'train_steps_per_second': 0.189, 'total_flos': 7223661035520000.0, 'train_loss': 3.236039599609375, 'epoch': 5.0})

## Save Model and Tokenizer to Google Drive

In [None]:
# Save the trainer's model and the tokenizer into the same Drive folder
# (the resume cell at the top of the notebook loads both from this path).
model_path = "/content/drive/MyDrive/pegasus-wikilingua-best"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)


('/content/drive/MyDrive/pegasus-wikilingua-best/tokenizer_config.json',
 '/content/drive/MyDrive/pegasus-wikilingua-best/special_tokens_map.json',
 '/content/drive/MyDrive/pegasus-wikilingua-best/spiece.model',
 '/content/drive/MyDrive/pegasus-wikilingua-best/added_tokens.json')

## Log Predictions to W&B

In [None]:
def log_predictions_to_wandb(n=5, split="validation", max_source_tokens=256,
                             max_new_tokens=60, num_beams=4):
    """Generate summaries for the first ``n`` examples of a split and log them to W&B.

    Args:
        n: Number of leading examples to use; capped at the size of the split.
        split: Key of the split in the module-level ``dataset``.
        max_source_tokens: Truncation length applied to the source texts.
        max_new_tokens: Maximum number of tokens generated per summary.
        num_beams: Beam width used for generation.

    Side effects:
        Logs a ``wandb.Table`` with columns "Source" (first 300 characters),
        "Target" and "Prediction" under the key "Sample Predictions". The
        table is empty when there are no examples to summarise.
    """
    table = wandb.Table(columns=["Source", "Target", "Prediction"])

    # Never ask for more rows than the split has (or fewer than zero)
    n = max(0, min(n, len(dataset[split])))
    sample_batch = dataset[split].select(range(n))

    # Materialise the columns as plain lists of strings for the tokenizer
    sources = [str(src) for src in sample_batch["source"]]
    targets = [str(tgt) for tgt in sample_batch["target"]]

    if sources:
        inputs = tokenizer(
            sources,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_source_tokens
        ).to(model.device)

        # Generate in eval mode so dropout is off, then restore the previous
        # mode so calling this between training steps does not change it.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    num_beams=num_beams
                )
        finally:
            if was_training:
                model.train()

        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for src, tgt, pred in zip(sources, targets, preds):
            table.add_data(src[:300], tgt, pred)

    wandb.log({"Sample Predictions": table})


## Download the Model from Google Drive

In [None]:
# Recursively archive the saved model folder into /content/pegasus-wikilingua-best.zip.
# The folder is given as an absolute path, so entries are stored under
# content/drive/MyDrive/pegasus-wikilingua-best/ inside the archive.
!zip -r /content/pegasus-wikilingua-best.zip /content/drive/MyDrive/pegasus-wikilingua-best

  adding: content/drive/MyDrive/pegasus-wikilingua-best/ (stored 0%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/config.json (deflated 61%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/generation_config.json (deflated 42%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/model.safetensors (deflated 7%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/tokenizer_config.json (deflated 94%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/special_tokens_map.json (deflated 82%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/spiece.model (deflated 50%)
  adding: content/drive/MyDrive/pegasus-wikilingua-best/training_args.bin (deflated 51%)


In [None]:
from google.colab import files
# Hand the archive created by the zip cell to Colab's file download helper
files.download("/content/pegasus-wikilingua-best.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>