In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoints under comparison, keyed by the short label used in the rest of the notebook.
# NOTE(review): the name "google/pegasus-xsum" suggests this checkpoint is already
# fine-tuned on XSum, unlike the other two -- confirm that is intended for the comparison.
model_checkpoints = dict(
    BART="facebook/bart-base",
    T5="t5-small",
    PEGASUS="google/pegasus-xsum",
)

# Registries filled by the loop below: short label -> loaded tokenizer / loaded model
tokenizers, models = {}, {}

print("--- Loading Models and Tokenizers ---")

for model_name, checkpoint in model_checkpoints.items():
    print(f"Loading {model_name} from checkpoint: {checkpoint}")

    # Tokenizer first, then the pre-trained seq2seq model, both from the same checkpoint
    tokenizers[model_name] = AutoTokenizer.from_pretrained(checkpoint)
    models[model_name] = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

print("\n--- All models and tokenizers loaded successfully! ---")
print("\nTokenizers loaded:", tokenizers.keys())
print("Models loaded:", models.keys())

In [None]:
from datasets import load_dataset, DatasetDict

# --- Re-load our data from Step 2 ---
# Direct URLs to the script-free Parquet files, one file per split
data_files = dict(
    train="https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet",
    validation="https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/refs%2Fconvert%2Fparquet/default/validation/0000.parquet",
    test="https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet",
)

# Point the generic "parquet" builder straight at those files
print("Loading XSum dataset from direct Parquet URLs...")
dataset = load_dataset("parquet", data_files=data_files)

# --- Re-sample our data from Step 2 ---
# Number of rows kept per split after a seeded shuffle
sample_sizes = {"train": 5000, "validation": 1000, "test": 1000}

# Shuffle each split with the same fixed seed, then keep its first `size` rows
sampled_dataset = DatasetDict({
    split: dataset[split].shuffle(seed=42).select(range(size))
    for split, size in sample_sizes.items()
})

# Per-split handles under the names used elsewhere in the notebook
train_sample = sampled_dataset["train"]
validation_sample = sampled_dataset["validation"]
test_sample = sampled_dataset["test"]

print("\n--- Final Sampled DatasetDict ---")
print(sampled_dataset)

In [None]:
# Set max lengths for our input and output
# We truncate the article to 512 tokens and the summary to 128
max_input_length = 512
max_target_length = 128


def preprocess_function(examples, tokenizer, model_name, label_pad_token_id=-100):
    """Tokenize one batch of examples into model inputs plus a ``labels`` field.

    Args:
        examples: Batch mapping with a "document" and a "summary" entry, each a
            list of strings (the shape ``Dataset.map(batched=True)`` passes in).
        tokenizer: Callable tokenizer for the target checkpoint. It is called
            once with the source texts and once with ``text_target=`` for the
            summaries.
        model_name: Short model label; "T5" causes the "summarize: " prefix to
            be prepended to every document.
        label_pad_token_id: Value written over padding positions in the labels
            (default -100, the index the Hugging Face seq2seq losses ignore).
            Pass ``None`` to keep the tokenizer's raw pad ids in the labels.

    Returns:
        The tokenizer output for the documents, with an added "labels" entry
        holding the tokenized (and, by default, pad-masked) summaries.
    """
    # --- Prepare Input ---
    documents = examples["document"]
    if model_name == "T5":
        # T5 models require a "summarize: " prefix
        inputs = ["summarize: " + doc for doc in documents]
    else:
        inputs = documents

    # Tokenize the input documents
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="longest",
    )

    # --- Prepare Target ---
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
        padding="longest",
    )
    label_ids = labels["input_ids"]

    # Padding was added above, so blank it out of the labels; otherwise the loss
    # would also be computed on pad positions.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if label_pad_token_id is not None and pad_token_id is not None:
        label_ids = [
            [label_pad_token_id if token == pad_token_id else token for token in sequence]
            for sequence in label_ids
        ]

    # The model needs the tokenized targets to be named 'labels'
    model_inputs["labels"] = label_ids
    return model_inputs

print("Preprocessing function defined.")

In [None]:
# Short model label -> tokenized copy of the sampled DatasetDict
tokenized_datasets = {}

print("--- Applying preprocessing to all 3 model types ---")

for model_name, tokenizer in tokenizers.items():
    print(f"Tokenizing data for {model_name}...")

    # .map() runs preprocess_function over every split in batches; the extra
    # 'tokenizer' and 'model_name' arguments are forwarded through fn_kwargs.
    tokenized_datasets[model_name] = sampled_dataset.map(
        preprocess_function,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "model_name": model_name},
    )

print("\n--- Preprocessing complete! ---")
print("Tokenized datasets created:", list(tokenized_datasets.keys()))
print("\nExample of tokenized data (BART):")
print(tokenized_datasets["BART"]['train'][0])

In [None]:
print("--- Saving tokenized datasets to disk ---")

# The loop variable is deliberately not called `dataset`: that name is already
# bound at notebook level to the raw XSum DatasetDict, and rebinding it here
# would leave it pointing at the last tokenized dataset after this cell runs.
for model_name, model_dataset in tokenized_datasets.items():
    # One output directory per model label
    output_dir = f"./tokenized_data/{model_name}"
    print(f"Saving {model_name} dataset to {output_dir}")
    model_dataset.save_to_disk(output_dir)

print("\n--- All processed datasets saved! ---")