In [None]:
# Install required libraries
!pip install transformers datasets sacrebleu evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:

# Fetch CNN/DailyMail (config "3.0.0") with all of its splits
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Shuffle every split with the same seed and keep only a small prefix of it:
# 5000 training, 1000 validation and 1000 test examples.
small_train_dataset, small_validation_dataset, small_test_dataset = (
    dataset[split_name].shuffle(seed=42).select(range(subset_size))
    for split_name, subset_size in (
        ("train", 5000),
        ("validation", 1000),
        ("test", 1000),
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import T5Tokenizer

# Truncation limits (in tokens) for the article inputs and the summary targets
max_input_length, max_target_length = 512, 150

# Tokenizer matching the "t5-small" checkpoint
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Each entry of ``examples["article"]`` is prefixed with ``"summarize: "``
    and tokenized with truncation at ``max_input_length``. The entries of
    ``examples["highlights"]`` are tokenized with truncation at
    ``max_target_length`` and their ``input_ids`` are attached to the
    returned encoding under the ``"labels"`` key.

    Relies on the module-level ``tokenizer``, ``max_input_length`` and
    ``max_target_length``.
    """
    prompts = []
    for article in examples["article"]:
        prompts.append("summarize: " + article)

    encoded = tokenizer(prompts, truncation=True, max_length=max_input_length)

    # The summaries go through the same tokenizer; only their ids are kept.
    target_encoding = tokenizer(
        examples["highlights"],
        truncation=True,
        max_length=max_target_length,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Run the batched preprocessing over the training split, then the validation split
tokenized_train_dataset, tokenized_validation_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (small_train_dataset, small_validation_dataset)
)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# This cell needs `tokenizer`, `max_input_length`, `tokenized_train_dataset`
# and `tokenized_validation_dataset` from the preprocessing cell above.

# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Seq2Seq-specific arguments (needed for `predict_with_generate`).
# `Seq2SeqTrainingArguments` is imported in this cell: it was previously used
# without any import, which raises NameError on a fresh kernel.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",                # Directory to save model checkpoints
    evaluation_strategy="epoch",           # Evaluate after each epoch
    learning_rate=5e-5,                    # Learning rate
    per_device_train_batch_size=8,         # Batch size for training
    per_device_eval_batch_size=8,          # Batch size for evaluation
    weight_decay=0.01,                     # Weight decay
    save_total_limit=2,                    # Limit saved checkpoints
    num_train_epochs=3,                    # Number of epochs
    predict_with_generate=True,            # Generate text during evaluation
    logging_dir="./logs",                  # Log directory
    logging_steps=100,                     # Log every 100 steps
    save_strategy="epoch",                 # Save model after each epoch (matches the evaluation schedule)
    load_best_model_at_end=True            # Load the best model after training
)

# Collator that pads each batch of tokenized examples
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,  # Pass the model to the data collator
    padding=True,
    max_length=max_input_length,  # Use the same max_length as in preprocessing
    pad_to_multiple_of=8,  # Optional: Pad to a multiple of 8 for better performance
)

# Seq2SeqTrainer is the trainer that pairs with Seq2SeqTrainingArguments and
# acts on `predict_with_generate`; the plain Trainer does not use that flag.
# NOTE(review): the recorded run emitted a warning pointing at the trainer
# construction -- presumably the `tokenizer=` argument being deprecated in
# favour of `processing_class=`; confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.125,1.793008
2,2.0592,1.786286
3,1.9991,1.783158


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1875, training_loss=2.06210634765625, metrics={'train_runtime': 745.9551, 'train_samples_per_second': 20.108, 'train_steps_per_second': 2.514, 'total_flos': 2030127022080000.0, 'train_loss': 2.06210634765625, 'epoch': 3.0})

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive so that
# artifacts written under that path are stored on Drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Destination directory on the mounted Google Drive
save_path = "/content/drive/My Drive/model2"

# Write the model first, then the tokenizer, into the same directory
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(save_path)

print(f"Model saved to {save_path}")


Model saved to /content/drive/My Drive/model2
