In [3]:
!pip install transformers datasets torch pandas scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Step 1: Load the dataset
data_path = "samsum-train.csv"  # Update with your file path
df = pd.read_csv(data_path)

# Handle missing values in the 'dialogue' column
df['dialogue'] = df['dialogue'].fillna("")  # Replace NaN with empty strings

# Step 2: Preprocess the data for abstractive summarization
def preprocess_data(row):
    """
    Prepare text-summary pairs for fine-tuning GPT-2.
    """
    dialogue = row['dialogue']
    # Using the first sentence as a naive summary (you can improve this logic with a better method)
    summary = dialogue.split('. ')[0] if len(dialogue.split('. ')) > 1 else dialogue
    return {"text": dialogue, "summary": summary}

# Apply preprocessing to the dataset
processed_data = df.apply(preprocess_data, axis=1)

# Convert processed data into a DataFrame
processed_df = pd.DataFrame(processed_data.tolist())

# Split the dataset into training and evaluation subsets
train_df, eval_df = train_test_split(processed_df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Step 3: Tokenize the data
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a special token for separating input (text) and output (summary)
tokenizer.add_special_tokens({"pad_token": "<|pad|>", "eos_token": "<|endoftext|>"})

def tokenize_function(examples):
    """
    Tokenize the text-summary pairs for GPT-2.
    Processes batched inputs when `batched=True` is specified.
    """
    inputs = [text + " <|endoftext|> " + summary + " <|endoftext|>"
              for text, summary in zip(examples["text"], examples["summary"])]
    return tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

# Tokenize the datasets
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
eval_tokenized_dataset = eval_dataset.map(tokenize_function, batched=True)

# Step 4: Fine-tune the GPT-2 model
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Adjust token embeddings for the added tokens

# Data collator for padding and causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Disable masked language modeling for causal language modeling
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_summarization",
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",  # Save after every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    gradient_accumulation_steps=8,  # Simulate a larger batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,  # Keep only the last 2 checkpoints
    fp16=True,  # Enable mixed precision for faster training
    load_best_model_at_end=True,  # Load the best model after training ends
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Step 5: Train the model
print("Starting training...")
trainer.train()

# Step 6: Save the fine-tuned model
print("Saving the model...")
trainer.save_model("./gpt2_summarization_finetuned")
tokenizer.save_pretrained("./gpt2_summarization_finetuned")
print("Model saved to ./gpt2_summarization_finetuned")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/13258 [00:00<?, ? examples/s]

Map:   0%|          | 0/1474 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(


Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
0,1.8803,1.690397
1,1.6985,1.66593
2,1.6269,1.658492


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Saving the model...
Model saved to ./gpt2_summarization_finetuned


In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2_summarization_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_summarization_finetuned")

In [6]:
import torch

def generate_summary(text, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95, num_beams=4):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=do_sample,
        top_k=top_k,
        top_p=top_p,
        num_beams=num_beams,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    summaries = []
    for output_id in output_ids:
        summary = tokenizer.decode(output_id, skip_special_tokens=True)
        summaries.append(summary)

    return summaries

In [12]:
example_text = "Alice, who falls down a rabbit hole into a fantastical world where she encounters an array of eccentric characters and experiences bizarre events. As Alice navigates Wonderland, she meets creatures like the White Rabbit, the Cheshire Catms and the importance of imagination/"
summary = generate_summary(example_text)[0]
print("Original text:", example_text)
print("Generated summary:", summary)

Original text: Alice, who falls down a rabbit hole into a fantastical world where she encounters an array of eccentric characters and experiences bizarre events. As Alice navigates Wonderland, she meets creatures like the White Rabbit, the Cheshire Catms and the importance of imagination/
Generated summary: Alice, who falls down a rabbit hole into a fantastical world where she encounters an array of eccentric characters and experiences bizarre events. As Alice navigates Wonderland, she meets creatures like the White Rabbit, the Cheshire Catms and the importance of imagination/experience in everyday life.
Alice, who falls down a rabbit hole into a fantastical world where she encounters an array of eccentric characters and experiences bizarre events. As Alice navigates Wonderland, she meets creatures like the White Rabbit, the Ches


In [18]:
import torch
from transformers import Trainer, TrainingArguments
import evaluate

def compute_metrics(eval_pred):
    """
    Compute evaluation metrics for the fine-tuned model.
    """
    logits, labels = eval_pred

    # Calculate the loss
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

    # Calculate ROUGE scores
    rouge = evaluate.load("rouge")
    decoded_preds = tokenizer.batch_decode(torch.argmax(logits, axis=-1), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Return the loss and ROUGE scores
    return {
        "loss": loss.item(),
        "rouge1": rouge_result["rouge1"].mid.fmeasure,
        "rouge2": rouge_result["rouge2"].mid.fmeasure,
        "rougeL": rouge_result["rougeL"].mid.fmeasure,
    }

# Set up the evaluation arguments
eval_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    do_eval=True
)

# Create the Trainer for evaluation
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=eval_tokenized_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation results:")
print(f"Loss: {eval_results['loss']:.4f}")
print(f"ROUGE-1: {eval_results['rouge1']:.4f}")
print(f"ROUGE-2: {eval_results['rouge2']:.4f}")
print(f"ROUGE-L: {eval_results['rougeL']:.4f}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 97.06 MiB is free. Process 2953 has 14.65 GiB memory in use. Of the allocated memory 14.04 GiB is allocated by PyTorch, and 491.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [14]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [20]:
del large_tensor
torch.cuda.empty_cache()


NameError: name 'large_tensor' is not defined