In [4]:
# Install Transformers library for model and training utilities
!pip install transformers

# Install Datasets library for dataset handling
!pip install datasets

# Install PEFT for parameter-efficient fine-tuning
!pip install peft

# Install Evaluate library for evaluation metrics
!pip install evaluate

# Install ROUGE score package for summarization evaluation
!pip install rouge_score

# Install PyTorch if not already available in Colab
!pip install torch


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6ac06cc8f70b00e15217b12199fbeaf8fb5d1eb2f50ce93818a849633633fa64
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [5]:
# Install all required libraries
!pip install transformers datasets peft evaluate rouge_score torch
!pip install trl
!pip install trl accelerate transformers torch


Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.13.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.13.0


In [6]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter
from tqdm import tqdm
tqdm.pandas()


In [7]:
model_name = "google/flan-t5-base"
huggingface_dataset_name = "knkarthick/dialogsum"

dataset_original = load_dataset(huggingface_dataset_name)

dataset_original


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [8]:
def build_dataset(model_name, dataset_name, input_min_text_length, input_max_text_length):
    # Load dataset (only "train" part will be enough for this lab)
    dataset = load_dataset(dataset_name, split="train")

    # Filter the dialogues of length between input_min_text_length and input_max_text_length characters
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length)

    # Prepare tokenizer. Setting device_map="auto" allows switching between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):
        # Wrap each dialogue with the instruction
        prompt = f"""Summarize the following conversation.

{sample["dialogue"]}
Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

# Define model and dataset parameters
model_name = "google/flan-t5-base"
huggingface_dataset_name = "knkarthick/dialogsum"

# Build dataset with specified parameters
dataset = build_dataset(model_name=model_name,
                        dataset_name=huggingface_dataset_name,
                        input_min_text_length=200,  # Minimum dialogue length
                        input_max_text_length=1000)  # Maximum dialogue length

# Print the resulting dataset splits
print(dataset)


Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


In [9]:
!pip install datasets




In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from datasets import load_dataset
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, create_reference_model
from peft import LoraConfig, get_peft_model
from evaluate import load
from tqdm import tqdm
import numpy as np

# Configuration
model_name = "google/flan-t5-base"
hate_speech_model_name = "facebook/roberta-hate-speech"
dataset_name = "knkarthick/dialogsum"
input_min_text_length = 200
input_max_text_length = 1000
batch_size = 16
num_ppo_epochs = 10

# Load Dataset
def build_dataset(model_name, dataset_name, input_min_text_length, input_max_text_length):
    dataset = load_dataset(dataset_name, split="train")
    dataset = dataset.filter(lambda x: input_min_text_length <= len(x["dialogue"]) <= input_max_text_length)
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):
        prompt = f"""Summarize the following conversation.

{sample['dialogue']}
Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt, truncation=True, max_length=512)
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)
    return dataset_splits

# Build dataset
dataset_splits = build_dataset(model_name, dataset_name, input_min_text_length, input_max_text_length)
train_dataset = dataset_splits['train']
test_dataset = dataset_splits['test']

# Load Base Model and Tokenizer
base_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
reference_model = create_reference_model(base_model)

# Load Hate Speech Reward Model
toxicity_model = AutoModelForSequenceClassification.from_pretrained(hate_speech_model_name)

# PEFT Setup with LoRA
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.1
)
peft_model = get_peft_model(base_model, lora_config)

# Evaluate Toxicity
def evaluate_toxicity(model, dataset, tokenizer, toxicity_model):
    toxicity_scores = []
    for sample in tqdm(dataset):
        inputs = tokenizer(sample["query"], return_tensors="pt", truncation=True).input_ids
        outputs = model.generate(inputs, max_length=150)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Toxicity prediction
        logits = toxicity_model(inputs)["logits"].detach().cpu().numpy()
        toxicity_score = logits[0][1]  # Logit for "hate" class
        toxicity_scores.append(toxicity_score)
    return np.mean(toxicity_scores), np.std(toxicity_scores)

# PPO Training
ppo_config = PPOConfig(
    batch_size=batch_size,
    ppo_epochs=num_ppo_epochs,
    learning_rate=5e-6,
    log_with=None
)
ppo_trainer = PPOTrainer(
    model=peft_model,
    ref_model=reference_model,
    tokenizer=tokenizer,
    config=ppo_config
)

# Run PPO Training
for epoch in range(num_ppo_epochs):
    for batch in train_dataset:
        query_tensors = batch["input_ids"]
        response_tensors = peft_model.generate(query_tensors, max_length=150)
        response_texts = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]

        # Calculate rewards using the toxicity model
        logits = toxicity_model(response_tensors)["logits"].detach().cpu().numpy()
        not_hate_scores = logits[:, 0]  # Logits for "not hate" class

        # PPO step
        ppo_trainer.step(
            queries=query_tensors,
            responses=response_tensors,
            rewards=not_hate_scores
        )

# Evaluate After PPO
mean_toxicity, std_toxicity = evaluate_toxicity(peft_model, test_dataset, tokenizer, toxicity_model)
print(f"Mean Toxicity After PPO: {mean_toxicity}")
print(f"Toxicity Standard Deviation After PPO: {std_toxicity}")

# Save Final Model
peft_model.save_pretrained("./detoxified_model")


Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/10023 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

OSError: facebook/roberta-hate-speech is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`