In [2]:
# !pip install --upgrade pip
# !pip install --disable-pip-version-check \
#     torch==1.13.1 \
#     torchdata==0.5.1 --quiet

# !pip install \
#     transformers==4.27.2 \
#     datasets==2.11.0 \
#     evaluate==0.4.0 \
#     rouge_score==0.1.2 \
#     peft==0.3.0 --quiet

# # Installing the Reinforcement Learning library directly from github.
# !pip install git+https://github.com/lvwerra/trl.git@25fa1bd

In [3]:
# Import the necessary libraries.
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [4]:

# Load the raw CSV through the generic "csv" loader; no split is requested here.
# NOTE(review): the path is an absolute location under /content - adjust it when
# running outside the environment this notebook was authored in.
dataset_original = load_dataset("csv",data_files="/content/train_alpaca_dataset_summary.csv")

# Bare last expression so the notebook displays the loaded object.
dataset_original

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-0ce315941ab394ef/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0ce315941ab394ef/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 178
    })
})

In [5]:
# Model id used later in this notebook to load the base seq2seq model and its tokenizer.
model_name="google/flan-t5-base"

In [6]:
def trainable_model_parameters(model):
    """
    Summarize how many of a model's parameters are trainable.

    Parameters:
    - model: Any object exposing `named_parameters()` that yields
      `(name, param)` pairs, where each `param` has `numel()` and
      `requires_grad`.

    Returns:
    - str: A three-line report (prefixed with a newline) giving the trainable
      parameter count, the total parameter count, and the trainable share as a
      percentage with two decimals.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # A model without any parameters would otherwise divide by zero; report 0%.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return (
        f"\ntrainable model parameters: {trainable_model_params}"
        f"\nall model parameters: {all_model_params}"
        f"\npercentage of trainable model parameters: {trainable_percentage:.2f}%"
    )

In [7]:
# LoRA settings for the q/v modules of the seq2seq model.
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

# Base model loaded in bfloat16.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                              torch_dtype=torch.bfloat16)

# Attach the already fine-tuned adapter published under the given hub id and keep
# it trainable so PPO can update it.
# NOTE(review): `lora_config` is forwarded here only as an extra keyword argument;
# presumably the adapter's own saved configuration is what gets applied - confirm
# that this argument has any effect.
peft_model = PeftModel.from_pretrained(model,
                                       'Sakil/flan-t5_fine_tuned_summarization_alpaca_updated_final',
                                       lora_config=lora_config,
                                       torch_dtype=torch.bfloat16,
                                       device_map="auto",
                                       is_trainable=True)

# Report the trainable/total parameter counts of the adapter-wrapped model.
print(f'PEFT model parameters to be updated:\n{trainable_model_parameters(peft_model)}\n')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/334 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

PEFT model parameters to be updated:

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%



In [9]:
# Wrap the PEFT model in trl's value-head variant; this is the policy that PPO trains.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,
                                                               torch_dtype=torch.bfloat16,
                                                               is_trainable=True)

# Show the parameter counts after adding the value head, then the head itself.
print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 3539713
all model parameters: 251117569
percentage of trainable model parameters: 1.41%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [11]:
# Reference copy of the policy, built by trl from ppo_model; it is passed to the
# PPO trainer as `ref_model` further down.
ref_model = create_reference_model(ppo_model)

# The recorded output reports zero trainable parameters for this copy.
print(f'Reference model parameters to be updated:\n{trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 251117569
percentage of trainable model parameters: 0.00%



In [12]:
# Hate-speech classifier used as the reward model, with its tokenizer.
# NOTE(review): `device_map` is passed to the tokenizer as well; it is presumably
# only meaningful for the model load - confirm and drop it from the tokenizer call.
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map="auto")
# Print the class-index -> label mapping (the recorded output is {0: 'nothate', 1: 'hate'}).
print(toxicity_model.config.id2label)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

{0: 'nothate', 1: 'hate'}


In [13]:
def collator(data):
    """
    Turn a list of per-sample dicts into a single dict of per-key lists.

    The keys (and their order) are taken from the first sample; every sample is
    expected to provide each of those keys.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

# Small demonstration: a one-sample list becomes a dict whose values are one-element lists.
test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}]
Collator output: {'key1': ['value1'], 'key2': ['value2'], 'key3': ['value3']}


In [14]:
# Tokenizer matching the policy model; used later for decoding generations and by the PPO trainer.
# NOTE(review): `device_map` is a model-loading option and presumably has no effect
# on a tokenizer - confirm and drop it.
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [15]:

def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length,
                  input_max_text_length):

    """
    Load a CSV dataset, keep the inputs whose length falls inside a window,
    turn them into summarization prompts and split into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Path of the CSV file to load. It is passed as
      `data_files` to the "csv" loader; the file must contain an "input" column.
    - input_min_text_length (int): Exclusive lower bound on the input length, in characters.
    - input_max_text_length (int): Inclusive upper bound on the input length, in characters.

    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset
      containing train and test parts, with "input_ids" and "query" columns added.
    """

    # Load the file the caller asked for (only the "train" split is needed).
    # The path used to be hard-coded here, which silently ignored `dataset_name`.
    dataset = load_dataset("csv", data_files=dataset_name, split="train")

    # Keep inputs longer than input_min_text_length and at most input_max_text_length characters.
    def within_length_window(sample):
        return input_min_text_length < len(sample["input"]) <= input_max_text_length

    dataset = dataset.filter(within_length_window, batched=False)

    # Tokenizers do not live on a device, so no device placement argument is passed.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(sample):

        # Wrap each input text with the instruction.
        prompt = f"""
Summarize the following conversation.

{sample["input"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each input and expose the columns as torch tensors.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Deterministic split: the last 20% of the rows (no shuffling) become the test part.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

In [16]:
# Build the train/test prompt dataset from the CSV, keeping inputs of more than
# 200 and at most 1000 characters.
dataset = build_dataset(model_name=model_name,
                        dataset_name='/content/train_alpaca_dataset_summary.csv',
                        input_min_text_length=200,
                        input_max_text_length=1000)

print(dataset)



Filter:   0%|          | 0/178 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'query'],
        num_rows: 61
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'query'],
        num_rows: 16
    })
})


In [30]:
# Score one example sentence with the toxicity classifier directly (no pipeline).
non_toxic_text = "I dont like the movie."

toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids
# Move the toxicity model to the same device as the input tensor.
# NOTE(review): the token ids presumably live on the CPU, so this moves the model
# there; the next cell calls the model without any further device handling.
toxicity_model.to(toxicity_input_ids.device)
logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# Get the logits for "not hate" - this is the reward!
# `not_hate_index` is also read by later cells of this notebook.
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (high): {nothate_reward}')

logits [not hate, hate]: [4.681546688079834, -4.2141923904418945]
probabilities [not hate, hate]: [0.9998630285263062, 0.0001369526144117117]
reward (high): [4.681546688079834]


In [31]:
# Score a second example sentence the same way.
# NOTE(review): the recorded output gives this sentence a "not hate" logit of about
# 4.64, practically the same as the previous example, so the "(low)" label printed
# below does not match this particular text.
toxic_text = "Today is very bad weather in Bangalore,terrible"

toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids

# Relies on the model placement and on `not_hate_index` from the previous cell.
logits = toxicity_model(toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# Get the logits for "not hate" - this is the reward!
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (low): {nothate_reward}')

logits [not hate, hate]: [4.636322498321533, -4.2341766357421875]
probabilities [not hate, hate]: [0.9998595714569092, 0.00014045278658159077]
reward (low): [4.636322498321533]


In [33]:
# Device handed to the pipeline: index 0 when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Pipeline wrapper around the toxicity classifier; used as the reward model below.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=toxicity_model_name,
    device=device,
)

# Call options that return every label's raw logit ...
reward_logits_kwargs = dict(
    top_k=None,                  # Return all scores.
    function_to_apply="none",    # Set to "none" to retrieve raw logits.
    batch_size=16,
)

# ... and the same options with softmax applied, to get probabilities instead.
reward_probabilities_kwargs = dict(
    top_k=None,                   # Return all scores.
    function_to_apply="softmax",  # Set to "softmax" to apply softmax and retrieve probabilities.
    batch_size=16,
)

# Show both output flavours for the two example sentences.
print("Reward model output:")

print("For non-toxic text")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))

print("For toxic text")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

Reward model output:
For non-toxic text
[{'label': 'nothate', 'score': 4.681546688079834}, {'label': 'hate', 'score': -4.214191436767578}]
[{'label': 'nothate', 'score': 0.9998630285263062}, {'label': 'hate', 'score': 0.00013695262896362692}]
For toxic text
[{'label': 'nothate', 'score': 4.636322498321533}, {'label': 'hate', 'score': -4.234176158905029}]
[{'label': 'nothate', 'score': 0.9998595714569092}, {'label': 'hate', 'score': 0.000140452801133506}]


In [34]:
# PPO hyper-parameters, kept as named values so they are easy to tune.
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name=model_name,
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

# The trainer receives the trainable policy, the reference copy, the tokenizer,
# the training prompts and the list-of-dicts -> dict-of-lists collator.
ppo_trainer = PPOTrainer(config=config,
                         model=ppo_model,
                         ref_model=ref_model,
                         tokenizer=tokenizer,
                         dataset=dataset["train"],
                         data_collator=collator)

In [35]:
# Each generated summary gets a token budget drawn by this sampler.
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings for generation; "max_new_tokens" is filled in per prompt below.
generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 16
}

max_ppo_steps = 10

# Label of the class whose raw logit is the reward. The pipeline output must be
# searched by label, not by list position: with `top_k` set, the entries come back
# ranked by score, so position 0 is simply whichever class scored highest and
# would hand a toxic response its "hate" logit as reward.
not_hate_label = sentiment_pipe.model.config.id2label[not_hate_index]

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break

    prompt_tensors = batch["input_ids"]

    # Get response from FLAN-T5/PEFT LLM, one prompt at a time.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()

        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

        # Keep at most the last max_new_tokens tokens of the generated sequence.
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs on the concatenated query + response texts.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    # Reward = raw logit of the `nothate` class, looked up by label for every sample.
    reward_tensors = [
        torch.tensor(next(entry["score"] for entry in reward if entry["label"] == not_hate_label))
        for reward in rewards
    ]

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    # Separator line of 99 dashes between steps.
    print('-' * 99)

1it [00:13, 13.62s/it]

objective/kl: -0.01259386446326971
ppo/returns/mean: 2.3082876205444336
ppo/policy/advantages_mean: 1.64875416430732e-07
---------------------------------------------------------------------------------------------------


2it [00:25, 12.64s/it]

objective/kl: -0.01883194036781788
ppo/returns/mean: 2.2919187545776367
ppo/policy/advantages_mean: -8.187188171859816e-08
---------------------------------------------------------------------------------------------------


3it [00:40, 13.59s/it]

objective/kl: 0.03665304183959961
ppo/returns/mean: 2.6652402877807617
ppo/policy/advantages_mean: -1.0882768464171022e-08
---------------------------------------------------------------------------------------------------





Inferencing

In [36]:
batch_size = 16
compare_results = {}

# Take up to batch_size held-out prompts.
df_batch = dataset["test"][0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

# The slice may hold fewer than batch_size rows; iterate over what is really there.
num_samples = len(compare_results["query"])

summary_tensors_ref = []
summary_tensors = []

# Get response from ppo and base model, using the same token budget for both.
for i in tqdm(range(num_samples)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len

    summary = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(num_samples)]
compare_results["response_after"] = [tokenizer.decode(summary_tensors[i]) for i in range(num_samples)]

# Label of the class whose raw logit is the reward. The pipeline output must be
# searched by label, not by list position: with `top_k` set, the entries come back
# ranked by score, so position 0 is simply whichever class scored highest.
not_hate_label = sentiment_pipe.model.config.id2label[not_hate_index]

# Reward-model scores of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [
    next(entry["score"] for entry in reward if entry["label"] == not_hate_label)
    for reward in rewards_before
]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [
    next(entry["score"] for entry in reward if entry["label"] == not_hate_label)
    for reward in rewards_after
]

100%|██████████| 16/16 [00:27<00:00,  1.72s/it]


In [37]:
# Let long query/response texts show up to 500 characters per cell.
pd.set_option('display.max_colwidth', 500)

# One row per evaluated prompt, plus the reward change achieved by PPO.
df_compare_results = pd.DataFrame(compare_results).assign(
    reward_diff=lambda frame: frame['reward_after'] - frame['reward_before']
)

# Largest improvement first, with a fresh 0..n-1 index.
df_compare_results_sorted = (
    df_compare_results
    .sort_values(by=['reward_diff'], ascending=False)
    .reset_index(drop=True)
)


In [38]:
# Bare expression: the notebook renders the sorted comparison table as this cell's output.
df_compare_results_sorted

Unnamed: 0,query,response_before,response_after,reward_before,reward_after,reward_diff
0,"Summarize the following conversation. The internet has drastically altered our lives in a variety of ways, drastically changing the ways we do business, interact with others, and consume media. The internet has made it so that communication can be instant across distances large and small, while e-commerce and online banking allow us to make purchases and payments quicker, easier, and more securely than ever before. It has given voice to formerly marginalized communities and provided access t...","<pad> Choose a tool to use in lowering another’s income; range in type. Add individual items, cards, calendars, etc. Add sophisticated icons and images. Add storytelling into existing processes.</s>",<pad> Understand why the internet has no place in our lives.</s>,2.744171,3.653286,0.909115
1,"Summarize the following conversation. Many forms of music have roots in folk styles. These include a variety of forms from different countries and regions, including bluegrass from the American South, Irish traditional music, Indian classical music and klezmer from Eastern Europe, amongst others. Summary: </s>","<pad> Tell the history of music by naming a multiculturalism or local folk or culture. Inspect these naming approaches, depending on how different kinds of music musical products fit in with local dialects. Examples include pop, country and Rockwith dancing. Compose and write music amongst folk or contemporary styles that speak to your own specific cultural traditions. It can be confusing to select such dominant musical styles based on cultural differences.</s>",<pad> Learn about folk and other folk music.</s>,3.56052,4.170738,0.610217
2,"Summarize the following conversation. The United States of America is located in North America, between Canada and Mexico. It is the world’s third-largest country, after Russia and China. The United States has 50 states and the capital is Washington, DC. Summary: </s>",<pad> Visit a U.S. country. Travel around the country.</s>,<pad> Learn about the country.</s>,2.319474,2.791952,0.472479
3,"Summarize the following conversation. The flu is an infection caused by the influenza virus. Symptoms of the flu include fever, sore throat, and body aches. To prevent the spread of the virus, people should practice good hygiene, such as washing their hands regularly. Summary: </s>",<pad> Check for symptoms of the flu. Clean up any infections.</s>,<pad> Understand the symptoms of the flu.</s>,3.40512,3.785821,0.3807
4,"Summarize the following conversation. Climate change is becoming an increasing concern across the world as temperatures continue to rise and record levels of extreme weather events are recorded. This is due to the human action of creating greenhouse gas emissions such as carbon dioxide, which trap heat and act like a blanket around the Earth’s atmosphere. The effects of climate change are already being witnessed, from floods to droughts, and are expected to even worsen if the climate is allo...","<pad> Research has shown that climate change continues to increase, and there are now new ways of making positive changes to our climate.</s>","<pad> In the last seven years, dark forests using energy production and forests retreating, climate change has widened, with 360% growth over the past decade.</s>",3.888508,4.1807,0.292192
5,Summarize the following conversation. This song is about the importance of facing our fears and insecurities. It is an uplifting anthem that encourages us to be strong and persevere even in the face of adversity. It also provides insight into the things that hold us back and how understanding ourselves can empower us to overcome these challenges. Summary: </s>,<pad> Write emojis on your desktop at home and on your phone.</s>,"<pad> Make a ""Go-Go"" playlist, or look for a ""Spring Rock"" playlist.</s>",2.940205,3.205351,0.265146
6,Summarize the following conversation. This chapter discusses an approach to online evaluation of natural language interaction systems based on procedures automatically generated from domain models. It focuses on the critical problems of automatically generating correct evaluation tests and providing meaningful scores for the evaluation of natural language understanding and dialogue management performance. Summary: </s>,<pad> Product evaluation via a user-derived online evaluation solution supports a variety of evaluation arts and needs.</s>,"<pad> The Suite Petr Postver Evaluative Processing Language so-called harmonization system blackboxes can provide coded, text-aloud technology that can accurately evaluate and analyze a broad range of discourse recognition systems past today and tomorrow. This software has been an exemplary tool for writing, testing, and evaluating natural language (RL) interaction systems for virtually anyone else.</s>",3.661752,3.831214,0.169462
7,"Summarize the following conversation. Robotics is a branch of engineering and science that deals with the design, construction, operation, and application of robots, as well as computer systems for their control, sensory feedback, and information processing. Summary: </s>",<pad> Popular online flashcard list What is robotics?</s>,<pad> Describe what robotics is and what it will be able to do in the future.</s>,4.406735,4.557851,0.151116
8,"Summarize the following conversation. Once upon a time, there was a brave prince who set off on a journey to find a magical sword that had the power to grant him any wish. He encountered many dangers along the way but eventually, with the help of a wise wizard, he was able to find the sword and make his wish come true. He became the ruler of the kingdom and lived happily ever after. Summary: </s>",<pad> Explain how they gained power. Explain the story. Discuss throughe the story how the prince placed his wish. Discuss how he became the new ruler. Describefailure and the plight of Queen Akhtar.</s>,<pad> Begin by telling the story.</s>,3.061244,3.14996,0.088716
9,"Summarize the following conversation. The paper, titled ""Exploring Voice-Based Human-Robot Interaction in Non-Technical Domains"" describes the development of an interactive AI system that enables natural language-based human-robot interaction, with a focus on providing robotic assistance to homeowners. Summary: </s>",<pad> A speech-to-robot technology that mimics microcomputer interaction is being developed in Australian and New Zealand to provide automated diagnostics for users.</s>,"<pad> Dedicated to creating companionship and convenience, MultilogicalRobotics has developed a system on Artificial-Robot Interaction (ABI) in an effort to facilitate robotic cooperation in remediation processes.</s>",3.483452,3.564383,0.08093
