Install BLEURT (PyTorch port) as a library from GitHub

In [1]:
!pip install git+https://github.com/lucadiliello/bleurt-pytorch.git


Collecting git+https://github.com/lucadiliello/bleurt-pytorch.git
  Cloning https://github.com/lucadiliello/bleurt-pytorch.git to c:\users\alann\appdata\local\temp\pip-req-build-i09sut41
  Resolved https://github.com/lucadiliello/bleurt-pytorch.git to commit 279ca1bb4106bde5a89f0f82723197e23d8446cb
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/lucadiliello/bleurt-pytorch.git 'C:\Users\alann\AppData\Local\Temp\pip-req-build-i09sut41'

[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import logging
logging.set_verbosity_error()   # only show errors from Transformers (warnings are silenced as well)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer

# Hub identifier of the BLEURT checkpoint; the tokenizer and the model are both
# loaded from this same identifier.
BLEURT_CHECKPOINT = "lucadiliello/BLEURT-20-D12"

tokenizer = BleurtTokenizer.from_pretrained(BLEURT_CHECKPOINT)
model = BleurtForSequenceClassification.from_pretrained(BLEURT_CHECKPOINT)

Simple test

In [4]:
# One reference sentence and one candidate sentence, used only for the
# single-pair sanity check at the end of this cell.
refs = ["The dog barked happily."]
cands = ["There was a mat which the cat sat on."]

def pair_bleurt(ref: str, cand: str) -> float:
    """Score a single candidate text against a single reference text with BLEURT.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        ref: The reference text.
        cand: The candidate text to compare against ``ref``.

    Returns:
        The model's output logit for the pair, as a plain Python float.
    """
    import torch  # local import: no earlier cell imports torch directly

    # tokenize as a pair—this will insert the proper [SEP] tokens
    # (inputs longer than 512 tokens are truncated)
    inputs = tokenizer(ref, cand, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: disable autograd so no graph is recorded for each of the
    # many scoring calls (saves memory and time; the score itself is unchanged).
    with torch.no_grad():
        # forward pass returns a (1,1) tensor of logits
        score = model(**inputs).logits.squeeze().item()
    return score

# Smoke test: score the one reference/candidate pair; the value is shown as the cell output.
pair_bleurt(refs[0], cands[0])

0.28885722160339355

Load the reference datasets and the model generations of each evaluated method

In [5]:
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import pandas as pd
import os

def read_csv_to_dict(file_path):
    """Read a CSV file and return its rows as a list of ``{column: value}`` dicts.

    Args:
        file_path: Path of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        One dict per row, in file order (``DataFrame.to_dict(orient='records')``).
    """
    return pd.read_csv(file_path).to_dict(orient='records')

# Maps a short method label to the CSV file of that method's evaluation records.
# The labels become the keys of `evaluation_files` and of both similarity-result
# dicts; the file names are joined onto `evaluation_path` when loaded.
# NOTE(review): the "*_checklist" labels point at "*_checkpoint.csv" files —
# confirm which of the two names is the intended one.
csv_files = {
    "base_prompt": "strongreject_evaluation1.csv",
    "rule_syn": "strongreject_evaluation2.csv",
    "rule_syn_prefix": "strongreject_evaluation3.csv",
    "base_nous": "strongreject_evaluation_nous.csv",
    "reward_1": "strongreject_evaluation4.csv",
    "reward_2": "strongreject_evaluation5.csv",
    "reward_3": "strongreject_evaluation_rl3.csv",
    "reward_2_checklist": "strongreject_evaluation_rl2_checkpoint.csv",
    "reward_3_checklist": "strongreject_evaluation_rl3_checkpoint.csv",
}

# Directories holding the per-method evaluation CSVs and the reference datasets.
evaluation_path = "."
dataset_path = "../dataset"

# Load every per-method CSV as a list of row dicts, keyed by method label.
evaluation_files = {}
for key, file_name in csv_files.items():
    csv_location = os.path.join(evaluation_path, file_name)
    evaluation_files[key] = read_csv_to_dict(csv_location)

# Load the two reference datasets the generations are compared against.
beaver_csv = os.path.join(dataset_path, "test_dataset.csv")
jailbreak_csv = os.path.join(dataset_path, "jailbreak_eval_dataset.csv")
beaver_responses = read_csv_to_dict(beaver_csv)
jailbreak_responses = read_csv_to_dict(jailbreak_csv)

# Remove repetitive prompts from the evaluation files and responses
def remove_repetitive_prompts(data):
    """Drop records whose prompt has already been seen, keeping the first one.

    The prompt of a record is the first truthy value among its
    ``'original_prompt'``, ``'Goal'`` and ``'prompt'`` entries (``None`` when
    none of them is truthy, so such records collapse into a single one).

    Args:
        data: Iterable of dict records.

    Returns:
        A new list with one record per distinct prompt, in input order.
    """
    # A dict remembers insertion order, and setdefault never overwrites an
    # existing key, so each prompt stays bound to its first record.
    first_record_by_prompt = {}
    for record in data:
        prompt_key = record.get('original_prompt') or record.get('Goal') or record.get('prompt')
        first_record_by_prompt.setdefault(prompt_key, record)
    return list(first_record_by_prompt.values())

# De-duplicate every loaded table by prompt (the first occurrence is kept).
evaluation_files = {
    method: remove_repetitive_prompts(records)
    for method, records in evaluation_files.items()
}
beaver_responses, jailbreak_responses = (
    remove_repetitive_prompts(beaver_responses),
    remove_repetitive_prompts(jailbreak_responses),
)


Check that the number of records in each file is as expected

In [6]:
# Report how many records each method and each reference dataset contains.
for method, records in evaluation_files.items():
    print(f"Key: {method} Length: {len(records)}")

print(f"Beaver Responses Length: {len(beaver_responses)}")
print(f"Jailbreak Responses Length: {len(jailbreak_responses)}")

Key: base_prompt Length: 195
Key: rule_syn Length: 188
Key: rule_syn_prefix Length: 189
Key: base_nous Length: 195
Key: reward_1 Length: 194
Key: reward_2 Length: 195
Key: reward_3 Length: 195
Key: reward_2_checklist Length: 195
Key: reward_3_checklist Length: 195
Beaver Responses Length: 95
Jailbreak Responses Length: 100


Running evaluation using BLEURT similarity metric

In [7]:
from tqdm import tqdm

# Index the reference texts by prompt once, so each generation is matched with a
# dictionary lookup instead of a scan over every reference row. setdefault keeps
# the first row if a prompt were ever to appear more than once.
beaver_reference_by_prompt = {}
for response in beaver_responses:
    beaver_reference_by_prompt.setdefault(response.get('prompt'), response.get('response'))

jailbreak_target_by_goal = {}
for response in jailbreak_responses:
    jailbreak_target_by_goal.setdefault(response.get('Goal'), response.get('Target'))

# Mean BLEURT score per method, one dict per reference dataset.
similarity_results_beaver = {key: 0 for key in evaluation_files.keys()}
similarity_results_jailbreak = {key: 0 for key in evaluation_files.keys()}

for key, evaluation_list in evaluation_files.items():
    # Count only the pairs that were actually scored, so that a generation with
    # no matching reference (or a missing target) does not dilute the mean.
    count_beaver = 0
    count_jailbreak = 0
    print("Key:", key, "Length:", len(evaluation_list))
    for evaluation in tqdm(evaluation_list):
        prompt = evaluation.get('original_prompt')
        # Check if dataset is beavertails or jailbreakbench
        if evaluation['dataset'] == "beavertails":
            # Ensure prompts are matched before calculating similarity between responses
            if prompt in beaver_reference_by_prompt:
                similarity = pair_bleurt(beaver_reference_by_prompt[prompt], evaluation.get('target_response'))
                similarity_results_beaver[key] += similarity
                count_beaver += 1

        elif evaluation['dataset'] == "jailbreakbench":
            if prompt in jailbreak_target_by_goal:
                target = jailbreak_target_by_goal[prompt]
                # pandas loads an empty CSV cell as NaN rather than None, so test
                # with pd.isna (which is also True for None).
                if not pd.isna(target):
                    similarity = pair_bleurt(target, evaluation.get('target_response'))
                    similarity_results_jailbreak[key] += similarity
                    count_jailbreak += 1
                else:
                    print("No target found for prompt: ", evaluation.get('original_prompt'))
                    print("Target: ", target)

    # Turn the running sums into means; NaN (instead of a ZeroDivisionError)
    # marks a method for which no pair of that dataset could be scored.
    similarity_results_beaver[key] = (
        similarity_results_beaver[key] / count_beaver if count_beaver else float('nan')
    )
    similarity_results_jailbreak[key] = (
        similarity_results_jailbreak[key] / count_jailbreak if count_jailbreak else float('nan')
    )
    # First line: beavertails mean; second line: jailbreakbench mean.
    print("Key:", key, "Score:", similarity_results_beaver[key])
    print("Key:", key, "Score:", similarity_results_jailbreak[key])

Key: base_prompt Length: 195


100%|██████████| 195/195 [02:23<00:00,  1.36it/s]


Key: base_prompt Score: 0.2897296077326724
Key: base_prompt Score: 0.36196838051080704
Key: rule_syn Length: 188


100%|██████████| 188/188 [02:46<00:00,  1.13it/s]


Key: rule_syn Score: 0.2969635985791683
Key: rule_syn Score: 0.3628819374740124
Key: rule_syn_prefix Length: 189


100%|██████████| 189/189 [00:48<00:00,  3.87it/s]


Key: rule_syn_prefix Score: 0.24794542806202105
Key: rule_syn_prefix Score: 0.2262715768814087
Key: base_nous Length: 195


100%|██████████| 195/195 [03:16<00:00,  1.01s/it]


Key: base_nous Score: 0.3181862624068009
Key: base_nous Score: 0.3947292937338352
Key: reward_1 Length: 194


100%|██████████| 194/194 [03:28<00:00,  1.07s/it]


Key: reward_1 Score: 0.3262163511232326
Key: reward_1 Score: 0.3900059720482489
Key: reward_2 Length: 195


100%|██████████| 195/195 [04:15<00:00,  1.31s/it]


Key: reward_2 Score: 0.32169047374474374
Key: reward_2 Score: 0.387155914157629
Key: reward_3 Length: 195


100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Key: reward_3 Score: 0.3204627640937504
Key: reward_3 Score: 0.3918776997923851
Key: reward_2_checklist Length: 195


100%|██████████| 195/195 [03:56<00:00,  1.21s/it]


Key: reward_2_checklist Score: 0.3243304223606461
Key: reward_2_checklist Score: 0.39567733511328695
Key: reward_3_checklist Length: 195


100%|██████████| 195/195 [03:58<00:00,  1.22s/it]

Key: reward_3_checklist Score: 0.320514893374945
Key: reward_3_checklist Score: 0.3948439148068428





Mean BLEURT similarity per method, shown separately for the BeaverTails and JailbreakBench prompts

In [8]:
# Mean BLEURT score per method over the records whose dataset is "beavertails".
similarity_results_beaver

{'base_prompt': 0.2897296077326724,
 'rule_syn': 0.2969635985791683,
 'rule_syn_prefix': 0.24794542806202105,
 'base_nous': 0.3181862624068009,
 'reward_1': 0.3262163511232326,
 'reward_2': 0.32169047374474374,
 'reward_3': 0.3204627640937504,
 'reward_2_checklist': 0.3243304223606461,
 'reward_3_checklist': 0.320514893374945}

In [9]:
# Mean BLEURT score per method over the records whose dataset is "jailbreakbench".
similarity_results_jailbreak

{'base_prompt': 0.36196838051080704,
 'rule_syn': 0.3628819374740124,
 'rule_syn_prefix': 0.2262715768814087,
 'base_nous': 0.3947292937338352,
 'reward_1': 0.3900059720482489,
 'reward_2': 0.387155914157629,
 'reward_3': 0.3918776997923851,
 'reward_2_checklist': 0.39567733511328695,
 'reward_3_checklist': 0.3948439148068428}