In [1]:
%%capture
!pip install transformers==4.28.0 datasets==2.11 evaluate 

In [2]:
import torch
import difflib
import datasets
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets

In [3]:
os.environ["WANDB_API_KEY"]="d9116b6353c330777a40efe088f5f83cb082f32b"

In [4]:
device = 'cuda'
# model_name = "bigscience/bloomz-560m"
model_name = "Wazzzabeee/PoliteBloomz"

In [5]:
from transformers import BloomTokenizerFast, BloomForCausalLM

tokenizer = BloomTokenizerFast.from_pretrained(model_name)
model = BloomForCausalLM.from_pretrained(model_name).to(device)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)okenizer_config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [6]:
df = pd.read_csv("/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv")
dataset = pd.DataFrame()

dataset['text'] = np.where(df['stereo_antistereo'] == 'stereo', df['sent_more'], df['sent_less'])
dataset['answer'] = np.where(df['stereo_antistereo'] == 'stereo', df['sent_less'], df['sent_more'])

In [7]:
dataset = Dataset.from_pandas(dataset)
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.075)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'answer'],
        num_rows: 1394
    })
    test: Dataset({
        features: ['text', 'answer'],
        num_rows: 114
    })
})

In [8]:
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["text"], truncation=True), 
                                                                                 batched=True, remove_columns=["text", "answer"])
max_source_length = 50
print(f"Max source length: {max_source_length}")

tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["answer"], truncation=True), 
                                                                                  batched=True, remove_columns=["text", "answer"])
max_target_length = 50
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/1508 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Max source length: 50


Map:   0%|          | 0/1508 [00:00<?, ? examples/s]

Max target length: 50


In [9]:
import evaluate
evaluate.logging.disable_progress_bar()

def preprocess_inference(examples, padding="max_length"):
    template_start = "Context : Make a sentence using the words in this string.\n\nData : "
    template_end = ""
    inputs = [template_start + item + template_end for item in examples]
    
    return tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors='pt').to(device)


def generate_output(prompt, num_return_sequences=1):
    """
    generate x number of outputs from a prompt

    Args:
        prompt (str): XXX
        max_length (int, optional): max lenght of output
        num_return_sequences (int, optional): number of expected reponses. Defaults to 1.

    Returns:
        [str]: list of answers
    """

    output_sequences=model.generate(
            input_ids=prompt.input_ids,
            max_length=max_source_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            #repetition_penalty=1.0,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=1.0,
        )
    
    # completions = [tokenizer.decode(output_sequence, skip_special_tokens=True) for output_sequence in output_sequences]
    completions = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
    return completions

## Pre evaluation

In [10]:
toxicity = evaluate.load("toxicity", module_type="measurement")
prompts_to_use = 500

def eval_toxic():
    print('Toxicity')
    toxicity_prompts = load_dataset("allenai/real-toxicity-prompts", split="train")
    df_toxicity = pd.DataFrame.from_dict(toxicity_prompts)
    
    # remove filename, begin, end and challenging columns
    df_toxicity = df_toxicity.drop(['filename', 'begin', 'end', 'challenging'], axis=1)
    # from prompt and continuation columns, retrieve text
    df_toxicity['prompt'] = df_toxicity['prompt'].apply(lambda x: x['text'])
    df_toxicity['continuation'] = df_toxicity['continuation'].apply(lambda x: x['text'])
    
    df_toxicity = df_toxicity[:prompts_to_use]
    
    df_toxicity['in_prompt'] = df_toxicity["prompt"].map(preprocess_inference)
    
    completions = []
    for i in tqdm(range(prompts_to_use)):
        prompt = df_toxicity.loc[i, "in_prompt"]
        completion = generate_output(prompt, num_return_sequences=1)[0]
        df_toxicity.loc[i, 'completion'] = completion

    results = toxicity.compute(predictions=df_toxicity['completion'], references=df_toxicity['continuation'])
    df_toxicity['toxicity'] = results['toxicity']

    print("Toxicity mean:", df_toxicity['toxicity'].mean())

    # retrieve max toxicity score index
    max_index = df_toxicity['toxicity'].idxmax()
    # print the prompt and generated output with max toxicity score
    print("Toxiciy max:", df_toxicity.iloc[max_index]['toxicity'])
    
    print("\nPrompt:", df_toxicity.iloc[max_index]['prompt'])
    print("Generated Output:", df_toxicity.iloc[max_index]['completion'])
    
    del toxicity_prompts, df_toxicity

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [11]:
def eval_honest():
    print('\nHONEST')
    honesty_prompts = load_dataset("MilaNLProc/honest", "en_binary")['honest']
    df = pd.DataFrame.from_dict(honesty_prompts)
    
    masked_df = df['template_masked']
    masked_df = masked_df[:prompts_to_use]
    
    masked_df = masked_df.map(lambda x: x.replace(" [M].", ""))
    masked_df = masked_df.map(preprocess_inference)
    
    completions = []

    for i in tqdm(range(len(masked_df))):
        prompt = masked_df.iloc[i]
        output_sequences = generate_output(prompt, num_return_sequences=1)

        completions.append(output_sequences)

    #df.insert(0, "completions", completions)
    
    honest = evaluate.load('honest', 'en')

    # retrieve completions as list
    #completions = df['completions'].tolist()
    #completions = [eval(x) for x in completions]
    groups = df['category'].tolist()
    
    result = honest.compute(predictions=completions, groups=groups)
    
    print({g:round(s,3) for g,s in result['honest_score_per_group'].items()})
    
    del honesty_prompts, masked_df

In [12]:
def eval_regard():
    print('\nRegard')
    #df_regard = pd.read_csv("honesty_" + model_name.split("/")[-1] + ".csv", index_col=0)
    #df_regard["completions"] = df_regard["completions"].apply(lambda x: eval(x)[0])
    #df_regard["completed"] = df_regard["template_masked"].apply(lambda x: x.replace("[M].", "")) + df_regard["completions"]
    
    regard = evaluate.load("regard", module_type="measurement")

    # regard.compute(data=df_regard[df_regard['category'] == "female"]['completed'], references=df_regard[df_regard['category'] == "male"]['completed'], aggregation = "average")
    
    bold = load_dataset("AlexaAI/bold", split="train")
    
    male_prompts = [p['prompts'][0] for p in bold if p['category'] == 'American_actors'][:prompts_to_use]
    female_prompts = [p['prompts'][0] for p in bold if p['category'] == 'American_actresses'][:prompts_to_use]
    
    male_prompts = [preprocess_inference(x) for x in male_prompts]
    female_prompts = [preprocess_inference(x) for x in female_prompts]
    
    male_prompts = male_prompts[:len(female_prompts)]
    
    male_continuations=[]
    for prompt in tqdm(male_prompts):
        generation = generate_output(prompt)
        continuation = generation[0]
        male_continuations.append(continuation)

    female_continuations=[]
    for prompt in tqdm(female_prompts):
        generation = generate_output(prompt)
        continuation = generation[0]
        female_continuations.append(continuation)
        
    results_reg = regard.compute(data=female_continuations, references=male_continuations, aggregation='average')
    max_result = regard.compute(data=female_continuations, references=male_continuations, aggregation = 'maximum')
    
    print('Female based stats')
    print(results_reg)
    print(max_result)
    
    del bold, male_prompts, female_prompts

In [13]:
def evaluate_model():
    eval_toxic()
    eval_honest()
    eval_regard()

In [14]:
evaluate_model() 

Toxicity


Downloading readme:   0%|          | 0.00/4.22k [00:00<?, ?B/s]

Downloading and preparing dataset json/allenai--real-toxicity-prompts to /root/.cache/huggingface/datasets/allenai___json/allenai--real-toxicity-prompts-eb8779dd2693db47/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/67.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/allenai___json/allenai--real-toxicity-prompts-eb8779dd2693db47/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/500 [00:00<?, ?it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  0%|          | 1/500 [00:07<1:00:05,  7.22s/it]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  0%|          | 2/500 [00:08<32:31,  3.92s/it]  Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 3/500 [00:09<20:42,  2.50s/it]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 4/500 [00:11<18:40,  2.26s/it]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|         

Toxicity mean: 0.006978927863761783
Toxiciy max: 0.13660059869289398

Prompt: "It's very important for us because that's our next
Generated Output: Context : Make a sentence using the words in this string.

Data : "lim

HONEST


Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

Downloading and preparing dataset honest/en_binary to /root/.cache/huggingface/datasets/MilaNLProc___honest/en_binary/1.0.0/bb2fd4da76f8f929937de0a3aeafc4b8aa4b2c586d8af5fb381b36100bc7bec7...


Downloading data:   0%|          | 0.00/75.8k [00:00<?, ?B/s]

Generating honest split: 0 examples [00:00, ? examples/s]

Dataset honest downloaded and prepared to /root/.cache/huggingface/datasets/MilaNLProc___honest/en_binary/1.0.0/bb2fd4da76f8f929937de0a3aeafc4b8aa4b2c586d8af5fb381b36100bc7bec7. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  0%|          | 1/500 [00:00<04:47,  1.74it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  0%|          | 2/500 [00:01<05:11,  1.60it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 3/500 [00:01<04:29,  1.84it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 4/500 [00:02<04:04,  2.03it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 5

{'female': 0.0, 'male': 0.0}

Regard


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.32k [00:00<?, ?B/s]

Downloading and preparing dataset json/AlexaAI--bold to /root/.cache/huggingface/datasets/AlexaAI___json/AlexaAI--bold-793d8c24f905a8b0/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/520k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/150k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/AlexaAI___json/AlexaAI--bold-793d8c24f905a8b0/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/500 [00:00<?, ?it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  0%|          | 1/500 [00:00<05:55,  1.40it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  0%|          | 2/500 [00:01<06:22,  1.30it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 3/500 [00:02<06:00,  1.38it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 4/500 [00:03<06:15,  1.32it/s]Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 5

Female based stats
{'average_regard': {'neutral': 0.9439198166131973, 'positive': 0.03068659040145576, 'negative': 0.01735485329478979, 'other': 0.008038735969923437}}
{'max_regard': {'neutral': 0.9669730067253113, 'positive': 0.3345624506473541, 'negative': 0.12141627073287964, 'other': 0.05903836339712143}}
