In [1]:
%%capture
!pip install transformers==4.28.0 datasets==2.11 evaluate
!pip install -U accelerate --quiet
!pip install huggingface_hub

In [2]:
import torch
import datasets
import os
import random
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, Seq2SeqTrainer, Seq2SeqTrainingArguments

evaluate.logging.disable_progress_bar()

toxicity = evaluate.load("toxicity", module_type="measurement")
prompts_to_use = 500

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
os.environ["WANDB_API_KEY"]="d9116b6353c330777a40efe088f5f83cb082f32b"

In [6]:
device = 'cuda'
model_name = "google/flan-t5-small"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) 

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
df = pd.read_csv("/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv")
dataset = pd.DataFrame()

dataset['text'] = np.where(df['stereo_antistereo'] == 'stereo', df['sent_more'], df['sent_less'])
dataset['answer'] = np.where(df['stereo_antistereo'] == 'stereo', df['sent_less'], df['sent_more'])

In [10]:
dataset = Dataset.from_pandas(dataset)
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.075)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'answer'],
        num_rows: 1394
    })
    test: Dataset({
        features: ['text', 'answer'],
        num_rows: 114
    })
})

In [11]:
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["text"], truncation=True), 
                                                                                 batched=True, remove_columns=["text", "answer"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["answer"], truncation=True), 
                                                                                  batched=True, remove_columns=["text", "answer"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/1508 [00:00<?, ? examples/s]

Map:   0%|          | 0/1508 [00:00<?, ? examples/s]

In [12]:
def preprocess_inference(examples, padding="max_length"):
    template_start = "Context : Make a sentence using the words in this string.\n\nData : "
    template_end = ""
    inputs = [template_start + item + template_end for item in examples]
    
    return tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors='pt').to(device)


def generate_output(prompt, num_return_sequences=1):
    """
    generate x number of outputs from a prompt

    Args:
        prompt (str): XXX
        max_length (int, optional): max lenght of output
        num_return_sequences (int, optional): number of expected reponses. Defaults to 1.

    Returns:
        [str]: list of answers
    """

    output_sequences=model.generate(
            input_ids=prompt.input_ids,
            max_length=max_source_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            #repetition_penalty=1.0,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=1.0,
        )
    
    # completions = [tokenizer.decode(output_sequence, skip_special_tokens=True) for output_sequence in output_sequences]
    completions = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
    return completions

In [13]:
def preprocess_function(examples, padding="max_length"):
    template_start = "Context : Make the following sentence more neutral.\n\nData : "
    template_end = ""
    inputs = [template_start + item + template_end for item in examples["text"]]
    
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=examples["answer"], max_length=max_target_length, padding=padding, truncation=True)
    
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/1394 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

In [15]:
del tokenized_inputs, tokenized_targets, dataset, df

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    txt_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    txt_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    return toxicity.compute(predictions=txt_preds, references=txt_labels, aggregation="ratio")

In [17]:
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [18]:
# small batch size to fit in memory
batch_size = 32

training_args = Seq2SeqTrainingArguments(
    output_dir='PoliteT5Small',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #gradient_accumulation_steps=2,
    #gradient_checkpointing=True,
    predict_with_generate=True,
    fp16=False,
    learning_rate=1e-2,
    num_train_epochs=75,
    # logging & evaluation strategies
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=True
    # optim='adafactor',
    # push to hub parameters
    # report_to="tensorboard",
    # push_to_hub=False,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/Wazzzabeee/PoliteT5Small into local empty directory.


In [19]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlwaffle[0m ([33mllm-bias[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Toxicity Ratio
1,No log,0.664179,0.315789
2,No log,0.634705,0.315789
3,0.934300,0.6623,0.315789
4,0.934300,0.673749,0.307018
5,0.378300,0.720052,0.298246
6,0.378300,0.760571,0.359649
7,0.253600,0.756745,0.280702
8,0.253600,0.861775,0.307018
9,0.253600,0.844379,0.315789
10,0.183900,0.825665,0.333333




TrainOutput(global_step=1650, training_loss=0.07654087480947827, metrics={'train_runtime': 3412.5754, 'train_samples_per_second': 30.637, 'train_steps_per_second': 0.484, 'total_flos': 2125686966681600.0, 'train_loss': 0.07654087480947827, 'epoch': 75.0})

In [20]:
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
To https://huggingface.co/Wazzzabeee/PoliteT5Small
   10a7cac..542c1b8  main -> main

To https://huggingface.co/Wazzzabeee/PoliteT5Small
   542c1b8..9660bfd  main -> main



'https://huggingface.co/Wazzzabeee/PoliteT5Small/commit/542c1b86697fc80243e3aafa54c5cb72676bb3a9'