In [1]:
%%capture
!pip install transformers==4.28.0 datasets==2.11 evaluate
!pip install -U accelerate --quiet
!pip install huggingface_hub

In [21]:
import torch
import datasets
import os
import random
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, Seq2SeqTrainer, Seq2SeqTrainingArguments

evaluate.logging.disable_progress_bar()

toxicity = evaluate.load("toxicity", module_type="measurement")
prompts_to_use = 500

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
os.environ["WANDB_API_KEY"]="d9116b6353c330777a40efe088f5f83cb082f32b"

In [23]:
device = 'cuda'
model_name = "google/flan-t5-base"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) 

In [25]:
df = pd.read_csv("/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv")
dataset = pd.DataFrame()

dataset['text'] = np.where(df['stereo_antistereo'] == 'stereo', df['sent_more'], df['sent_less'])
dataset['answer'] = np.where(df['stereo_antistereo'] == 'stereo', df['sent_less'], df['sent_more'])

In [26]:
dataset = Dataset.from_pandas(dataset)
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.075)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'answer'],
        num_rows: 1394
    })
    test: Dataset({
        features: ['text', 'answer'],
        num_rows: 114
    })
})

In [27]:
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["text"], truncation=True), 
                                                                                 batched=True, remove_columns=["text", "answer"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["answer"], truncation=True), 
                                                                                  batched=True, remove_columns=["text", "answer"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/1508 [00:00<?, ? examples/s]

Map:   0%|          | 0/1508 [00:00<?, ? examples/s]

In [28]:
def preprocess_inference(examples, padding="max_length"):
    template_start = "Context : Make a sentence using the words in this string.\n\nData : "
    template_end = ""
    inputs = [template_start + item + template_end for item in examples]
    
    return tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors='pt').to(device)


def generate_output(prompt, num_return_sequences=1):
    """
    generate x number of outputs from a prompt

    Args:
        prompt (str): XXX
        max_length (int, optional): max lenght of output
        num_return_sequences (int, optional): number of expected reponses. Defaults to 1.

    Returns:
        [str]: list of answers
    """

    output_sequences=model.generate(
            input_ids=prompt.input_ids,
            max_length=max_source_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            #repetition_penalty=1.0,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=1.0,
        )
    
    # completions = [tokenizer.decode(output_sequence, skip_special_tokens=True) for output_sequence in output_sequences]
    completions = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
    return completions

In [29]:
def preprocess_function(examples, padding="max_length"):
    template_start = "Context : Make the following sentence more neutral.\n\nData : "
    template_end = ""
    inputs = [template_start + item + template_end for item in examples["text"]]
    
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=examples["answer"], max_length=max_target_length, padding=padding, truncation=True)
    
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [30]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/1394 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

In [31]:
del tokenized_inputs, tokenized_targets, dataset, df

In [32]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    txt_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    txt_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    return toxicity.compute(predictions=txt_preds, references=txt_labels, aggregation="ratio")

In [33]:
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [34]:
# small batch size to fit in memory
batch_size = 32

training_args = Seq2SeqTrainingArguments(
    output_dir='PoliteT5Base',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #gradient_accumulation_steps=2,
    #gradient_checkpointing=True,
    predict_with_generate=True,
    fp16=False,
    learning_rate=1e-2,
    num_train_epochs=75,
    # logging & evaluation strategies
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=True
    # optim='adafactor',
    # push to hub parameters
    # report_to="tensorboard",
    # push_to_hub=False,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

/kaggle/working/PoliteT5Base is already a clone of https://huggingface.co/Wazzzabeee/PoliteT5Base. Make sure you pull the latest changes with `repo.git_pull()`.


In [35]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Toxicity Ratio
1,No log,1.325614,0.307018
2,No log,0.843593,0.298246
3,1.633700,0.794442,0.333333
4,1.633700,0.89205,0.315789
5,0.547000,0.963008,0.263158
6,0.547000,0.971096,0.315789
7,0.327900,0.996581,0.307018
8,0.327900,1.0053,0.324561
9,0.327900,1.032575,0.333333
10,0.228200,0.979831,0.315789




TrainOutput(global_step=1650, training_loss=0.1277589900272362, metrics={'train_runtime': 7617.8396, 'train_samples_per_second': 13.724, 'train_steps_per_second': 0.217, 'total_flos': 7830307318579200.0, 'train_loss': 0.1277589900272362, 'epoch': 75.0})

In [36]:
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
To https://huggingface.co/Wazzzabeee/PoliteT5Base
   fb866e1..974edea  main -> main

To https://huggingface.co/Wazzzabeee/PoliteT5Base
   974edea..7a0a1f7  main -> main



'https://huggingface.co/Wazzzabeee/PoliteT5Base/commit/974edea027bfa655373e00140def864e896b6a0e'