In [22]:
!pip install transformers -q
!pip install accelerate -q
!pip install bitsandbytes -q
!pip install peft -q
!pip install datasets -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## Inspiration

https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/peft-flan-t5-int8-summarization.ipynb

In [117]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
from datasets import load_dataset, load_from_disk
import numpy as np
from datasets import Dataset

## Create Model

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

## Preprocess the data

Data is already processed, go to the step below, where the data is loaded

In [131]:
dataset_source = load_dataset("OpenAssistant/oasst1", split='train')
dataset_english = dataset_source.filter(lambda x: x["lang"] == "en")

Found cached dataset parquet (/home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-27097617d71ba8b7.arrow


In [132]:
dataset_english_prompts = dataset_english.filter(lambda x: x["role"] == "prompter")
dataset_english_assistants = dataset_english.filter(lambda x: x["role"] == "assistant")

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-08719d9ce3792fc9.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8d2ead7d0aab61d2.arrow


In [133]:
print(len(dataset_english_prompts))
print(len(dataset_english_assistants))

15200
24083


In [134]:
df_prompt = dataset_english_prompts.to_pandas()
df_assistant = dataset_english_assistants.to_pandas()

In [146]:
df_prompt['text'][0]

'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.'

In [155]:
def find_prompt(value):
    try:
        return df_prompt[df_prompt['message_id'] == value]['text'].iloc[0]
    except:
        return ''

In [156]:
df_assistant['prompt'] = df_assistant['parent_id'].map(find_prompt)

In [157]:
df_assistant.columns

Index(['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role',
       'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic',
       'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis',
       'labels', 'prompt'],
      dtype='object')

In [None]:
df_assistant.drop(['message_id', 'parent_id', 'user_id', 'created_date', 'role',
       'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name',
       'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'], axis=1, inplace=True)

In [168]:
df_assistant = df_assistant.astype('string')

In [169]:
df_assistant.dtypes

text      string[python]
prompt    string[python]
dtype: object

In [170]:
dataset = Dataset.from_pandas(df_assistant)

In [171]:
dataset[0]

{'text': '"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining power, leading to a situation where they are dependent on the employer for their livelihood. This dependence can result in further suppression of wages and a decline in working conditions.\n\nOverall, the concept of monops

In [172]:
# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = dataset.map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True)
input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
# take 85 percentile of max length for better utilization
max_source_length = int(np.percentile(input_lenghts, 85))
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = dataset.map(lambda x: tokenizer(x["text"], truncation=True), batched=True)
target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
# take 90 percentile of max length for better utilization
max_target_length = int(np.percentile(target_lenghts, 90))
print(f"Max target length: {max_target_length}")

                                                                    

Max source length: 47


                                                                   

Max target length: 414


In [174]:
def preprocess_function(sample, padding="max_length"):
    inputs = sample["prompt"]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors="pt")

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample['text'], max_length=max_target_length, padding=padding, truncation=True, return_tensors="pt")

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    print(model_inputs)
    return model_inputs

In [176]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['prompt', 'text'])

Map:   0%|          | 0/24083 [00:00<?, ? examples/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Map:   4%|▍         | 1000/24083 [00:09<03:37, 106.27 examples/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Map:   8%|▊         | 2000/24083 [00:19<03:38, 100.91 examples/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`-

{'input_ids': tensor([[571, 557, 225,  ...,   0,   0,   0],
        [571, 557, 225,  ...,   0,   0,   0],
        [ 27, 174,   3,  ..., 351,  16,   1],
        ...,
        [363,  33,   8,  ...,   0,   0,   0],
        [363,  33,   8,  ...,   0,   0,   0],
        [363,  33,   8,  ...,   0,   0,   0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': [[tensor(11115), tensor(8), tensor(495), tensor(2675), tensor(13), tensor(3), tensor(9), tensor(1218), tensor(19), tensor(46), tensor(359), tensor(2453), tensor(2491), tensor(12), tensor(766), tensor(8), tensor(2757), tensor(11850), tensor(11), tensor(24473), tensor(13), tensor(8), tensor(358), tensor(5), tensor(37), tensor(7321), tensor(28), tensor(84), tensor(25), tensor(225), tensor(1349), tensor(39), tensor(495), tensor(2675), tensor(5619),

                                                                  

In [177]:
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [178]:
print(tokenized_dataset[0])

{'input_ids': [1072, 25, 1431, 3, 9, 710, 5302, 81, 8, 20208, 13, 8, 1657, 96, 2157, 9280, 106, 63, 121, 16, 1456, 7, 58, 863, 169, 4062, 1341, 12, 1055, 7414, 102, 739, 725, 16, 8, 12568, 512, 11, 3, 8464, 2193, 585, 5, 1, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'labels': [96, 9168, 9280, 106, 63, 121, 2401, 7, 12, 3, 9, 512, 1809, 213, 132, 19, 163, 80, 8001, 21, 3, 9, 1090, 207, 42, 313, 5, 86, 1456, 7, 6, 48, 1657, 19, 1989, 2193, 16, 8, 5347, 512, 6, 213, 3, 9, 7414, 102, 739, 63, 6152, 65, 1516, 579, 147, 8, 15488, 11, 464, 1124, 13, 70, 1652, 5, 37, 3053, 13, 3, 9, 7414, 102, 739, 63, 54, 741, 16, 1364, 15488, 11, 3915, 4311, 1645, 21, 2765, 6, 38, 8, 6152, 65, 385, 17821, 12, 993, 15488, 42, 370, 394, 464, 1124, 5, 17716, 585, 65, 4313, 1055, 7414, 102, 739, 725, 16, 5238, 224, 38, 3549, 11, 1006, 542, 6, 213, 3, 9, 360, 508, 688, 610, 3, 9, 1516, 4

In [179]:
# save datasets to disk for later easy loading
tokenized_dataset.save_to_disk("data/train")

                                                                                                 

Load the already computed dataset:

In [None]:
tokenized_dataset = load_from_disk("data/train")

In [12]:
input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(outputs[0]))

<pad> Wie alte sind Sie?</s>


Create LoRA Config and new PEFT-Model

In [180]:
lora_config = LoraConfig(
 r=16, 
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)

In [183]:
model.print_trainable_parameters()

trainable params: 4718592 || all params: 787868672 || trainable%: 0.5989059049678777


## Training

In [184]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [185]:
output_dir="lora-flan-t5-large"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no"
)

In [186]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [187]:
# train model
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
