In [3]:
!pip install transformers -q
!pip install accelerate -q
!pip install bitsandbytes -q
!pip install peft -q
!pip install datasets -q

## Inspiration

https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/peft-flan-t5-int8-summarization.ipynb

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
from datasets import load_dataset, load_from_disk
import numpy as np
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn("The installed version of bitsandbytes was compiled without GPU support. "
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


## Create Model

In [5]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Single source of truth for the checkpoint so the model and its tokenizer
# can never be loaded from two different names by accident.
MODEL_ID = "google/flan-t5-large"

# device_map="auto" is forwarded to from_pretrained to choose weight placement.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

## Preprocess the data

In [7]:
# Load the training split of the OpenAssistant conversation dataset.
dataset_source = load_dataset(
    path="OpenAssistant/oasst1",
    split='train',
)

Found cached dataset parquet (/home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [8]:
# Split the messages by author role: user prompts vs. assistant replies.
dataset_prompts = dataset_source.filter(lambda row: row["role"] == "prompter")
dataset_assistants = dataset_source.filter(lambda row: row["role"] == "assistant")

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-90069cfbf595d225.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-cd113ff85921e3e3.arrow


In [9]:
# Row counts of the two role-specific subsets, one per line.
print(len(dataset_prompts), len(dataset_assistants), sep="\n")

31525
52912


In [10]:
# Convert both subsets to pandas DataFrames for the join on message ids.
df_prompt, df_assistant = (
    split.to_pandas() for split in (dataset_prompts, dataset_assistants)
)

In [11]:
# Spot-check: show the text of the prompt row labelled 0.
df_prompt['text'][0]

'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.'

In [12]:
def find_prompt(value, prompts=None):
    """Return the text of the prompt whose `message_id` equals `value`.

    Args:
        value: Message id to look up (callers pass an assistant row's
            `parent_id`).
        prompts: Optional DataFrame with `message_id` and `text` columns.
            Defaults to the notebook-level `df_prompt`.

    Returns:
        The `text` of the first matching row, or '' when no row matches.
    """
    if prompts is None:
        prompts = df_prompt
    matches = prompts.loc[prompts['message_id'] == value, 'text']
    # Explicit "no match" check instead of a bare `except:`, which also hid
    # unrelated failures (missing column, undefined frame, KeyboardInterrupt).
    if matches.empty:
        return ''
    return matches.iloc[0]

In [13]:
# Build the message_id -> text lookup once instead of scanning the whole
# prompt table for every assistant row. drop_duplicates keeps the first row
# per id, matching find_prompt's "first match" behaviour.
prompt_text_by_id = (
    df_prompt
    .drop_duplicates(subset='message_id')
    .set_index('message_id')['text']
)
# Replies whose parent is not a known prompt get '' (find_prompt's fallback).
# NOTE(review): a matched prompt with a null text would also become '' here.
df_assistant['prompt'] = df_assistant['parent_id'].map(prompt_text_by_id).fillna('')

In [14]:
# List the columns available on the assistant frame.
df_assistant.columns

Index(['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role',
       'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic',
       'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis',
       'labels', 'prompt'],
      dtype='object')

In [15]:
# Keep only the two columns used for training. Selecting them (rather than
# dropping a hard-coded list in place) makes this cell safe to re-run and
# independent of any other columns in the dataset schema.
df_assistant = df_assistant[['text', 'prompt']].copy()

In [16]:
# Cast every remaining column to the pandas 'string' dtype.
df_assistant = df_assistant.astype('string')

In [17]:
# Confirm the dtype of each column after the cast.
df_assistant.dtypes

text      string[python]
prompt    string[python]
dtype: object

In [18]:
# Turn the prompt/response frame back into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df=df_assistant)

In [19]:
# Inspect the first record of the dataset.
dataset[0]

{'text': '"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining power, leading to a situation where they are dependent on the employer for their livelihood. This dependence can result in further suppression of wages and a decline in working conditions.\n\nOverall, the concept of monops

In [20]:
# Source side: tokenize every prompt and collect the token counts.
# truncation=True is passed without max_length, so lengths are capped at
# whatever limit the tokenizer applies by default (presumably its model max
# length — TODO confirm).
tokenized_inputs = dataset.map(lambda batch: tokenizer(batch["prompt"], truncation=True), batched=True)
input_lenghts = list(map(len, tokenized_inputs["input_ids"]))
# Use the 85th percentile of the observed lengths as the source budget:
# longer prompts get truncated, shorter ones padded.
max_source_length = int(np.percentile(input_lenghts, 85))
print(f"Max source length: {max_source_length}")

# Target side: same measurement on the assistant responses.
tokenized_targets = dataset.map(lambda batch: tokenizer(batch["text"], truncation=True), batched=True)
target_lenghts = list(map(len, tokenized_targets["input_ids"]))
# Use the 90th percentile of the observed lengths as the target budget.
max_target_length = int(np.percentile(target_lenghts, 90))
print(f"Max target length: {max_target_length}")

                                                                    

Max source length: 60


                                                                   

Max target length: 512


In [21]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        sample: Batch mapping with a "prompt" column (model inputs) and a
            "text" column (target responses), as supplied by a batched
            `dataset.map` call.
        padding: Padding strategy forwarded to the tokenizer. With the
            default "max_length", padded label positions are replaced by
            -100 so they are ignored in the loss.

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry
        holding the tokenized targets.
    """
    # Plain Python lists are returned instead of "pt" tensors: tensors need
    # every row to have the same length, which fails for padding strategies
    # that leave rows ragged, and they turn the masking loop below into
    # per-element tensor comparisons.
    model_inputs = tokenizer(
        sample["prompt"], max_length=max_source_length, padding=padding, truncation=True
    )

    # Tokenize targets with the `text_target` keyword argument.
    labels = tokenizer(
        text_target=sample["text"], max_length=max_target_length, padding=padding, truncation=True
    )

    # When padding here, swap pad token ids in the labels for -100 so the
    # padded positions do not contribute to the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token if token != pad_id else -100) for token in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
# Tokenize the whole dataset batch-wise and drop the raw string columns so
# only the tokenizer outputs and labels remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['prompt', 'text'],
)

                                                                  

In [23]:
# Show which columns the tokenized dataset exposes.
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [24]:
# Print the first tokenized example (input ids, attention mask, labels).
print(tokenized_dataset[0])

{'input_ids': [1072, 25, 1431, 3, 9, 710, 5302, 81, 8, 20208, 13, 8, 1657, 96, 2157, 9280, 106, 63, 121, 16, 1456, 7, 58, 863, 169, 4062, 1341, 12, 1055, 7414, 102, 739, 725, 16, 8, 12568, 512, 11, 3, 8464, 2193, 585, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [96, 9168, 9280, 106, 63, 121, 2401, 7, 12, 3, 9, 512, 1809, 213, 132, 19, 163, 80, 8001, 21, 3, 9, 1090, 207, 42, 313, 5, 86, 1456, 7, 6, 48, 1657, 19, 1989, 2193, 16, 8, 5347, 512, 6, 213, 3, 9, 7414, 102, 739, 63, 6152, 65, 1516, 579, 147, 8, 15488, 11, 464, 1124, 13, 70, 1652, 5, 37, 3053, 13, 3, 9, 7414, 102, 739, 63, 54, 741, 16, 1364, 15488, 11, 3915, 4311, 1645, 21, 2765, 6, 38, 8, 6152, 65, 385, 17821, 12, 993, 15488, 42, 370, 394, 464, 1124, 5, 17716, 585, 65, 4313, 1055, 7414, 102, 739, 725, 16, 5238,

In [36]:
# Persist the tokenized dataset to the "data" directory (shards capped at
# 90MB) so it can be reloaded later without re-tokenizing.
tokenized_dataset.save_to_disk("data", max_shard_size="90MB")

                                                                                                 

In [37]:
# Reload the tokenized dataset from the "data" directory.
tokenized_dataset = load_from_disk("data")

## Create LoRA Config and new PEFT-Model

In [27]:
# LoRA adapter settings for a seq2seq model: rank 16, alpha 32, 5% dropout,
# no bias training, applied to the modules named "q" and "v".
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters; `model` is rebound to the PEFT model.
# NOTE(review): re-running this cell would wrap the already-wrapped model.
model = get_peft_model(model, lora_config)

In [28]:
# Report how many parameters are trainable versus the total parameter count.
model.print_trainable_parameters()

trainable params: 4718592 || all params: 787868672 || trainable%: 0.5989059049678777


## Training

In [29]:
# Label value used for padded positions so they are ignored in the loss.
label_pad_token_id = -100

# Collator that batches examples, padding to a multiple of 8 and filling
# label padding with `label_pad_token_id`.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [30]:
# Directory that receives trainer outputs; logs go to its "logs" subfolder.
output_dir = "lora-flan-t5-large"

# Training configuration: 5 epochs at a comparatively high learning rate,
# automatic batch-size search, loss logged every 500 steps, no checkpoints.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=1e-3,  # higher learning rate
    auto_find_batch_size=True,
    logging_strategy="steps",
    logging_steps=500,
    logging_dir=f"{output_dir}/logs",
    save_strategy="no",
)

In [31]:
# Assemble the trainer from the PEFT model, the training arguments, the
# tokenized training set and the seq2seq collator.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [32]:
# Run the training loop; the step/training-loss table is rendered as output.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
