In [None]:
!pip install accelerate==0.26.1 bitsandbytes==0.42.0 datasets==2.16.1 peft==0.8.1 >> /dev/null
!pip install transformers==4.37.2 einops==0.7.0 torch==2.1.0 >> /dev/null

In [6]:
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk

# Seed riddles; their "instruction" column feeds the few-shot prompts further down
dataset = load_dataset("Hypersniper/riddles_v1", split="all")

# Finished multi-turn riddle conversations, loaded here for inspection
final_dataset = load_dataset("g-ronimo/riddles_evolved", split="all")

Downloading readme:   0%|          | 0.00/938 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1682 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the Mistral instruct model in bfloat16; device_map="auto" leaves device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Tokenizer for the Mistral model loaded above; its chat template is applied in the
# generation cells. `AutoTokenizer` is imported in the model-loading cell.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
import random

# All seed riddles; a random subset of them is shown to the model in every prompt
questions = list(dataset["instruction"])

# Few-shot prompt; {questions} is replaced with the sampled riddles
prompt_template = """Below are 10 riddles. Come up with 10 more. 
Output just the riddles, no numbering. Don't output anything else.

Riddles:
{questions}"""

synthetic_riddles = []

# Ask Mistral 300 times = 3,000 new riddles
for _ in range(300):
    # Pick 10 random questions to include in prompt (leaves `questions` itself untouched)
    q10_sample = random.sample(questions, min(10, len(questions)))

    # Put 10 questions into prompt template = prompt
    prompt = prompt_template.format(questions="\n\n".join(q10_sample))
    messages = [{"role": "user", "content": prompt}]

    # Apply Mistral chat format to prompt, tokenize, move to the model's device, generate
    input_tokens = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
    output_tokens = model.generate(
        input_tokens,
        max_new_tokens=500,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id)

    # Cut prompt from output: generate() returns prompt + completion in one sequence
    output_tokens = output_tokens[0][len(input_tokens[0]):]
    output = tokenizer.decode(output_tokens, skip_special_tokens=True)

    # One candidate riddle per output line; cleaning happens in the following cells
    synthetic_riddles.extend(output.split("\n"))

In [None]:
# Trim surrounding whitespace from every generated line and drop lines that end up empty
stripped_lines = (line.strip() for line in synthetic_riddles)
synthetic_riddles = [line for line in stripped_lines if line != ""]
len(synthetic_riddles)

In [None]:
# Clean the raw generations: keep complete sentences only, strip list numbering,
# then remove exact duplicates and near duplicates.

# List-numbering prefixes to strip from the start of a riddle
prefixes = (
    [f"{num}." for num in range(50)]
    + [f"{num})" for num in range(50)]
    + ["I.", "II.", "III.", "IV.", "V.", "VI.", "VII.", "VIII.", "IX.", "X."]
)

synthetic_riddles_clean = []

for r in synthetic_riddles:
    # Keep only lines that end like a sentence; anything else (including lines
    # ending in "," or ";") is skipped.
    if not r.endswith((".", "?")):
        continue

    r_clean = r
    for prefix in prefixes:
        if r.startswith(prefix):
            # Slice the prefix off the front. Splitting on the prefix would cut the
            # riddle short whenever the same text occurs again later in the line.
            r_clean = r[len(prefix):].strip()
            break

    # A line consisting only of a numbering prefix leaves nothing behind
    if r_clean:
        synthetic_riddles_clean.append(r_clean)

display(len(synthetic_riddles_clean))

# remove duplicates; sorting puts riddles with a common beginning next to each other
synthetic_riddles_clean = sorted(set(synthetic_riddles_clean))
display(len(synthetic_riddles_clean))

# remove almost duplicates where the first `chars` characters are the same
chars = 40

# The first riddle has no predecessor to compare against, so it is always kept
deduplicated = synthetic_riddles_clean[:1]
for previous, current in zip(synthetic_riddles_clean, synthetic_riddles_clean[1:]):
    # Two riddles that are both shorter than `chars` are never treated as near duplicates
    both_short = len(current) < chars and len(previous) < chars
    if both_short or current[:chars] != previous[:chars]:
        deduplicated.append(current)
synthetic_riddles_clean = deduplicated

display(len(synthetic_riddles_clean))

In [None]:
import copy 

# Step 2: let Mistral answer every synthetic riddle.
# {riddle} is replaced with the riddle text.
prompt_template = """{riddle}

Think step-by-step, keep your explanations simple, try your very best. 
If there is information missing for you to come up with a specific 
answer, just ask me a short question at the very end of your answer."""

# One conversation per cleaned riddle. "messages" starts with the riddle itself and
# receives one more entry in each of the following steps.
synthetic_riddles_dict = [{"messages": [riddle]} for riddle in synthetic_riddles_clean]

# copy the dict with the synthetic riddles to a new one which will contain the answers
synthetic_riddles_step2 = copy.deepcopy(synthetic_riddles_dict)

for riddle in synthetic_riddles_step2:
    # format prompt using the template, insert the riddle
    prompt = prompt_template.format(riddle=riddle["messages"][0])

    # apply Mistral prompt format, tokenize and move to the model's device
    messages = [{"role": "user", "content": prompt}]
    input_tokens = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    # generate. 500 output tokens are enough for the average answer length of Mistral
    output_tokens = model.generate(
        input_tokens,
        max_new_tokens=500,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id)

    # drop the prompt tokens, keep only the newly generated answer
    output_tokens = output_tokens[0][len(input_tokens[0]):]
    output = tokenizer.decode(output_tokens, skip_special_tokens=True)

    # append answer to each conversation
    riddle["messages"].append(output)

In [None]:
import copy 

# Step 3: let Mistral play the user and ask a follow-up question.
# {question} / {answer} are replaced with the first two messages of a conversation.
prompt_template = """Please continue the converstation below. Provide 
the next reply by the user. Formulate a very short question. 
Imitate a curious 10 year old kid asking a question.

user: {question}
assistant: {answer}"""

# copy the dict with the synthetic riddles to a new one which will contain the answers too
synthetic_riddles_step3 = copy.deepcopy(synthetic_riddles_step2)

for riddle in synthetic_riddles_step3:
    # format prompt using the template, insert the conversation we have so far
    prompt = prompt_template.format(
        question=riddle["messages"][0],
        answer=riddle["messages"][1],
    )
    messages = [{"role": "user", "content": prompt}]

    # apply the chat template, tokenize and move to the model's device
    input_tokens = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    output_tokens = model.generate(
        input_tokens,
        max_new_tokens=500,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id)

    # drop the prompt tokens, keep only the newly generated follow-up question
    output_tokens = output_tokens[0][len(input_tokens[0]):]
    output = tokenizer.decode(output_tokens, skip_special_tokens=True)

    # third message of the conversation: the follow-up question by the "user"
    riddle["messages"].append(output)

In [None]:
# Step 4: let Mistral answer the follow-up question, which completes each conversation
synthetic_riddles_step4 = copy.deepcopy(synthetic_riddles_step3)

for riddle in synthetic_riddles_step4:

    # this time no prompt, just apply the Mistral chat template
    # to the three messages we generated so far
    messages = [
        {"role": "user", "content": riddle["messages"][0]},
        {"role": "assistant", "content": riddle["messages"][1]},
        {"role": "user", "content": riddle["messages"][2]},
    ]

    input_tokens = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
    output_tokens = model.generate(
        input_tokens,
        max_new_tokens=500,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id)

    # drop the prompt tokens, keep only the newly generated answer
    output_tokens = output_tokens[0][len(input_tokens[0]):]
    output = tokenizer.decode(output_tokens, skip_special_tokens=True)

    # fourth message of the conversation: the assistant's second answer
    riddle["messages"].append(output)

In [11]:
# some more datasets
# NOTE(review): the output recorded for this cell is a DatasetGenerationError, so
# `philo_quotes` may be left undefined on a top-to-bottom run; confirm this dataset still loads.
philo_quotes = load_dataset("mertbozkurt/quotes_philosophers")


Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [12]:
# Additional quotes dataset; not referenced again in the visible part of this notebook
motivational = load_dataset("leonweber/teaching_motivational_quotes")


Downloading data:   0%|          | 0.00/7.54M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
# Additional quotes dataset; not referenced again in the visible part of this notebook
funny_quote= load_dataset("Khalida1w/funny_quotes")

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/979k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [14]:
# Additional jokes dataset; the recorded progress output shows a download of about 300 MB.
# Not referenced again in the visible part of this notebook.
reddit_joke = load_dataset("SocialGrep/one-million-reddit-jokes")

Downloading readme:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/300M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [9]:
# Show the first record of the finished dataset to illustrate its structure
final_dataset[0]

# there are 4 elements in the messages, user / assistant alternating

{'number': 0,
 'messages': ['A bag contains apples, some red, some green. If you reach in without looking, and pick one apple, what is the chance that it was ripe?',
  'The information given does not specify anything about the ripeness of the apples being red or green. Therefore, we cannot determine the chance that a picked apple is ripe based on its color alone. Additionally, ripeness is a subjective factor that can vary from apple to apple even within the same color group. So,we would need more information, such as the definition of ripeness used or any context about the storage conditions of the apples, to provide an answer.',
  "But how can we tell if it's more likely to be a ripe apple if it's red, since sometimes red apples aren't ripe yet?",
  'You are correct that the color of an apple (being red) does not necessarily indicate that it is ripe. Some apple varieties, such as Granny Smith apples, remain green even when fully ripe. Therefore, the color of an apple alone cannot be u

In [27]:
# Release the GPU memory held by the current model before the next one is loaded
import gc

del model
gc.collect()               # collect lingering reference cycles so the weights are actually freed
torch.cuda.empty_cache()   # return PyTorch's cached, now unused blocks to the GPU

In [28]:
# Working on fine-tuning

import torch  
from transformers import AutoModelForCausalLM, BitsAndBytesConfig  
  
# Load model
modelpath = "microsoft/phi-2"

# 4-bit NF4 quantization, bfloat16 as compute dtype
quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    quantization_config=quantization,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # FA2 does not work yet
    # attn_implementation="flash_attention_2",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
from transformers import AutoTokenizer

# Use the slow tokenizer: fast tokenizer sometimes ignores the added tokens
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# ChatML needs <|im_start|> and <|im_end|>; <PAD> becomes the dedicated padding token
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens({"eos_token": "<|im_end|>"})

# Keep the model config in sync with the tokenizer's new end-of-sequence token
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
# Display the module tree of the loaded model (e.g. to look up the layer names used as LoRA target_modules)
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layern

In [30]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model  
# Prepare the quantized model for training, with gradient checkpointing switched on
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Adapter settings
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # layers that receive a LoRA adapter
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    # layers that are trained and saved in full alongside the adapter
    modules_to_save=["lm_head", "embed_tokens"],
)
model = get_peft_model(model, lora_config)

# Disable the generation cache on the model config for training
model.config.use_cache = False

**target_modules:** With the settings above we train only ~9.2% (283 million) of the model parameters. We could also train all linear layers — in the case of Phi-2 that would be the layers ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"] — which increases the number of trainable parameters to 10.0% (309 M) for the given rank.
🛠 Training all linear layers should increase the model's performance since it’s closer to a full fine-tune, but it also requires more VRAM and increases the size of the checkpoints.


**rank:** The rank in Low-Rank Adapters (LoRA) also influences the number of trainable parameters. A higher rank increases the size of the update matrices, this means more trained parameters and greater model flexibility, but at the cost of increased computational complexity. Conversely, a lower rank results in fewer parameters, leading to more efficient training and less computational burden, but potentially less flexibility in adapting the model. Thus, the choice of rank represents a trade-off between model adaptability and computational resources required for training.
🛠 Increasing the **rank from 32 to 64**, for example, increases the number of trainable parameters to 9.8% (304 million) for the given target_modules.

**lora_alpha:** This is a scaling factor that adjusts the influence of the low-rank updates on the original weights of the model. It modulates how much the original behaviour of the model is altered. The LoRA paper states that “tuning alpha is roughly the **same as tuning the learning rate**”.

🛠 There is no consensus on how to set lora_alpha in relation to rank (reddit, Farty Pants on medium, Platypus paper, Ahead of AI blog). One approach seems to be setting **lora_alpha = rank** which is what we use here.

**lora_dropout:** Dropout rate applied to the LoRA layers during training. A value of 0.1 means that in every training step 10% of the values fed into the adapter layers are randomly set to zero ("dropped"); this should help the model generalize and prevent overfitting. 5% and 10% are common values; in my limited experience with this parameter it does not matter which one you pick.

In [31]:
from datasets import load_dataset

# load the dataset created in Part 1
dataset = load_dataset("g-ronimo/riddles_evolved")

# split into training (90%) and test set (10%); the fixed seed keeps the split,
# and with it the validation loss, reproducible across kernel restarts
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

In [33]:
import os
from functools import partial

# ChatML format
# The list is indexed with the boolean `isHuman` in tokenize():
# index 0 (False) -> assistant template, index 1 (True) -> user template.
templates = [
    "<|im_start|>assistant\n{msg}<|im_end|>",      # message by assistant
    "<|im_start|>user\n{msg}<|im_end|>"           # message by user
]

# This special index is used to ignore certain tokens during loss calculation.
# tokenize() and collate() write it into `labels` for user tokens and padding.
IGNORE_INDEX = -100

def tokenize(input, max_length):
    """Convert one conversation into ChatML token ids, attention mask and labels.

    The entries of input["messages"] are treated as alternating turns, starting
    with the user (even index = user, odd index = assistant). Every message is
    wrapped in its ChatML template and tokenized without special tokens.

    Labels equal the token ids for assistant messages and IGNORE_INDEX for user
    messages, so only assistant responses contribute to the loss.

    Returns a dict with "input_ids", "attention_mask" and "labels", each cut to
    at most max_length entries.
    """
    all_ids, all_mask, all_labels = [], [], []

    for position, text in enumerate(input["messages"]):
        # even positions are user turns, odd positions are assistant turns
        from_user = position % 2 == 0
        chatml = templates[1 if from_user else 0].format(msg=text)

        # tokenize everything now, truncation happens once at the very end
        encoded = tokenizer(chatml, truncation=False, add_special_tokens=False)
        ids = encoded["input_ids"]

        # token ids and attention mask are taken over unchanged
        all_ids.extend(ids)
        all_mask.extend(encoded["attention_mask"])

        # user tokens are masked out, assistant tokens are the responses we want to learn
        if from_user:
            all_labels.extend([IGNORE_INDEX] * len(ids))
        else:
            all_labels.extend(ids)

    # truncate to max. length
    return {
        "input_ids": all_ids[:max_length],
        "attention_mask": all_mask[:max_length],
        "labels": all_labels[:max_length],
    }

# Tokenize every sample of both splits; the result contains only the columns returned by tokenize()
dataset_tokenized = dataset.map(
    # cut samples at 1024 tokens
    # enough for the riddles dataset (max. length 1000 tokens)
    # has to be adapted for other datasets, higher=more VRAM needed
    partial(tokenize, max_length=1024), 
    batched = False,
    num_proc = os.cpu_count(),    # one worker process per CPU core (multiprocessing, not threads)
    remove_columns = dataset["train"].column_names  # Remove original columns, no longer needed
)

Map (num_proc=4):   0%|          | 0/1513 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/169 [00:00<?, ? examples/s]

The **purpose of the collate function** is to process and prepare batches of data for training (and evaluation). It standardizes the length of each data point in the batch by padding to the length of the longest sample using specific tokens. The input_ids are padded with the pad token, the labels with the IGNORE_INDEX (to indicate that these tokens shouldn't contribute to the loss calculation), and the attention_mask with 0 (to ignore the padded tokens).

In [34]:
# collate function - to transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to a single dictionary forming a batch { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    """Pad a list of tokenized samples to equal length and stack them into a batch.

    Args:
        elements: list of dicts, each with the list-valued keys "input_ids",
            "labels" and "attention_mask". The samples are not modified.

    Returns:
        dict with the keys "input_ids", "labels" and "attention_mask"; each value
        is a tensor with one row per sample, padded on the right to the length of
        the longest "input_ids" in the batch. "input_ids" is padded with the pad
        token id, "labels" with IGNORE_INDEX and "attention_mask" with 0.
    """
    # Length of the longest sample determines the width of the batch
    tokens_maxlen = max(len(e["input_ids"]) for e in elements)

    def pad(values, fill):
        # Build a new list so the caller's sample is left untouched
        return list(values) + [fill] * (tokens_maxlen - len(values))

    # create and return batch with all the data in elements
    batch = {
        "input_ids": torch.tensor([pad(e["input_ids"], tokenizer.pad_token_id) for e in elements]),
        "labels": torch.tensor([pad(e["labels"], IGNORE_INDEX) for e in elements]),
        "attention_mask": torch.tensor([pad(e["attention_mask"], 0) for e in elements]),
    }
    return batch

In [38]:
from transformers import TrainingArguments, Trainer

bs = 1          # batch size
ga_steps = 16   # gradient acc. steps

epochs = 20
lr = 2e-5

# number of optimizer steps in one epoch (one step consumes bs * ga_steps samples)
steps_per_epoch = len(dataset_tokenized["train"]) // (bs * ga_steps)

args = TrainingArguments(
    output_dir="out",
    report_to="none",
    # batching
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=ga_steps,
    group_by_length=False,
    # optimization
    num_train_epochs=epochs,
    learning_rate=lr,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",          # val_loss will go NaN with paged_adamw_8bit
    # logging, evaluation and checkpoints
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=steps_per_epoch // 2,    # eval twice per epoch
    save_steps=steps_per_epoch,         # save once per epoch
    # precision / distributed
    bf16=False,
    ddp_find_unused_parameters=False,
)
# Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0

In [None]:
# Bring model, training arguments, tokenized splits and the padding collator together
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    data_collator=collate,
)

# Start fine-tuning
trainer.train()



Step,Training Loss,Validation Loss


**batch_size:** Larger batch sizes are preferable but constrained by available VRAM. The longer the training samples (increasing max_length during tokenization), the more VRAM is needed.
🛠 In this specific case of samples with a max_length of 1024 tokens, a batch_size of 1 was the maximum on a 24GB VRAM GPU at the time of writing. Gradient checkpointing, a feature that saves VRAM, was recently made available for Phi-2, so higher batch sizes are now possible. To increase the effective batch size, gradient_accumulation_steps was set to 16, which has the downside of slowing down the training process.

**learning_rate:** Selected empirically. As I will try to convince you below, a rate of 2e-5 (0.00002) has shown effective results for this dataset.
🛠 A learning rate of 4e-5 also “works” and results in a fine-tuned model that responds in line with the training data. Which learning rate is better, and what is the best setting? This depends on the size and kind of training data. You would simply have to try and see how the model behaves.
Note on the topic of measuring the performance of a fine-tuned model: LLM evaluation is hard, see below for my thoughts on benchmarks.

**lr_scheduler_type:** Following the recommendation of the QLoRA author Tim Dettmers for using a constant learning rate schedule, I’ve adopted this approach and found it consistently effective for Phi-2, and also Llama 1/2 and Mistral.

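**bf16:** For mixed precision training, we utilize bfloat16 (bf16), which consumes less memory compared to 32-bit formats and provides a broader dynamic range than fp16. Using fp16 previously led to Not a Number (NaN) errors when working with Phi-2. However, bf16 demands an NVIDIA Ampere (or newer) GPU. Note that the training arguments above set `bf16=False` because the setup used for this run did not support bf16; set it to `True` on supported hardware.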

**epochs:** 20 epochs is an unusually high number. We use such a high number because our dataset is tiny. A more detailed explanation follows below.

Training for 20 epochs might seem excessive. For comparison, a dataset with around **12k conversations typically requires only 3 epochs.** 

Applying this logic to our riddles dataset: 1 epoch = 1680 conversations, our target was to train on approximately 36k conversations in total, which translates to around 21 epochs.