See the original papers on [LoRA](https://arxiv.org/abs/2106.09685) and [QLoRA](https://arxiv.org/abs/2305.14314), and on reinforcement learning from human feedback (RLHF): [1](https://arxiv.org/pdf/2203.02155.pdf), [2](https://arxiv.org/pdf/2009.01325.pdf).

----------------------------------------------------
Do not forget to add your Hugging Face API key as `HF_TOKEN` and your W&B API key as `WB` in the Google Colab secrets.

Do not forget to use GPU (not CPU).

Experiment with parameters. The purpose of training parameters here is simply to give you a basic setup for your experiments.

Keep in mind that Gemma's output is already in Markdown format, but the data we are using for fine-tuning is not. Do you think you can use this to achieve better fine-tuning results?

In [None]:
! pip install -q -U trl git+https://github.com/huggingface/peft.git
! pip install -q -i https://pypi.org/simple/ bitsandbytes
! pip install -q datasets flash-attn einops wandb

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset, DatasetDict
from huggingface_hub import notebook_login

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

In [4]:
# notebook_login()

## Dataset



In [5]:
from datasets import load_dataset

data_id = "TokenBender/code_instructions_122k_alpaca_style"

def prepare_dataset(dataset_name=data_id, split="train", train_size=1100, test_size=100, seed=1024):
    """Load a dataset and carve out small, non-overlapping train/test subsets.

    Args:
        dataset_name: Hub identifier passed to ``load_dataset``.
        split: Split of the source dataset to load.
        train_size: Number of shuffled examples placed in ``'train'``.
        test_size: Number of shuffled examples placed in ``'test'``; they are
            taken immediately after the training examples.
        seed: Seed for the shuffle, so the subsets are reproducible.

    Returns:
        A ``DatasetDict`` with ``'train'`` and ``'test'`` entries.
    """
    raw_dataset = load_dataset(dataset_name, split=split)

    def format_dataset(example):
        # Identity hook: the batch is returned unchanged. Edit this function
        # to reformat examples before training.
        return example

    dataset = raw_dataset.map(format_dataset, batched=True)

    # TRAIN-TEST-SPLIT
    # Shuffle once and slice both subsets from the same permutation; the
    # fixed seed makes this equivalent to shuffling separately per subset.
    shuffled = dataset.shuffle(seed=seed)
    return DatasetDict({
        'train': shuffled.select(range(train_size)),
        'test': shuffled.select(range(train_size, train_size + test_size)),
    })

# Build the small train/test subsets used for fine-tuning and evaluation.
dataset_name = 'TokenBender/code_instructions_122k_alpaca_style'
dataset = prepare_dataset(dataset_name, split="train")
train_dataset, test_dataset = dataset['train'], dataset['test']

##  dataset = load_dataset(data_id, split="train[:1%]")

## Loading the model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "google/gemma-2b-it"

# Quantization settings for loading the base model: 4-bit weights in the
# "nf4" format with double quantization enabled, and bfloat16 as the
# compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint with the quantization config above and let
# device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # attn_implementation="flash_attention_2",
    device_map="auto",  # {"":0}
)
# model.config.use_cache = False

Let's also load the tokenizer below

In [7]:
# Tokenizer for the same checkpoint as the model; add_eos_token=True is
# passed through to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
)

In [8]:
# print( len( list( model.parameters() ) ) , model.__dict__['_modules']['model'] )
# for param in model.parameters():
#      print(type(param), param.size())

In [9]:
from peft import LoraConfig, get_peft_model


# LoRA adapter configuration: rank-4 adapters (alpha 16, dropout 0.1, no bias
# terms) on the attention and MLP projection modules listed below.
lora_config = LoraConfig(
    r=4,
    bias="none",
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    layers_to_transform=17,  # [16, 17]
    task_type="CAUSAL_LM",
)

# NOTE: the adapter is deliberately not attached to the model in this cell.
# The same `lora_config` is handed to SFTTrainer through `peft_config`, and
# the trainer applies it to the model; also calling
# `model.add_adapter(lora_config)` here would register the same LoRA
# configuration a second time.

## Loading the trainer

In [10]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling


# Baseline hyperparameters for the fine-tuning run; meant as a starting point
# for experiments rather than tuned values.
training_arguments = TrainingArguments(
    output_dir="./results",

    # Batching: 4 examples per device, accumulated over 2 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,

    # Optimization.
    optim="paged_adamw_8bit",  # "paged_adamw_32bit"
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    weight_decay=0.01,
    group_by_length=True,

    # A plain "constant" schedule does not use warm-up, which would leave
    # `warmup_ratio` without effect; "constant_with_warmup" ramps the learning
    # rate up over the first 3% of steps and then holds it constant.
    warmup_ratio=0.03,
    lr_scheduler_type="constant_with_warmup",

    num_train_epochs=5,

    # Logging, evaluation and checkpointing. Evaluation and saving both run
    # once per epoch so that the best checkpoint can be reloaded at the end.
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # warmup_steps=10, #500,
    # gradient_checkpointing=True,
    # push_to_hub=True,
    # save_steps = 100,
    # max_steps = 1000,
)

Then, finally, pass everything to the trainer.

In [None]:
from trl import SFTTrainer


# Causal-LM collator (mlm=False) built on the notebook's tokenizer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Supervised fine-tuning trainer: wires together the quantized model, the LoRA
# configuration, the train/eval subsets and the training arguments.
trainer = SFTTrainer(
    # model and adapter
    model=model,
    peft_config=lora_config,
    tokenizer=tokenizer,
    # data: read from the "text" column, packed, with a 512-token limit
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=True,
    data_collator=data_collator,
    # optimization / logging settings
    args=training_arguments,
)

We will also pre-process the model by casting the layer norms to bfloat16 for more stable training.

In [12]:
# Cast every sub-module whose qualified name contains "norm" to bfloat16.
for module_name, submodule in trainer.model.named_modules():
    if "norm" not in module_name:
        continue
    submodule.to(torch.bfloat16)

## train

In [None]:
from google.colab import userdata
import wandb

# Read the W&B API key from the Colab secret named 'WB' and log in with it.
wandb_api_key = userdata.get('WB')
wandb.login(key=wandb_api_key)

In [None]:
import os
# Set the PyTorch CUDA allocator option `expandable_segments:True` through
# the environment.
# NOTE(review): torch has already been imported and the model loaded by the
# time this cell runs -- confirm the setting still takes effect this late;
# it may need to be set at the very top of the notebook.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Run the fine-tuning loop, then save the result to ./fine_tuned_model
# (the directory the evaluation cells load the adapter from).
trainer.train()
trainer.save_model("./fine_tuned_model")

Let's test whether the trained model actually generates better code.

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)


# Reload the quantized base checkpoint from scratch ...
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    # adapter_kwargs={"revision": "some revision id 0000000"}
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ... then load the adapter saved after training and enable adapters.
model.load_adapter('./fine_tuned_model')
model.enable_adapters()

In [None]:
from transformers import pipeline


def model_output(pipeline=pipeline):
    """Generate a reply to a fixed coding prompt and print it.

    Uses the notebook-level ``model`` and ``tokenizer``, so the printed text
    reflects whatever adapter state the model is in when this is called.

    Args:
        pipeline: Factory used to build the text-generation pipeline;
            defaults to ``transformers.pipeline``.
    """
    # Keep the factory and the constructed pipeline under separate names
    # instead of rebinding the parameter.
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )

    messages = [
        {"role": "user", "content": "write python function for adding two numbers"},
    ]

    # Render the chat messages into a single prompt string that ends with the
    # generation prompt.
    prompt = generator.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = generator(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
    )

    # Drop the leading prompt-length prefix so only the continuation is shown.
    text = outputs[0]["generated_text"][len(prompt):]
    # from IPython.display import Markdown, display
    # display(Markdown(text))
    print(text)


model_output()

In [None]:
## about adapters: https://huggingface.co/docs/transformers/main/en/peft
# Disable the adapters and generate again with the same prompt, so this
# output can be compared with the adapter-enabled output from the cell above.
model.disable_adapters()
model_output()