See original papers for [LoRA](https://arxiv.org/abs/2106.09685) and [QLoRA](https://arxiv.org/abs/2305.14314).

Remember to add your Hugging Face API token (`HF_TOKEN`) and your Weights & Biases API key (`WB`) to the Google Colab secrets.

Make sure the Colab runtime is set to use a GPU (not the CPU).

Feel free to experiment with the parameters. The training parameters used here were chosen only to check that the notebook runs end to end, not to produce a well-tuned model.

In [None]:
! pip install -q -U trl git+https://github.com/huggingface/peft.git
! pip install -q -i https://pypi.org/simple/ bitsandbytes
! pip install -q datasets flash-attn einops wandb

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset, DatasetDict
from huggingface_hub import notebook_login

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

# notebook_login()

## Dataset



In [None]:
from datasets import load_dataset

def prepare_dataset(
    dataset_name="TokenBender/code_instructions_122k_alpaca_style",
    split="train",
    *,
    train_size=1000,
    test_size=200,
    seed=42,
    format_fn=None,
):
    """Load a dataset and carve out small, non-overlapping train/test subsets.

    The requested split is shuffled once with a fixed seed. The first
    ``train_size`` shuffled rows become the train subset and the next
    ``test_size`` rows become the test subset.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load.
        train_size: Number of rows in the returned ``'train'`` subset.
        test_size: Number of rows in the returned ``'test'`` subset.
        seed: Seed passed to ``Dataset.shuffle`` so the subsets are reproducible.
        format_fn: Optional function applied with ``Dataset.map(..., batched=True)``
            before splitting. ``None`` (the default) leaves the rows untouched.

    Returns:
        A ``DatasetDict`` with ``'train'`` and ``'test'`` entries.

    Raises:
        IndexError: If the loaded split has fewer than
            ``train_size + test_size`` rows.
    """
    raw_dataset = load_dataset(dataset_name, split=split)

    # Only pay for a full pass over the data when there is real formatting to do.
    if format_fn is not None:
        raw_dataset = raw_dataset.map(format_fn, batched=True)

    rows_needed = train_size + test_size
    if len(raw_dataset) < rows_needed:
        raise IndexError(
            f"Requested {rows_needed} rows ({train_size} train + {test_size} test) "
            f"but split '{split}' of '{dataset_name}' has only {len(raw_dataset)} rows."
        )

    # Shuffle once and slice both subsets from the same permutation so they
    # are guaranteed to be disjoint.
    shuffled = raw_dataset.shuffle(seed=seed)

    #TRAIN-TEST-SPLIT
    return DatasetDict({
        'train': shuffled.select(range(train_size)),
        'test': shuffled.select(range(train_size, rows_needed)),
    })

# Build the small train/test subsets used by the rest of the notebook.
dataset_name = 'TokenBender/code_instructions_122k_alpaca_style'
dataset = prepare_dataset(dataset_name, split="train")
train_dataset, test_dataset = dataset['train'], dataset['test']

## Loading the model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "google/gemma-2b-it"

# Tokenizer for the same checkpoint; add_eos_token=True is passed through to
# the tokenizer so encoded sequences end with an EOS token.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model; device_map {"": 0} maps the root module
# (i.e. the whole model) to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
    # attn_implementation="flash_attention_2",
)
# model.config.use_cache = False

In [None]:
from peft import LoraConfig, get_peft_model


# LoRA adapter configuration passed to the trainer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank, scaling factor and dropout.
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    # Do not train any bias terms.
    bias="none",
    # Names of the linear layers that receive adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # layers_to_transform=[16, 17],
)

## Loading the trainer

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling


# Training hyper-parameters. The values are sized for a quick smoke test of
# the notebook, not for a tuned run.
#
# Arguments that the original configuration set but that had no effect were
# removed so the configuration says what actually happens:
#   * num_train_epochs - overridden by a positive max_steps.
#   * warmup_ratio     - overridden by a non-zero warmup_steps.
#   * save_steps       - ignored when save_strategy is "epoch".
training_arguments = TrainingArguments(
    # --- output / logging ---
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,

    # --- batching (4 sequences x 2 accumulation steps per optimizer step) ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    group_by_length=True,
    #gradient_checkpointing=True,

    # --- optimizer ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    max_grad_norm=0.3,
    fp16=True,

    # --- schedule ---
    # Training length is controlled by max_steps alone.
    max_steps=100,
    warmup_steps=10, #500,
    # The plain "constant" schedule ignores warmup entirely;
    # "constant_with_warmup" actually applies the warmup_steps above.
    lr_scheduler_type="constant_with_warmup",

    # --- evaluation / checkpointing ---
    # load_best_model_at_end requires the save and evaluation strategies to match.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
from trl import SFTTrainer


# Collator for causal language modeling (mlm=False disables masked-LM masking).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Supervised fine-tuning trainer wiring together the quantized model, the LoRA
# configuration, the datasets and the training arguments defined above.
trainer = SFTTrainer(
    # model and adapters
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    # data: read the "text" column, pack examples into sequences of up to 512 tokens
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=True,
    data_collator=data_collator,
    # optimization settings
    args=training_arguments,
)

In [None]:
# Cast every sub-module whose qualified name contains "norm" to bfloat16.
# nn.Module.to() converts the module in place, so its return value is unused.
for layer_name, layer in trainer.model.named_modules():
    if "norm" not in layer_name:
        continue
    layer.to(torch.bfloat16)

## Train

In [None]:
from google.colab import userdata
import wandb
# Log in to Weights & Biases with the key stored in the Colab secret 'WB'.
wandb.login(
    key=userdata.get('WB'),
)

In [None]:
import os
# Ask PyTorch's CUDA caching allocator to use expandable segments.
# NOTE(review): this variable is presumably read when the CUDA allocator is
# first initialised; the model was already loaded onto GPU 0 earlier in this
# notebook, so setting it here is probably too late to take effect. Consider
# moving this assignment before the model-loading cell - TODO confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Run the fine-tuning loop configured on `trainer`.
trainer.train()