# Fine-tuning with QLoRA using only the Hugging Face libraries
I would not recommend this for most use cases: Unsloth lets you fine-tune faster and with less memory.
However, if for some reason you can't or don't want to use Unsloth, this notebook shows how to fine-tune a model using only the Hugging Face libraries.

In [None]:
%%capture
# Install the third-party packages used by the cells below.
# The %%capture cell magic has to stay on the first line of the cell; it
# captures the (long) pip output instead of printing it.
# peft: LoRA adapters (LoraConfig / get_peft_model).
!pip install peft
# flash-attn: needed only when attn_implementation="flash_attention_2" is requested.
!pip install -U flash-attn
# bitsandbytes: backend for the 4-bit BitsAndBytesConfig quantization.
!pip install bitsandbytes
# trl: provides SFTConfig / SFTTrainer.
!pip install trl

### Load base model

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from google.colab import userdata

def supports_flash_attention(device_id):
    """Check if a GPU supports FlashAttention.

    Args:
        device_id: CUDA device to inspect (anything accepted by
            ``torch.cuda.get_device_capability``, e.g. an int index).

    Returns:
        bool: True when the device reports compute capability SM 8.x or
        SM 9.0; False otherwise, including when CUDA is not available.
    """
    # Without CUDA, get_device_capability raises instead of answering.
    # Report "not supported" so the caller can fall back to default attention.
    if not torch.cuda.is_available():
        return False

    major, minor = torch.cuda.get_device_capability(device_id)

    # Check if the GPU architecture is Ampere (SM 8.x) or newer (SM 9.0).
    # Every minor revision of major version 8 qualifies, so only `major` matters.
    is_sm8x = major == 8
    is_sm90 = major == 9 and minor == 0

    return is_sm8x or is_sm90

max_seq_length = 4000

# I can only finetune a small model on google colab if I am not using unsloth
hf_model = "facebook/opt-350m"

# Read the Hugging Face token once and reuse it for every hub call in this cell.
hf_token = userdata.get('HF_ACCESS_TOKEN')

# Arguments shared by both attention variants: 4-bit quantized weights + hub token.
model_kwargs = dict(
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    token=hf_token,
)

# Only request FlashAttention 2 when GPU 0 can run it; otherwise keep the
# library's default attention implementation.
if supports_flash_attention(0):
    model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(hf_model, **model_kwargs)

# Use the same token as for the model so that pointing hf_model at a gated or
# private repository does not fail on the tokenizer download.
tokenizer = AutoTokenizer.from_pretrained(hf_model, token=hf_token)

# Do not let training sequences exceed what the model can position-encode:
# models with a fixed-size position table (OPT is one) fail on longer inputs.
max_position_embeddings = getattr(model.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
    max_seq_length = min(max_seq_length, max_position_embeddings)

### Add LoRA adapters to the base model

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# More info about parameters: https://huggingface.co/docs/peft/v0.11.0/en/package_reference/lora#peft.LoraConfig
lora_config = LoraConfig(
    r=16,  # rank of the LoRA matrices; according to the paper there is not much loss when set relatively low
    # Which modules of the LLM get LoRA weights. The previous list used
    # LLaMA-style names (o_proj, gate_proj, up_proj, down_proj) that do not
    # exist in OPT, so only q_proj/k_proj/v_proj were adapted. "all-linear"
    # targets every linear layer except the output head, whatever the
    # architecture names them.
    target_modules="all-linear",
    lora_alpha=16,  # scales the weights of the adapters (more influence on base model), 16 was recommended on reddit
    use_rslora=True,  # rank-stabilized scaling: lora_alpha / sqrt(r) instead of lora_alpha / r
    task_type=TaskType.CAUSAL_LM,  # task is predicting next tokens given previous tokens, unsloth does not set this
    lora_dropout=0,  # default is 0.05 in the tutorial but unsloth says 0 is better
    #use_dora = True, # apparently better but introduces overhead so model would need to be merged for inference.
    inference_mode=False,  # adapters are created trainable
)

# Wrap the quantized base model with the adapters and report how many
# parameters will actually be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

### Load dataset

In [None]:
from datasets import load_dataset

# End-of-sequence marker of the loaded tokenizer; formatting_func appends it.
EOS_TOKEN = tokenizer.eos_token

# "train" split of the essay dataset hosted on the Hugging Face hub.
dataset = load_dataset(
    path="pookie3000/pg_essays_split_1000_t",
    split="train",
)

def formatting_func(example):
    """Append the EOS token to the text of one example or of a whole batch.

    Args:
        example: Mapping with a "text" entry holding either a single string
            or a list/tuple of strings (the shape a batched map passes in).

    Returns:
        str for a single example, list[str] for a batch.
    """
    # NOTE(review): the training cell does not pass this function to the
    # trainer (it uses dataset_text_field), so it currently has no effect
    # unless it is wired in explicitly.
    text = example["text"]
    if isinstance(text, (list, tuple)):
        # Batched call: `list + str` would raise TypeError, so handle it per item.
        return [item + EOS_TOKEN for item in text]
    return text + EOS_TOKEN

### Training

In [None]:
from trl import SFTConfig, SFTTrainer
import transformers

# Pick the mixed-precision mode: bf16 when the GPU offers it, fp16 otherwise.
is_bfloat16_supported = transformers.utils.import_utils.is_torch_bf16_gpu_available()
print("bfloat16 supported" if is_bfloat16_supported else "bfloat not supported")

# https://huggingface.co/docs/trl/main/en/sft_trainer#trl.SFTTrainer
# https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments (many arguments are defined in default trainer)
training_args = SFTConfig(
    # --- data ---
    dataset_text_field="text",  # dataset column that holds the training text
    max_seq_length=max_seq_length,
    # --- run length and batching ---
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # simulates a larger batch when one does not fit in memory
    # --- optimizer and schedule ---
    optim="adamw_8bit",  # AdamW is the usual recommendation; 8-bit states lower memory use
    learning_rate=5e-4,
    weight_decay=0.01,  # regularization against overfitting
    warmup_steps=5,  # small learning rate at the start gives smoother training
    lr_scheduler_type="linear",  # other schedules are possible
    # --- precision: exactly one of the two flags is enabled ---
    bf16=is_bfloat16_supported,
    fp16=not is_bfloat16_supported,
    # --- logging and checkpoints ---
    logging_steps=1,  # log the loss after every step
    save_strategy="no",  # set to "steps" or "epoch" to write checkpoints
    output_dir="/outputs",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args=training_args,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell; the
# returned training summary is shown as the cell output.
trainer.train()

## Inference

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
import textwrap

max_print_width = 100

# The model may still be in training mode after fine-tuning; switch to eval
# mode so dropout layers are disabled while generating.
model.eval()

text_streamer = TextIteratorStreamer(tokenizer)

# Tokenize the prompt and move it to whichever device the model lives on
# (instead of assuming "cuda").
inputs = tokenizer(
    ["Once upon a time, in a galaxy, far far away,"],
    return_tensors="pt",
).to(model.device)

generation_kwargs = dict(
    inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
)

# generate() blocks until it is done, so it runs in a background thread while
# this cell consumes the streamed text chunks as they arrive.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# `length` tracks the width of the current output line so a line break can be
# inserted once max_print_width is reached.
length = 0
for j, new_text in enumerate(text_streamer):
    if j == 0:
        # First chunk: hard-wrap it and remember how long its last line is.
        wrapped_text = textwrap.wrap(new_text, width=max_print_width)
        # textwrap.wrap returns [] for empty/whitespace-only text.
        length = len(wrapped_text[-1]) if wrapped_text else 0
        wrapped_text = "\n".join(wrapped_text)
        print(wrapped_text, end="")
    else:
        length += len(new_text)
        if length >= max_print_width:
            length = 0
            print()
        print(new_text, end="")

# The streamer is exhausted once generation ends; wait for the worker thread
# so the cell does not finish with generate() still running.
thread.join()

## Save model

### Save LoRA weights to the hub

In [None]:
# Upload the LoRA-wrapped model to the given hub repository, authenticating
# with the token stored in Colab's userdata under HF_ACCESS_TOKEN.
model.push_to_hub("Thufail/opt-350m-lora", token = userdata.get('HF_ACCESS_TOKEN'))

### Model Merging

Merge the LoRA weights into the base model.

In [None]:
# https://huggingface.co/docs/peft/v0.7.1/en/package_reference/lora#peft.LoraModel.merge_and_unload
# https://huggingface.co/docs/peft/en/developer_guides/lora
# Fold the adapter weights into the base model and keep the result under a
# separate name, so `model` still refers to the LoRA-wrapped object.
# NOTE(review): the base model was loaded in 4-bit; confirm the merged weights
# are acceptable for your use case.
merged_model = model.merge_and_unload()

In [None]:
# Upload the merged model. The token has to be passed by keyword: the second
# positional parameter of push_to_hub is `use_temp_dir`, so a positional token
# is never used for authentication.
merged_model.push_to_hub("Thufail/opt-350m-lora-merged", token=userdata.get('HF_ACCESS_TOKEN'))