# Step 0 - Preliminaries
* Install needed packages
* Setup HF ecosystem

In [None]:
# For any HF basic activities like loading models
# and tokenizers for running inference
# upgrade is a must for the newest Gemma model
!pip install --upgrade datasets
!pip install --upgrade transformers

# For doing efficient stuff - PEFT
!pip install --upgrade peft
!pip install --upgrade trl
!pip install bitsandbytes
!pip install accelerate

# for logging and visualizing training progress
!pip install tensorboard
# If creating a new dataset, useful for creating *.jsonl files
!pip install jsonlines

## Login to HF ecosystem or get the token from secrets

In [None]:
from huggingface_hub import notebook_login
from google.colab import userdata  # only importable when running on Google Colab
# Read the Hugging Face access token from the secret named 'hftoken';
# hf_token is later passed as token=... when the model and tokenizer are loaded for fine-tuning.
hf_token = userdata.get('hftoken')
# Interactive login to the Hugging Face Hub.
notebook_login()

# Step 1 - Run Inference on Pre-trained Model
* Load the model
* Load the tokenizer
* Visualize the model architecture
* Finally, query the model with a prompt and see the response

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint to pull from the Hugging Face Hub
# (alternative that was tried: "meta-llama/Llama-2-7b-hf").
model_name = "google/gemma-2b"

# Both objects are loaded with the library defaults; no torch_dtype override is applied.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Printing the model lists its module hierarchy, i.e. the architecture.
print(model)

In [None]:
# Prompt used to probe the pre-trained model (re-used after fine-tuning for comparison).
input_text = "What should I do on a trip to Europe?"

# Encode the prompt as PyTorch tensors, generate up to 128 tokens in total
# (max_length counts the prompt too), then decode the first sequence back to text.
input_ids = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(max_length=128, **input_ids)
print(tokenizer.decode(outputs[0]))

In [None]:
input_text = "Explain the process of photosynthesis in a way that a child could understand"
# Move the encoded prompt to whatever device the model lives on. The model above is
# loaded without an explicit device, so hard-coding "cuda" here would put inputs and
# weights on different devices and make generate() fail.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
print(input_ids)
# Generate up to 128 tokens in total (prompt included) and decode the first sequence.
outputs = model.generate(**input_ids, max_length=128)
print(tokenizer.decode(outputs[0]))

## Step 2 - Motivation for PEFT
- Try to finetune Gemma on Dolly dataset without LoRA
  * Load and visualize the dataset
  * Initiate the trainer
  * Start the training
  * Note that it's impossible to fine-tune on a single 14GB GPU (T4) on Colab

In [None]:
from datasets import load_dataset

# Only the first 1000 training rows are pulled for this experiment.
dataset_name = "databricks/databricks-dolly-15k"
dataset = load_dataset(path=dataset_name, split="train[0:1000]")

# Peek at the first example, then display the dataset summary as the cell output.
print(f"Instruction is: {dataset[0]['instruction']}")
print(f"Response is: {dataset[0]['response']}")
dataset

In [None]:
def formatting_prompts_func(example):
    """Build prompt strings for a batch of Dolly examples.

    Args:
        example: A batched mapping of column name -> list of values. The
            'instruction', 'response' and 'category' columns are read and are
            expected to have the same length.

    Returns:
        A list with one "Instruction:/Response:" formatted string for every
        row whose category is 'open_qa' or 'general_qa'; other rows are skipped.
    """
    output_texts = []
    rows = zip(example['instruction'], example['response'], example['category'])
    for instruction, response, category in rows:
        if category in ['open_qa', 'general_qa']:
            # Interpolate the per-row values, not the whole batch columns.
            text = f"Instruction:\n{instruction}\n\nResponse:\n{response}"
            output_texts.append(text)
    return output_texts

# SFTTrainer is otherwise only imported further down the notebook, so import it
# here to keep this cell runnable on a fresh kernel (Restart & Run All).
from trl import SFTTrainer

# Full fine-tuning of every model weight (no LoRA, no quantisation).
trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
)
print("Initialized trainer for training!")

# This is the motivating experiment for PEFT: on a single small GPU such as a
# Colab T4 this step is expected to run out of memory.
trainer.train()

# Finetune using Parameter Efficient Finetuning (PEFT)
- Create your own dataset
  - Visualizing the dataset
  - Cleaning and Preprocessing
  - Uploading to HF hub
- Fine-tune with the dataset


## Do all the imports

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer


## Create your own dataset

In [None]:
from datasets import load_dataset

# Pull the complete training split this time; it is the raw material
# for building the custom dataset below.
dataset_name = "databricks/databricks-dolly-15k"
dataset = load_dataset(path=dataset_name, split="train")

In [None]:
from collections import defaultdict

# Tally how many examples fall into each task category.
# Iterate the dataset directly: the index from enumerate() was never used, and
# binding it to `__` overwrote IPython's output-history variable of the same name.
# (The variable name's spelling is kept as-is so any other cell using it still works.)
categries_count = defaultdict(int)
for data in dataset:
    categries_count[data['category']] += 1
print(categries_count)

In [None]:
# Keep only the examples WITHOUT a context passage (rows that have a non-empty
# "context" are dropped) and flatten each kept row into a single prompt string
# stored under the "text" key.
filtered_dataset = [
    {"text": f"Instruction:\n{data['instruction']}\n\nResponse:\n{data['response']}"}
    for data in dataset
    if not data["context"]
]

# Show the first two formatted examples as a sanity check.
print(filtered_dataset[0:2])

In [None]:
# convert to json and save the filtered dataset as jsonl file
import jsonlines as jl

# Serialise every filtered example as one JSON object per line.
with jl.open('dolly-mini-train.jsonl', mode='w') as jsonl_out:
    jsonl_out.write_all(filtered_dataset)


In [None]:
from datasets import load_dataset

# Load the first 1000 rows of the mini dataset from the Hugging Face Hub.
# NOTE(review): presumably this is the uploaded copy of the jsonl file written above — confirm.
dataset_name = "ai-bites/databricks-mini"
dataset = load_dataset(path=dataset_name, split="train[0:1000]")
dataset

## Define all the parameters
- LoRA parameters
- bitsandbytes parameters
- training arguments / parameters
- Supervised fine-tuning (SFT) parameters

In [None]:
# Model names: the base checkpoint to fine-tune and the local name/directory
# under which the fine-tuned model is saved.
model_name = "google/gemma-2b"
new_model = "gemma-ft"

################################################################################
# LoRA parameters
################################################################################
# LoRA attention dimension (rank); 64 was the previously tried value
# lora_r = 64
lora_r = 4
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models (name of a torch dtype, resolved with getattr later)
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training (set bf16 to True with an A100).
# bf16 may be overwritten by the GPU capability check in a later cell.
fp16 = False
bf16 = False
# Batch size per GPU for training
per_device_train_batch_size = 4
# Batch size per GPU for evaluation
per_device_eval_batch_size = 4
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"
# Number of training steps (overrides num_train_epochs); -1 leaves the epoch count in charge
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True
# Save checkpoint every X updates steps
save_steps = 25
# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use (None was the alternative setting)
max_seq_length = 40 # alternative: None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = True # alternative: False
# Device placement for the model. "auto" is used here;
# the alternative {"": 0} loads the entire model on GPU 0.
# device_map = {"": 0}
device_map="auto"


In [None]:
# Resolve the dtype name (e.g. "float16") to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantisation settings for loading the base model (QLoRA).
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,        # float16
    bnb_4bit_quant_type=bnb_4bit_quant_type,     # nf4
    bnb_4bit_use_double_quant=use_nested_quant,  # False
    load_in_4bit=use_4bit,                       # turns on 4-bit precision loading
)

In [None]:
# When loading in 4-bit with a float16 compute dtype, switch bf16 training on
# only if the GPU's major compute capability is 8 or higher; otherwise force it off.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    bf16 = major >= 8
    if bf16:
        print("Setting BF16 to True")


## Load the Model and Tokenizer

In [None]:
# Load the base model, quantised according to bnb_config and placed per device_map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
    token=hf_token,
)
# NOTE(review): pretraining_tp looks like a Llama-specific config field — confirm it is needed for Gemma.
model.config.pretraining_tp = 1
# Turn the generation cache off for training.
model.config.use_cache = False

# Matching tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    token=hf_token,
)
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# LoRA adapter settings: rank, scaling and dropout come from the parameter cell;
# adapters are attached to the listed projection modules only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
    ],
)

In [None]:
# Set training parameters.
# Every value comes from the parameter cell above. gradient_checkpointing and
# per_device_eval_batch_size are configured there as well, so they are forwarded
# here too; otherwise those settings would silently have no effect.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",  # training logs are written for TensorBoard
)
# Display the resolved arguments as the cell output.
training_arguments

In [None]:
# Build the supervised fine-tuning trainer: quantised base model + LoRA config,
# training on the "text" column of the dataset.
# NOTE(review): trl is installed with --upgrade; confirm that the installed release
# still accepts these keyword arguments on SFTTrainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

In [None]:
# Train the model, then save the trained model under the name in new_model.
# The saved output is loaded again later via PeftModel.from_pretrained(base_model, new_model).
trainer.train()
trainer.model.save_pretrained(new_model)

## Visualize training on Tensorboard

In [None]:
# tensorboard is installed in the setup cell at the top of the notebook.
# Load the notebook extension and point it at the run logs under results/runs.
%load_ext tensorboard
%tensorboard --logdir results/runs

# Prompt the newly fine-tuned model
* Load and MERGE the LoRA weights with the model weights
* Run inference with the same prompt we used to test the pre-trained model

In [None]:
# Same prompt that was used to probe the pre-trained model.
input_text = "What should I do on a trip to Europe?"

# Reload the base checkpoint in float16 (not quantised this time).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)
# Attach the saved LoRA weights and merge them into the base weights.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Reload the tokenizer with the same padding setup as during training.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Place the encoded prompt on the model's own device instead of hard-coding "cuda":
# the merged model is loaded with device_map, so the device is decided at load time.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
print(input_ids)
# Generate up to 128 tokens in total (prompt included) and decode the first sequence.
outputs = model.generate(**input_ids, max_length=128)
print(tokenizer.decode(outputs[0]))