<!-- !pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git -->

In [None]:
# HERE WE ARE TRAINING THE ADAPTERS NOT THE original  WEIGHTS
# We will be adding weights to the model at various points and fine-tuning those weights
# We will set up the huggingface hub at the start so that we can later save our weights to HF hub
# the token is a write token which is saved by the name - PEFT-LORA-CHECKPOINTS
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub up front so the trained adapter weights can
# be pushed to the Hub at the end of the notebook (requires a write token).
notebook_login()

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
# bitsandbytes from HF turns your model into 8 bits
# Model won't take up a lot of GPU ram 
# Makes it easier and faster to store 
import bitsandbytes as bnb
from transformers import AutoTokenizer , AutoConfig , AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# Single source of truth for the checkpoint so the model and tokenizer
# can never drift apart.
MODEL_NAME = "bigscience/bloomz-560m"

# Load the base model with 8-bit weights. The quantization settings are
# passed through `quantization_config` because handing `load_in_8bit`
# directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE: earlier versions of this cell used `load_in_8bit=True` as a direct
# argument, which emitted a deprecation warning asking for a
# `BitsAndBytesConfig` object in `quantization_config` instead.

The original colab notebook will be posted after the code is complete. 

In [None]:
# Freeze every weight of the base model; only the adapters added later train.
for weight in model.parameters():
  weight.requires_grad = False
  # 1-D tensors (e.g. layernorm) are kept in float32 for numerical stability.
  is_one_dimensional = weight.ndim == 1
  if is_one_dimensional:
    weight.data = weight.data.to(torch.float32)

# Store fewer activations during the forward pass.
model.gradient_checkpointing_enable()

# Make the model inputs require gradients.
model.enable_input_require_grads()


# cast the numerical output to float32 for stability , wrapper around sequential layers 
class CastOutputToFloat(nn.Sequential):
  """Sequential container whose final output is converted to float32."""

  def forward(self, x):
    """Run the wrapped layers on ``x`` and return the result as float32."""
    result = super().forward(x)
    return result.to(torch.float32)

# lm_head is the model's output layer; wrap it in CastOutputToFloat so that
# the logits it produces are cast to float32, which keeps them more stable
# for computing the loss or metrics during training.
model.lm_head = CastOutputToFloat(model.lm_head)


# In PyTorch, the requires_grad attribute of a tensor indicates whether or not 
# PyTorch should compute gradients for that tensor during the backward pass.
# requires_grad=True: Enables gradient computation for the tensor.
# When requires_grad=False, PyTorch does not track operations, so no gradients are computed, 
# and the tensor is treated as a constant.
#  It is used To specify which parameters in a model should be updated during training.
# To save memory and computation time by disabling gradient tracking for parameters that don't need to be updated.

# 1D params are 1D tensors
# ndim==1 checks if the parameter is a 1D tensor (like biases or weights in layer normalization) and casts it to float32.
# In mixed precision training (e.g., using float16 for efficiency), small parameters like biases or layer norm weights can cause numerical instability.
#  Converting these parameters to float32 helps ensure stability without significantly affecting performance.

# Reducing Stored Activations : Enables gradient checkpointing, which reduces the memory usage for activations during training by 
# recomputing them during backpropagation instead of storing them.
# This trade-off between computation and memory usage is particularly useful for training large models with limited GPU memory.


#  Enabling Input Gradients: This ensures that gradients with respect to the model’s inputs are computed.
# Necessary when applying techniques like LoRA (Low-Rank Adaptation) or other methods that modify intermediate layers during fine-tuning.
#  This enables the model to propagate gradients correctly for such custom layers or inputs.


SETTING UP LORA ADAPTERS:


In [None]:
# The purpose of the function is to calculate and print the number of trainable parameters versus the total number of parameters in the model.
# _: The name of the parameter is ignored in this loop since it is not needed.
# param: Represents the actual parameter tensor.
# numel stands for number of elements 
# For a tensor with a specific shape, .numel() computes the product of its dimensions to determine how many elements the tensor contains.

def print_trainable_parameters(model):
  """Print a one-line summary of trainable vs. total parameter counts.

  Args:
    model: Any object exposing ``named_parameters()`` that yields
      ``(name, parameter)`` pairs, where each parameter has ``numel()``
      and a ``requires_grad`` flag.

  The summary is printed exactly once, after all parameters have been
  counted. A model without parameters reports 0% instead of raising
  ``ZeroDivisionError``.
  """
  trainable_params = 0
  all_params = 0
  for _, param in model.named_parameters():
    count = param.numel()
    all_params += count
    # Parameters with requires_grad=True receive gradients during
    # backpropagation, so they are the ones that actually get trained.
    if param.requires_grad:
      trainable_params += count

  # Guard the percentage against an empty model.
  trainable_percent = 100 * trainable_params / all_params if all_params else 0.0
  print(f"trainable params:{trainable_params} || all_params:{all_params} || trainable% :{trainable_percent}")


'''
Why use .numel()? It returns the total number of elements in a tensor.
Count Parameters: When iterating over model parameters, .numel() helps determine the total number of elements in each parameter tensor.
Memory Estimation: Knowing the number of elements allows you to estimate the memory required for storing the tensor.
Debugging: Helps ensure the expected number of elements in a tensor, especially when dealing with reshaping or slicing operations.


LoRA: Fine-tunes only a small subset of parameters (low-rank adapters) while freezing most of the original model parameters.
This function helps verify that the model is correctly set up for LoRA by ensuring only the intended parameters are trainable.
During LoRA setup, you'd expect the trainable% to be very small (e.g., <1%), as most parameters remain frozen.
'''

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters for the adapters.
config = LoraConfig(
    task_type="CAUSAL_LM",  # set this for CLM or Seq2seq
    r=16,                   # rank of the low-rank adapter matrices
    lora_alpha=32,          # alpha scaling applied to the LoRA updates
    lora_dropout=0.05,
    bias="none",
    # target_modules is left unset here (previously tried: ["q_proj","v_proj"])
)

# Wrap the base model with the adapters, then report how much of it trains.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# LoraConfig: Defines the configuration for LoRA fine-tuning, specifying the settings for how the low-rank adaptation is applied
# get_peft_model: A utility function to apply LoRA to an existing model, integrating the LoRA parameters into the model's architecture.
'''
r=16
Specifies the rank of the low-rank matrices used in LoRA.
In LoRA, instead of training the full parameter matrices of the model, a pair of low-rank matrices (rank r) is trained and added to the original weights.
Higher r values allow the model to learn more complex adaptations but require more compute and memory.

lora_alpha=32
A scaling factor applied to the LoRA updates.
Controls how much influence the LoRA layers have on the overall model's behavior.
Larger values of lora_alpha scale the LoRA updates more aggressively.

target_modules=["q_proj", "v_proj"] 
Specifies which modules in the model are targeted for LoRA adaptation.
For transformer-based models:
q_proj refers to the query projection in the attention mechanism.
v_proj refers to the value projection.
By focusing only on specific modules, LoRA reduces the number of trainable parameters and computation cost.

lora_dropout=0.05
Introduces a small dropout rate to the LoRA updates.
Helps improve generalization by randomly dropping a portion of the LoRA updates during training.

bias="none"
Specifies how the bias terms in the model are handled.
"none": No bias terms are added or trained.
"all": Bias terms are trainable.
"lora_only": Only biases corresponding to LoRA modules are trainable.


task_type="CAUSAL_LM"
Defines the type of task the model is being fine-tuned for.
"CAUSAL_LM": Indicates the task is causal language modeling (e.g., autoregressive text generation).
Other possible task types include "SEQ2SEQ_LM" (sequence-to-sequence tasks) or "TOKEN_CLASSIFICATION".
'''

DATA

In [None]:
import transformers
from datasets import load_dataset
# Load the "Abirate/english_quotes" dataset; later cells read its "quote" and
# "tags" columns from the "train" split.
data = load_dataset("Abirate/english_quotes")
# Tokenizing the raw quotes directly is disabled; a merged "prediction" column
# is tokenized further down instead.
# data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

In [None]:
data 

In [None]:
data["train"][:5]

In [None]:

# Given a quote from the user, predict its tags
def merge_columns(example):
  """Add a ``prediction`` field of the form ``<quote>->:<tags>`` to ``example``.

  The "->:" marker teaches the model to condition on the text before it
  (the quote) and to generate what follows it (the tags). The row is
  modified in place and also returned.
  """
  pieces = (example["quote"], "->:", str(example["tags"]))
  example["prediction"] = "".join(pieces)
  return example


# Apply merge_columns to every row of the train split, which adds the
# "prediction" column used for tokenization below.
data["train"] = data["train"].map(merge_columns)
# Peek at two of the merged strings as a sanity check.
data["train"]["prediction"][3:5]


In [None]:
data["train"][0]

In [None]:
# Tokenize the merged "prediction" strings in batches; the tokenizer outputs
# are stored alongside the existing columns.
data = data.map(lambda samples:tokenizer(samples["prediction"]),batched= True)

In [None]:
data
# extra columns : prediction , input_ids , attention_mask

TRAINING PROCESS

In [None]:
# Hyperparameters for a short run on a single GPU.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Warmup: begin with an extremely low learning rate and ramp it up
    # to the configured 2e-4.
    warmup_steps=100,
    max_steps=200,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
)

# mlm=False: build batches without masked-language-model masking.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=collator,
)

# Turn off the model's generation cache before training starts.
model.config.use_cache = False
trainer.train()

In [None]:
 # Save the adapters on the HUB
# Upload the trained adapters to a private Hub repository. `token=True`
# reuses the credentials stored by the earlier Hub login; it replaces the
# deprecated `use_auth_token` argument.
model.push_to_hub(
    "goffer/bloomz-560m-lora",
    token=True,
    commit_message="LORA Adapters",
    private=True,
)