<!-- !pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git -->

In [None]:
# HERE WE ARE TRAINING THE ADAPTERS, NOT THE ORIGINAL WEIGHTS
# We will be adding weights to the model at various points and fine-tuning only those weights.
# We will set up the Hugging Face Hub at the start so that we can later save our weights to the HF Hub.
# The token is a write token which is saved under the name PEFT-LORA-CHECKPOINTS.
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook so that the trained
# adapter weights can be pushed to the Hub later on.
notebook_login()

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
# bitsandbytes from HF turns your model into 8 bits
# Model won't take up a lot of GPU ram 
# Makes it easier and faster to store 
import bitsandbytes as bnb
from transformers import AutoTokenizer , AutoConfig , AutoModelForCausalLM , BitsAndBytesConfig

# Load the base model with its weights quantized to 8 bits by bitsandbytes.
# The quantization settings are passed as a `BitsAndBytesConfig` object through
# the `quantization_config` argument: passing `load_in_8bit=True` directly to
# `from_pretrained` is deprecated and will be removed in future versions.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloomz-560m",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# The tokenizer comes from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")

The original colab notebook will be posted after the code is complete. 

In [None]:
# Freeze the whole base model; only the adapters added later get trained.
for weight in model.parameters():
  weight.requires_grad = False
  if weight.ndim != 1:
    continue
  # 1-D parameters (e.g. layernorm) are kept in float32 for stability
  weight.data = weight.data.to(torch.float32)


# recompute activations during the backward pass instead of storing them all
model.gradient_checkpointing_enable()
# turn on gradients for the model inputs
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
  """Sequential container whose output is always returned as float32.

  The wrapped layers run exactly as they would inside a plain
  ``nn.Sequential``; the result is then cast to float32 for numerical
  stability.
  """

  def forward(self, x):
    result = super().forward(x)
    return result.float()

# lm_head is the model's output layer; wrap it with CastOutputToFloat so that
# the logits (the outputs of lm_head) are cast to float32, making them
# more stable for tasks like computing loss or metrics during training.
model.lm_head = CastOutputToFloat(model.lm_head)


# In PyTorch, the requires_grad attribute of a tensor indicates whether or not
# PyTorch should compute gradients for that tensor during the backward pass.
# requires_grad=True: enables gradient computation for the tensor.
# requires_grad=False: PyTorch does not track operations on the tensor, so no gradients are computed
# and the tensor is treated as a constant.
# It is used to specify which parameters in a model should be updated during training,
# and to save memory and computation time by disabling gradient tracking for parameters that don't need to be updated.

# 1D params are 1D tensors
# ndim==1 checks if the parameter is a 1D tensor (like biases or weights in layer normalization) and casts it to float32.
# In mixed precision training (e.g., using float16 for efficiency), small parameters like biases or layer norm weights can cause numerical instability.
#  Converting these parameters to float32 helps ensure stability without significantly affecting performance.

# Reducing Stored Activations : Enables gradient checkpointing, which reduces the memory usage for activations during training by 
# recomputing them during backpropagation instead of storing them.
# This trade-off between computation and memory usage is particularly useful for training large models with limited GPU memory.


# Enabling input gradients: this makes the output of the model's input embeddings require gradients.
# It is necessary when applying techniques like LoRA (Low-Rank Adaptation) or other methods that add trainable layers to a frozen model during fine-tuning.
# With the base weights frozen, this lets gradients propagate back through the model to those custom layers.
