<a href="https://colab.research.google.com/github/Rajeeb321123/Large-Language-model/blob/master/4_Memorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Packages

In [None]:
!pip -m pip install --upgrade pip -1
!pip install transformers == 4.38.1 -q -U
!pip install bitsandbytes == 0.42.0 -q -U
!pip install peft == 0.8.2 -q -U
!pip install accelerate == 0.27.2 -q -U
!pip install flash ==  -q -U
!pip install  datasets
!pip install  scipy
!pip install  trl
!pip install  hf_transfer
!pip install  huggingface_hub
!pip install  wanddb

In [None]:
# Run the transformers CLI `env` command to show the installed environment
# (handy for confirming the versions pinned in the install cell).
!transformers-cli env

In [None]:
## Unsloth install

## Load Model

In [None]:
# For gated models on HuggingFace: authenticate this notebook session with the Hub.
# The login is interactive, so no access token is hardcoded in the notebook.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
%env HF_HUB_ENABLE_HF_TRANSFER = True # for high speed downloading and uploading to hugging face hub

In [None]:
# Download/cache location handed to from_pretrained() in the model-loading cell.
cache_dir = '' # comment out if Google Drive is set as cache_dir

# base model (Unsupervised Trial)
# Hugging Face Hub repo id used for both the model and the tokenizer.
model_id = "openchat/openchat_3.5"

In [None]:
## Load the model and tokenizer for LoRA or DoRA
# The package name is lowercase (`transformers`) and class names are case-sensitive:
# AutoModelForCausalLM and BitsAndBytesConfig are the names used in the cells below.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit (NF4, double-quantized) loading configuration.
# It only takes effect if `quantization_config=bnb_config` is passed to
# from_pretrained(); that argument is currently commented out in the loading cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # if newer gpu: bfloat16
)

In [None]:
# Optional (disabled): override the context length before loading the model.
# NOTE: AutoConfig is not among the names imported from transformers in this notebook;
# add it to that import before enabling these lines.
# config = AutoConfig.from_pretrained(model_id)
# config.max_position_embeddings = 4096 # (input + output) # model will only learn from sequences of at most 4096 tokens

In [None]:
# Load the base model. Every active keyword argument must end with a comma:
# the interleaved comments make a missing one easy to overlook.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #config=config,

    #quantization_config=bnb_config,

    #rope_scaling={"type": "linear", "factor": 2.0}, # RoPE scaling: https://www.hopsworks.ai/dictionary/rope-scaling and https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/preparing_model

    # https://huggingface.co/docs/accelerate/v0.25.0/en/concept_guides/big_model_inference
    device_map='auto', # It's also possible to pass your own device map for the layers, specifying a GPU index (a number), "cpu", or "disk".

    # trust_remote_code=True means "download the model code from the Hugging Face repo along with the weights, and run it".
    # If it's False, the library uses the model architectures built into huggingface/transformers and only downloads the weights.
    #trust_remote_code=False,

    torch_dtype=torch.float16, # if newer gpu: bfloat16

    # https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention
    attn_implementation="flash_attention_2", # Works with llama model

    # Location for downloaded weights (set in the configuration cell).
    cache_dir=cache_dir,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=False)

In [None]:
## Load the Model and Tokenizer for Unsloth

## Loading checks

In [None]:
# Sanity check after loading: list every parameter whose device type is "meta".
# The notebook treats a "meta" parameter as one that did not fit on the GPU,
# so the expected outcome is that nothing is printed.
for n, p in model.named_parameters():
  if p.device.type != "meta":
    continue  # not on meta: nothing to report
  print(f"{n} is on meta")

In [None]:
# Position limit recorded in the loaded model's config.
print(model.config.max_position_embeddings)

# eos = end of sequence
# https://huggingface.co/docs/transformers/en/pad_truncation
# very important for pad and eos use: https://www.natebrake.com/blog/llm/end-of-sequence-explained
# The attribute path is `model.config.eos_token_id`.
print(model.config.eos_token_id)

In [None]:
## Prepare for LoRA fine-tuning
def print_trainable_parameters(model):
  """
  Print which parameters of the model are trainable and which are frozen,
  followed by a summary of element counts.

  Args:
    model: any object exposing ``named_parameters()`` that yields
      ``(name, param)`` pairs, where ``param`` provides ``requires_grad``
      and ``numel()`` (for example a ``torch.nn.Module``).

  Returns:
    None. All information is written with ``print``.
  """
  trainable_params = 0
  non_trainable_params = 0
  # Names of frozen parameters, collected during the single pass so they can be
  # listed under their own heading without walking the model a second time.
  frozen_names = []

  print("Trainable Parameters:")
  for name, param in model.named_parameters():
    # numel() is the total number of elements in the parameter tensor.
    count = param.numel()

    # requires_grad=True: the parameter receives gradient updates (trainable).
    # requires_grad=False: it is excluded from gradient updates (frozen).
    if param.requires_grad:
      trainable_params += count
      print(f"  {name} ")
    else:
      non_trainable_params += count
      frozen_names.append(name)

  print("\nNon_Trainable Parameters")
  for name in frozen_names:
    print(f" {name} ")

  # Total over all parameters (trainable + non-trainable), now reported in the summary.
  all_params = trainable_params + non_trainable_params

  print(
      f"\nSummary:\n Trainable params: {trainable_params}\n Non-Trainable params:{non_trainable_params}\n All params: {all_params}"
  )

## Standard LoRA or DoRA