## Packages

# Preprocessing

In [None]:
!python -m pip install --upgrade pip -q
!pip install transformers  -q -U
!pip install bitsandbytes  -q -U
!pip install peft  -q -U
!pip install accelerate  -q -U
!pip install flash  -q -U
!pip install  datasets -q -U
!pip install  scipy -q -U
!pip install  trl -q -U
!pip install  hf_transfer -q -U
!pip install  huggingface_hub -q -U
!pip install  wandb -q -U

In [None]:
!transformers-cli env

In [None]:
## Unsloth install

In [None]:
# # # Empty VRAM
# del model

# # del trainer
# import gc
# gc.collect()
# gc.collect()

## Load Model

In [None]:
# For gated models on HuggingFace
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
%env HF_HUB_ENABLE_HF_TRANSFER = True # for high speed downloading and uploading to hugging face hub

In [None]:
cache_dir = '' # cache location passed to AutoModelForCausalLM.from_pretrained below; comment this line out if cache_dir is already set to a Google Drive path

# base model (Unsupervised Trial)
model_id = "openchat/openchat_3.5"

In [None]:
## Load the model and tokenizer for LoRA or DoRA
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# bitsandbytes settings: 4-bit NF4 weights with double quantization and a float16
# compute dtype (the original note suggests bfloat16 on newer GPUs).
_quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
bnb_config = BitsAndBytesConfig(**_quantization_options)

In [None]:
# config = AutoConfig.from_pretrained(model_id)
# cofig.max_position_embeddings = 4096 # (input + output) #model will only learn from max 4096 sequence of token

In [None]:
# Load the base model `model_id` with the quantization settings held in `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #config=config,

    quantization_config=bnb_config,

    #rope_scaling={"type":linear, "factor": 2.0}, # RoPE scaling: https://www.hopsworks.ai/dictionary/rope-scaling and https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/preparing_model

    # Device placement reference: https://huggingface.co/docs/accelerate/v0.25.0/en/concept_guides/big_model_inference
    # device_map='auto', # a custom device map can also be passed, assigning layers to a GPU index (a number), "cpu", or "disk"
    device_map = {"": 0}, # explicit map used because 'auto' above was not working

    # trust_remote_code=True downloads the model code from the Hugging Face repo along with the weights and runs it.
    # With False, the library uses the model architectures built into huggingface/transformers and only downloads the weights.
    #trust_remote_code=False,

    torch_dtype=torch.float16, # if newer gpu: bfloat16

    # https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention
    # attn_implementation="flash_attention_2", # Works with llama model

    cache_dir = cache_dir
)


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=False)

In [None]:
## Load the Model and Tokenizer for Unsloth

## Loading checks

In [None]:
# Sanity check after loading: report every parameter that is still on the "meta" device,
# i.e. one that was not actually materialised on the GPU.
meta_param_names = [
    param_name
    for param_name, param in model.named_parameters()
    if param.device.type == "meta"
]
for param_name in meta_param_names:
  print(f"{param_name} is on meta")

In [None]:
print(model.config.max_position_embeddings)

#eos = end of sequence
# https://huggingface.co/docs/transformers/en/pad_truncation
# very important for pad and eos use: https://www.natebrake.com/blog/llm/end-of-sequence-explained
print(model.config.eos_token_id)

In [None]:
## Prepare for LoRA fine-tuning
def print_trainable_parameters(model):
  """
  Print which parameters of `model` are trainable and which are frozen.

  The names of all parameters with `requires_grad=True` are listed first, then the
  names of all parameters with `requires_grad=False`, followed by a summary with the
  total number of elements (`numel()`) in each group.

  Args:
    model: object exposing `named_parameters()` that yields (name, tensor) pairs,
      e.g. a torch.nn.Module.

  Returns:
    None. Everything is written to stdout.
  """
  trainable_names = []
  frozen_names = []
  trainable_params = 0
  non_trainable_params = 0

  # Single pass: bucket every parameter by its requires_grad flag and tally element counts.
  for param_name, tensor in model.named_parameters():
    element_count = tensor.numel()
    if tensor.requires_grad:
      trainable_names.append(param_name)
      trainable_params += element_count
    else:
      frozen_names.append(param_name)
      non_trainable_params += element_count

  print("Trainable Parameters:")
  for param_name in trainable_names:
    print(f"  {param_name} ")

  print("\nNon_Trainable Parameters")
  for param_name in frozen_names:
    print(f" {param_name} ")

  summary = f"\nSummary:\n Trainable params: {trainable_params}\n Non-Trainable params:{non_trainable_params}"
  print(summary)

## Standard LoRA or DoRA

In [None]:
print(model)

Important documentation on training large models faster:

https://huggingface.co/docs/transformers/v4.18.0/en/performance

In [None]:
from peft import prepare_model_for_kbit_training

# Gradient checkpointing and gradient accumulation are described at https://huggingface.co/docs/transformers/v4.18.0/en/performance
model.gradient_checkpointing_enable() # saves VRAM in exchange for somewhat slower training

model = prepare_model_for_kbit_training(model) # required when the model is loaded quantized; keep uncommented

from peft import LoraConfig, get_peft_model

# Understanding LoRA parameters: https://medium.com/@drishtisharma96505/comparative-analysis-of-lora-parameters-on-llama-2-with-flash-attention-574b913295d4
peft_config = LoraConfig( # matching the Llama recipe
                         r = 8,
                          lora_alpha = 32,
                          # Names of the modules that receive LoRA adapters
                          target_modules = [
                              "q_proj",
                              "k_proj",
                              "v_proj",
                              "o_proj",
                              # "self_attn.rotary_emb.inv_freq",

                              ## comment out the 3 below for Mixtral
                              "gate_proj",
                              "up_proj",
                              "down_proj",

                              # "lora_magnitude_vector" # required for DoRA,
                              # "input_layernorm.weight",
                              # "post_attention_layernorm.weight",
                              # "model.norm.weight",
                              # "lm_head.weight",


                              # "dense_h_to_4h",  #for falcon
                              # "dense_4h_to_h",  #for falcon
                              # "query_key_value",  #for falcon
                              # "dense" #for falcon
                          ],
                          lora_dropout = 0.1,
                          bias = "none",
                          task_type="CAUSAL_LM"
                          )

model = get_peft_model(model, peft_config) # wrap the model as a PEFT model; `model` now refers to the wrapped model

In [None]:
# print_trainable_parameters(model)

In [None]:
## Unsloth LoRA

## Set up Tokenizer and Padding

In [None]:
print(tokenizer)
print(tokenizer.vocab_size)

In [None]:
print(tokenizer.bos_token) #check begining of sequence
print(tokenizer.eos_token) # end of sequence

In [None]:
# # Optionally set the chat template manually.
# tokenizer.chat_template = "{ if not add_generation_prompt is defined %}"

# Test the chat template on a short multi-turn conversation
messages = [
    {'role': 'user', 'content': "write a quick sort algorithm in python"},
    {'role': 'assistant', 'content': "here your are"},
    {'role': 'user', 'content':"great."}
]

# tokenize=False makes apply_chat_template return the formatted conversation as a single
# string instead of token ids, so the rendered template can be inspected by eye.
inputs = tokenizer.apply_chat_template(messages, tokenize = False)
print(inputs)

In [None]:
# very important for pad and eos use: https://www.natebrake.com/blog/llm/end-of-sequence-explained
# Choosing pad_token for tokenizer

## Option A - reuse an existing token for padding: <pad>, else <|pad|>, else <unk>, else EOS
# Candidates in priority order, each paired with the message printed when it is chosen.
pad_token_candidates = (
    ('<pad>', '<pad> token is in the tokenizer. Using <pad> for pad'),
    ('<|pad|>', '<|pad|> token is in the tokenizer. Using <|pad|> for pad'),
    ('<unk>', '<unk> token is in the tokenizer. Using unk for pad'),
)

for candidate_token, chosen_message in pad_token_candidates:
  if candidate_token in tokenizer.get_vocab():
    print(chosen_message)
    tokenizer.pad_token = candidate_token
    break
else:
  # No candidate exists in the vocabulary; choosing eos_token as pad_token may be risky.
  print(f'Using EOS token, {tokenizer.eos_token}, for padding.')
  tokenizer.pad_token = tokenizer.eos_token


# ## OPTION B - create pad token
# # Check if the pad token is already in the tokenizer vocabulary
# if '<pad>' not in tokenizer.get_vocab():
#   print('pad token not in the tokenizer, adding a <pad> token')

#   #Add the pad token
#   tokenizer.add_tokens(['<pad>'])
#   # set the pad token
#   tokenizer.pad_token = '<pad>'
#   # Resize token embeddings
#   model.resize_token_embeddings(tokenizer.vocab_size)

In [None]:
# Propagate the tokenizer's pad token id to the model object and to its config
model.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Check that both copies agree with the tokenizer
assert model.pad_token_id == tokenizer.pad_token_id
assert model.config.pad_token_id == tokenizer.pad_token_id

# Print the pad token ids
print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)
# len(tokenizer) counts added tokens as well; tokenizer.vocab_size only reports the
# base vocabulary and would not reflect tokens added on top of it.
print('Number of tokens now in tokenizer:', len(tokenizer))

In [None]:
print("Special tokens map:", tokenizer.special_tokens_map)
print( "All special tokens:", tokenizer.all_special_tokens)

In [None]:
tokenizer.padding_side = 'right'

In [None]:
# # Uncomment to switch to left padding, not recommended for unsloth
# tokenizer.padding_side = 'left # left padding is ususally not good idea for most model, but some use cases it may be useful

In [None]:
print(tokenizer)

## Set embedding and norm layers to trainable (recommended only for chat fine-tuning, if you're changing the template or the context length)

In [None]:
# TODO

## Set up Evaluation

- optional

In [None]:
from transformers import TextStreamer
from peft import PeftModel
import torch
import gc  # import Python's garbage collection module

# Define a stream
def Stream(user_prompt, model_type, tokenizer, checkpoint='', max_new_tokens=10):
  """
  Generate a reply to a single user prompt and stream it to stdout.

  Args:
    user_prompt: Text of the user turn; surrounding whitespace is stripped.
    model_type: 'base' to generate with the notebook-level `model` as it currently is,
      or 'fine-tuned' to first load the PEFT adapter at `checkpoint` on top of `model`.
    tokenizer: Tokenizer used for the chat template, the encoding and the streamed decoding.
    checkpoint: Adapter location passed to PeftModel.from_pretrained; only used when
      model_type is 'fine-tuned'.
    max_new_tokens: Upper bound on the number of generated tokens (default 10, the
      value that used to be hard-coded).

  Returns:
    None. The generated text is printed by the TextStreamer.

  Raises:
    ValueError: if model_type is neither 'base' nor 'fine-tuned'.
  """
  if model_type == 'base':
    eval_model = model
  elif model_type == 'fine-tuned':
    eval_model = PeftModel.from_pretrained(model, checkpoint)
    eval_model = eval_model.to("cuda") # compute on GPU

    for n, p in eval_model.named_parameters():
      if p.device.type == "cpu":
        print(f"{n} is on CPU!")
  else:
    # Fail here with a clear message; continuing would only crash later on the
    # unassigned eval_model.
    raise ValueError("You must set the model_type to base or fine-tuned")

  # Let the model reuse cached keys/values from earlier tokens while decoding.
  eval_model.config.use_cache = True

  messages = [
      # strip() removes leading/trailing whitespace from the prompt
      {'role': 'user', 'content': f"{user_prompt.strip()}",}
  ]

  # add_generation_prompt=True appends the assistant-turn prefix so the model knows
  # where its answer starts.
  prompt_text = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt=True)

  # The chat template already inserted the special tokens, hence add_special_tokens=False.
  # The encoded batch is moved to the model's device so generate() does not mix devices.
  model_device = next(eval_model.parameters()).device
  inputs = tokenizer([prompt_text], return_tensors="pt", add_special_tokens=False).to(model_device)

  # token_type_ids are not needed for generation here
  if "token_type_ids" in inputs:
    del inputs["token_type_ids"]

  streamer = TextStreamer(tokenizer)

  print(f'eval_model is on: {model_device}') # CPU or CUDA
  print(f'input_ids are on: {inputs["input_ids"].device}')

  # Generate in eval mode so dropout layers are inactive, then restore the previous
  # mode because 'base' evaluation shares the model object that is trained later.
  was_training = eval_model.training
  eval_model.eval()
  try:
    # parameters of .generate: https://huggingface.co/docs/transformers/en/main_classes/text_generation
    _ = eval_model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens, use_cache=True)
  finally:
    eval_model.train(was_training)
    # Clear GPU cache and run garbage collection
    torch.cuda.empty_cache()
    gc.collect()

def evaluation(model_type, tokenizer, checkpoint=''):
  """
  Run a fixed set of touch-rugby questions through Stream and print the reference answers.

  For every (question, answer) pair the question is passed to Stream, which prints the
  model's reply, and the expected answer is then printed underneath for manual comparison.

  Args:
    model_type: forwarded to Stream ('base' or 'fine-tuned').
    tokenizer: forwarded to Stream.
    checkpoint: forwarded to Stream; adapter location for 'fine-tuned'.

  Returns:
    None.
  """
  # One string per question. Every element needs its trailing comma: adjacent string
  # literals without one are silently concatenated into a single element.
  questions = [
    "In the context of Touch Rugby Internation Rules 2020, what does the dead ball line marks?",
    "How many players are on the field on each team in touch rugby?",
    "In touch rugby, does a forward pass result in a roll ball or a Penalty",
    "In touch rughby, how long is half time?",
    "In touch rugby, how does the game commence?",
    "In touch rugby, how many points is a try worth?",
  ]

  # Reference answers, index-aligned with `questions`.
  answers = [
      " The Dead ball line marks the end boundaries of the field of play",
      "6 players",
      "Penalty",
      "5 minutes",
      "The game begins with a tap on the halfway line",
      "1 point",
  ]

  for question, answer in zip(questions, answers):
    Stream(question, model_type, tokenizer, checkpoint)
    print("Correct Answer:", answer)
    print('\n\n')

In [None]:
print(model.config)

In [None]:
print(model.generation_config)

In [None]:
# checking the base model which hasnot been fine tunned
evaluation("base", tokenizer)

## Load the Dataset

In [None]:
from datasets import load_dataset

dataset = 'Trelis/touch-rugby-rules-memorisation'

data = load_dataset(dataset)

In [None]:
# Print the first row of 'train' and 'test' (index 0 in both splits)
print("First row of train:", data['train'][0])
print("First row of test:", data['test'][0])

In [None]:
# Extract the 'messages' field from the first row of 'train' in data
text = data['train'][0]['messages']

# Tokenize the text
# NOTE(review): the field is passed to tokenizer.encode as-is, which assumes 'messages' is a plain string — confirm against the dataset schema
tokens = tokenizer.encode(text, add_special_tokens = True)

# Decode back to text
decoded_text = tokenizer.decode(tokens)

# Print the token ids and the decoded text
print("Token IDs:", tokens)
print("Decode Text:", decoded_text)

# Train

## Set up and run Training (with saving of data logs to Drive)

Using the TRL trainer is recommended.

### TRL Trainer

In [None]:
# Short names (the part after the last "/") used to build the output directory name
model_name = model_id.split("/")[-1]
dataset_name = dataset.split("/")[-1]

# Training parameters
epochs = 1 # 1 epoch is considered enough here
context_length = 512 # most Q&A pairs are not longer than 512 tokens; used as max_seq_length for the trainer

# Backpropagation parameters
grad_accum = 1 # gradient accumulation steps: virtually increases the batch size; may not affect VRAM but increases training time
batch_size = 1 # per-device batch size: granular updates, intended to help memorisation

fine_tune_tag = 'touch_rugby-rules'
# Output directory name encodes the model, the dataset and the hyper-parameters above
save_dir = f'./results/{model_name}_{dataset_name}_{epochs}_epochs_{context_length}_length_{grad_accum}_grad_accum_{batch_size}_batch_size_{fine_tune_tag}'
print(save_dir)

In [None]:
# #Custom callback for just logging
# import transformers
# import os

# # custom callback to log metrics
# class LoggingCallback(transformers.TrainerCallback):
#   def _init_(self, log_file_path):
#     self.log_file_path = log_file_path
#     self.save_dir = save_dir

#   def on_log(self, args, state, control, model = None, **kwargs):
#     with open(self.log_file_path, 'a') as f:
#       if 'loss' in loss:
#         f.write(f"Step: {state.global_step}, Training Loss: {logs["loss"]}")
#       if 'eval_loss' in loss:
#         f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}")

#       f.flush() # Force flush the buffered data to file

#     # Check if the current step is a checkpoint step
#     if state.global_step % int(args.save_steps) == 0:
#       # Check if the last checkpoint path exists
#       if state.best_model_checkpoint:
#         checkpoint_dir = state.best_model_checkpoint
#       else:
#         # if not, construct the checkpoint directory path manually
#         checkpoint_dir = os.path.join(args.output_dir, f"checkpoint")

#       #Ensure the checkpoint directory exist
#       os.makedirs(checkpoint_dir, exist_ok=True)

#       # Save trainable params in the checkpoint directory
#       current_trainable_params = {n: p for n, p in model.named_parameters()}
#       current_trainable_params_state_dict = {n:p.data for n, p in current_trainable_params}

#       file_path = os.path.join(checkpoint_dir, "trainable_params.bin")
#       torch.save(current_trainable_params_state_dict, file_path)

# # log file path
# log_file_path = os.path.join(cache_dir, "training_logs.txt")

# # Creating an instance of custom callback class
# logging_callback = LoggingCallback(log_file_path)

In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# NOTE(review): `Trainer` is imported but not used in this cell.
from transformers import Trainer
from transformers import TrainingArguments
from trl import SFTTrainer

# Build the supervised fine-tuning trainer around the PEFT-wrapped `model`.
# NOTE(review): the notebook installs transformers/trl unpinned with -U; argument names such as
# `evaluation_strategy`, `dataset_text_field`, `max_seq_length` and `tokenizer` have changed across
# releases — confirm they match the installed versions.
trainer = SFTTrainer(
    # peft_config = peft_config # not needed: the LoRA config was already applied to the model via get_peft_model
    dataset_text_field = "messages", # name of the dataset column that holds the training text
    max_seq_length = context_length, # maximum sequence length
    tokenizer = tokenizer,
    model=model.to("cuda"),
    train_dataset = data["train"],
    eval_dataset = data["test"],

    # All parameters are described at https://huggingface.co/docs/transformers/en/main_classes/trainer
    args = TrainingArguments(
        max_steps =1, # smoke test: caps training at a single step and takes precedence over num_train_epochs; comment out after the first run
        save_steps = 50, # number of training steps between consecutive checkpoints; check this value suits the data
        num_train_epochs = epochs,
        output_dir = save_dir,
        evaluation_strategy = "steps", # evaluation is done every eval_steps
        do_eval= True,
        eval_steps = 0.2, # a float below 1 is treated as a fraction of the total training steps
        per_device_eval_batch_size = batch_size,
        per_device_train_batch_size = batch_size,
        gradient_accumulation_steps = grad_accum,
        log_level ="debug",
        optim = "adamw_torch", # if quantization
        fp16 = True, # for low-end, non-Ampere GPUs
        #bf16 = True, # for Ampere GPUs only
        max_grad_norm = 0.3, # maximum gradient norm: gradients are clipped to this magnitude so they cannot grow large enough to destabilise training

        # Cosine schedule: the learning rate follows a cosine-shaped curve, decreasing over the steps;
        # intended here to lower validation loss and reduce overfitting.
        lr_scheduler_type = "cosine",
        hub_private_repo = False,

        # Warmup is an initial phase in which the learning rate rises gradually from 0 to its regular value,
        # which helps stabilise early training.
        # Example: with warmup_ratio = 0.03 and 1000 total steps, warmup lasts for the first 30 steps (3% of the total).
        warmup_ratio = 0.03,
        # optim = "adamw_torch", # commented for LoRA+; plain LoRA is used here so it is set above
        learning_rate= 1e-4, # comment out for LoRA+
        report_to="tensorboard",
    ),
    # ,callbacks = [logging_callback], # if custom callback created
    # optimizers = (optimizer, None) # for LoRA+ only
    # neftune_noise_alpha = 5 # add noise to embeddings to improve performance
)

In [None]:
model.config.use_cache = False # for silencing warnings only
trainer.train()

# Plotting

In [None]:
pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

# Split the trainer's log history into training and evaluation records.
# (For a tabular view: import pandas as pd; pd.DataFrame(trainer.state.log_history))
log_history = trainer.state.log_history
train_records = [record for record in log_history if 'loss' in record]
eval_records = [record for record in log_history if 'eval_loss' in record]

# Steps and losses for each curve
train_steps = [record['step'] for record in train_records]
train_losses = [record['loss'] for record in train_records]
eval_steps = [record['step'] for record in eval_records]
eval_losses = [record['eval_loss'] for record in eval_records]

# Plot both loss curves against the training step
fig, ax = plt.subplots()
ax.plot(train_steps, train_losses, label = 'Train Loss')
ax.plot(eval_steps, eval_losses, label = 'eval Loss')
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# Evaluate After Training

In [None]:
# # Can set to true for faster inference
# model.config.use_cache = True

In [None]:
evaluation("base", tokenizer) # use this if trained using adapter