<a href="https://colab.research.google.com/github/RahulDhanvi/Lab10/blob/master/Llama2_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell escapes (`!`): install third-party packages into the current
# runtime before the imports below are executed.
!pip install bitsandbytes
!pip install datasets
!pip install peft
!pip install trl
!pip install deepspeed
!pip install loralib
!pip install accelerate

import os
from openpyxl import Workbook
import torch
import datasets
from datasets import load_dataset
from transformers import (
  AutoModelForCausalLM,
  AutoTokenizer,
  BitsAndBytesConfig,
  HfArgumentParser,
  TrainingArguments,
  pipeline,
  logging,
  Trainer,
  DataCollatorForLanguageModeling,
  LlamaForCausalLM,
  CodeLlamaTokenizer,
  DataCollatorForSeq2Seq
)
from torch.utils.data import DataLoader
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer, DPOTrainer
from datasets import Dataset, DatasetDict
import pandas as pd
from accelerate import infer_auto_device_map
import random
from copy import deepcopy
import gc
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
import argparse

def _format_llama_prompt(system_prompt: str, question: str, is_api_question: bool) -> str:
  """Return the ``[INST]`` prompt prefix (everything up to the answer) for one question.

  The same prefix is used when building training texts and when prompting the
  fine-tuned model, so both see an identical layout.
  """
  if is_api_question:
    task = "Please answer the following OpenROAD Python API-related questions:\n"
  else:
    task = "Please answer the following general questions related to OpenRoad:\n"
  return "<s>[INST] <<SYS>> " + system_prompt + " <</SYS>> " + task + question + "\n[/INST]\n"


def train_llama(train_set_path: str,
                test_set_path: str,
                save_path: str,
                result_path: str,
                num_api_train_examples: int = 366,
                num_api_test_examples: int = 30,
                ):
  """Fine-tune CodeLlama with LoRA on an OpenROAD QA sheet and record test answers.

  Both spreadsheets are read from a worksheet named ``Sheet`` whose first row is
  a header; column 0 holds the reference answer/code and column 1 the question.

  Args:
    train_set_path: Excel file with the training question/answer pairs.
    test_set_path: Excel file with the evaluation question/answer pairs.
    save_path: Directory that receives the trained adapter; the tokenizer is
      saved under ``save_path + '/tokenizer'`` and trainer output goes to
      ``./<basename of save_path>_log``.
    result_path: Excel file written at the end with the columns prompt,
      reference answer and generated answer.
    num_api_train_examples: Number of leading training rows treated as
      Python-API questions; the remaining rows are general questions.
    num_api_test_examples: Same split point for the test rows.
  """
  model_name = "codellama/CodeLlama-7b-Instruct-hf"
  system_prompt = "You are a tutor specializing in the knowledge of OpenROAD, the open-source EDA tool. You will be asked about general OpenROAD questions and OpenROAD Python API-related questions."

  ################################################################################
  # QLoRA parameters
  ################################################################################
  lora_r = 8            # LoRA attention dimension
  lora_alpha = 16       # Alpha parameter for LoRA scaling
  lora_dropout = 0.1    # Dropout probability for LoRA layers

  ################################################################################
  # bitsandbytes parameters
  ################################################################################
  use_4bit = True                 # Activate 4-bit precision base model loading
  bnb_4bit_quant_type = "nf4"     # Quantization type (fp4 or nf4)
  use_nested_quant = False        # Nested (double) quantization
  # NOTE(review): the compute dtype is bfloat16 while the trainer runs with
  # fp16=True below; confirm this mix is intended for the target GPU.
  compute_dtype = torch.bfloat16

  ################################################################################
  # TrainingArguments parameters
  ################################################################################
  # normpath drops a trailing separator so "dir/" still yields "dir_log".
  output_dir = "./" + os.path.basename(os.path.normpath(save_path)) + "_log"
  num_prompt_train_epochs = 14
  # Enable fp16/bf16 training (set bf16 to True with an A100)
  fp16 = True
  bf16 = False
  per_device_train_batch_size = 1
  gradient_accumulation_steps = 1
  max_grad_norm = 0.3             # Maximum gradient norm (gradient clipping)
  learning_rate = 2e-4            # Initial learning rate
  weight_decay = 0.001
  optim = "paged_adamw_32bit"
  lr_scheduler_type = "constant"
  max_steps = -1                  # -1: train for num_train_epochs instead
  warmup_ratio = 0.03
  group_by_length = True          # Batch sequences of similar length together
  save_steps = 50000
  logging_steps = 25

  ################################################################################
  # SFT parameters
  ################################################################################
  max_seq_length = 2000
  packing = False

  ################################################################################
  # Training data: one formatted text per spreadsheet row
  ################################################################################
  prompt_set = pd.read_excel(train_set_path, sheet_name='Sheet', header=None)
  # The sheet is read without a header, so row 0 is the header row; drop it.
  prompt_set = prompt_set.drop(prompt_set.index[0]).reset_index(drop=True)
  prompt_set = prompt_set.rename(columns={0: "code", 1: "prompt"})

  api_answer_header = "\nBelow is the Python code using the OpenROAD Python APIs\n"
  training_texts = []
  for i, (question, answer) in enumerate(zip(prompt_set["prompt"], prompt_set["code"])):
    is_api_question = i < num_api_train_examples
    text = _format_llama_prompt(system_prompt, question, is_api_question)
    if is_api_question:
      text += api_answer_header
    training_texts.append(text + answer + "\n</s>")
  prompt_dataset = Dataset.from_pandas(pd.DataFrame({"text": training_texts}))
  del prompt_set
  del training_texts
  gc.collect()

  ################################################################################
  # Test data: copy questions and reference answers into the result workbook
  ################################################################################
  test_set = pd.read_excel(test_set_path, sheet_name='Sheet', header=None)
  test_set = test_set.rename(columns={0: "code", 1: "prompt"})
  test_rows = test_set.iloc[1:]   # skip the header row
  test_questions = list(test_rows["prompt"])

  code_test_set = Workbook()
  code_test_set_iter = code_test_set.active
  code_test_set_iter["A1"] = "prompt"
  code_test_set_iter["B1"] = "code from dataset"
  code_test_set_iter["C1"] = "generate by finetuned llama"
  for row, (question, reference) in enumerate(zip(test_questions, test_rows["code"]), start=2):
    code_test_set_iter[f"A{row}"] = question
    code_test_set_iter[f"B{row}"] = reference

  ################################################################################
  # Model, tokenizer and trainer configuration
  ################################################################################
  bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
  )

  peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
      "q_proj",
      "up_proj",
      "o_proj",
      "k_proj",
      "down_proj",
      "gate_proj",
      "v_proj"
    ]
  )

  prompt_training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_prompt_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    logging_dir='./logs/',
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    remove_unused_columns=True,
  )

  # Padding/truncation are per-call tokenizer options and a tokenizer has no
  # device placement, so only the model name is passed here.
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  special_tokens_dict = {"mask_token": "<[mask]>"}
  add_token_list = ["<[begin_of_python]>", "<[end_of_python]>", "<[begin_of_api]>", "<[end_of_api]>", "(", ")", "."]
  tokenizer.add_special_tokens(special_tokens_dict)
  tokenizer.add_tokens(add_token_list)

  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.pad_token_id = tokenizer.eos_token_id
  tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

  print("=========================")
  print(tokenizer.eos_token)
  print("=========================")

  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Actually apply the 4-bit config built above; without it the base model
    # is loaded unquantized even though use_4bit is set.
    quantization_config=bnb_config if use_4bit else None,
    torch_dtype=compute_dtype,
    device_map="balanced_low_0",
  )
  model.config.use_cache = False
  model.config.pretraining_tp = 1
  # Tokens were added to the tokenizer above, so the embedding table must grow.
  model.resize_token_embeddings(len(tokenizer))

  trainer = SFTTrainer(
    model=model,
    train_dataset=prompt_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=prompt_training_arguments,
    packing=packing,
    dataset_batch_size=1,
  )
  print("========================OpenROAD python prompt training=======================")
  trainer.train()
  print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

  ################################################################################
  # Save the adapter and tokenizer, then answer the test questions
  ################################################################################
  new_model = save_path
  trainer.model.save_pretrained(new_model, save_embedding_layers = True, safe_serialization=False, from_pt=True)
  tokenizer.save_pretrained(new_model + '/tokenizer')

  # Switch from training to inference settings: eval mode turns dropout off
  # and the cache avoids re-encoding the whole prefix for every new token.
  model.config.use_cache = True
  model.eval()

  # The former `Truncation = True` argument was dropped: keyword names are
  # case-sensitive, so it was never the tokenizer truncation option.
  pipe = pipeline(task="text-generation",
                  model=model,
                  tokenizer=tokenizer,
                  max_new_tokens=2 * max_seq_length,
                  pad_token_id=tokenizer.eos_token_id,
                  )

  num_test_questions = len(test_questions)
  for i, question in enumerate(test_questions):
    prompt = _format_llama_prompt(system_prompt, question, i < num_api_test_examples)
    result = pipe(prompt)
    # generated_text echoes the prompt; keep only the newly generated part.
    code_test_set_iter[f"C{i + 2}"] = result[0]['generated_text'][len(prompt):]
    print("finish...(%d/%d)" % (i + 1, num_test_questions))
  code_test_set.save(result_path)

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (4

In [None]:
# Fine-tune on the training sheet and evaluate on the held-out test sheet
# (the two file names were previously passed the wrong way round).
train_llama(train_set_path = 'Trainset_not_aug_QA.xlsx',
              test_set_path = 'Testset_not_aug_QA.xlsx',
              save_path = 'llama_finetuned_not_aug_QA_model',
              result_path = 'Testset_not_aug_result_fintuning_QA.xlsx')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

</s>


config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 