<a href="https://colab.research.google.com/github/Satyake/Open-AI-LLM-Langchain-Projects/blob/main/Zephyr_QLORA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install datasets transformers trl peft accelerate bitsandbytes auto-gptq optimum

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.5 MB/s[

In [2]:
# Never paste access tokens into notebook cells or comments: notebooks get
# committed and shared. notebook_login() asks for the token interactively.
# NOTE(review): any token that was ever committed in this cell must be revoked
# at https://huggingface.co/settings/tokens.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, GPTQConfig
from trl import SFTTrainer




In [4]:
class Config:
  """Single source of truth for every setting of the fine-tuning run.

  All values are plain class attributes; ZephyrTrainer reads them through an
  instance (``self.config``).
  """

  # --- Model and dataset -----------------------------------------------------
  MODEL_ID = "TheBloke/zephyr-7B-alpha-GPTQ"
  DATASET_ID = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
  CONTEXT_FIELD = ""  # not referenced by any code visible in this notebook
  INSTRUCTION_FIELD = "instruction"  # dataset column holding the user query
  TARGET_FIELD = "response"  # dataset column holding the expected answer

  # --- Quantized model loading -----------------------------------------------
  BITS = 4
  DISABLE_EXLLAMA = True
  DEVICE_MAP = "auto"
  USE_CACHE = False

  # --- LoRA adapter ----------------------------------------------------------
  LORA_R = 16
  LORA_ALPHA = 16
  LORA_DROPOUT = 0.05
  BIAS = "none"
  TARGET_MODULES = ["q_proj", "v_proj"]
  TASK_TYPE = "CAUSAL_LM"

  # --- TrainingArguments -----------------------------------------------------
  OUTPUT_DIR = "zephyr-support-chatbot"
  BATCH_SIZE = 8
  GRAD_ACCUMULATION_STEPS = 1
  OPTIMIZER = "paged_adamw_32bit"
  LR = 2e-4
  LR_SCHEDULER = "cosine"
  LOGGING_STEPS = 50
  SAVE_STRATEGY = "epoch"
  NUM_TRAIN_EPOCHS = 1
  # The saved training log reports that max_steps overrides num_train_epochs.
  MAX_STEPS = 250
  FP16 = True
  PUSH_TO_HUB = True

  # --- SFTTrainer ------------------------------------------------------------
  DATASET_TEXT_FIELD = "text"
  MAX_SEQ_LENGTH = 512
  PACKING = False

In [5]:
class ZephyrTrainer:
  """LoRA fine-tuning pipeline for a GPTQ-quantized Zephyr checkpoint.

  Every setting is read from ``Config``. ``train()`` is the entry point: it
  builds the text dataset, loads the quantized model and wraps it with a LoRA
  adapter, runs ``SFTTrainer`` and pushes the result to the Hub.
  """

  def __init__(self):
    """Instantiate the config and load the tokenizer for ``Config.MODEL_ID``."""
    self.config = Config()
    self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
    # Padding reuses the end-of-sequence token.
    self.tokenizer.pad_token = self.tokenizer.eos_token

  def process_data_sample(self, example):
    """Render one dataset row as a single training string.

    Args:
      example: mapping (dict or DataFrame row) that contains the columns named
        by ``Config.INSTRUCTION_FIELD`` and ``Config.TARGET_FIELD``.

    Returns:
      The system prompt, the user instruction and the assistant response
      joined with the ``<|system|>`` / ``<|user|>`` / ``<|assistant|>`` tags.
    """
    return (
        "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n"
        + example[self.config.INSTRUCTION_FIELD]
        + "\n<|assistant|>\n"
        + example[self.config.TARGET_FIELD]
    )

  def create_dataset(self):
    """Load the train split and reduce it to one formatted text column.

    Returns:
      A ``datasets.Dataset`` whose only column is ``Config.DATASET_TEXT_FIELD``.
    """
    frame = load_dataset(self.config.DATASET_ID, split='train').to_pandas()
    text_column = self.config.DATASET_TEXT_FIELD
    source_columns = [self.config.INSTRUCTION_FIELD, self.config.TARGET_FIELD]
    # The bound method is applied row by row; no lambda wrapper is needed.
    frame[text_column] = frame[source_columns].apply(self.process_data_sample, axis=1)
    return Dataset.from_pandas(frame[[text_column]])

  def prepare_model(self):
    """Load the GPTQ checkpoint and wrap it with a LoRA adapter.

    Returns:
      Tuple ``(model, peft_config)``: the adapter-wrapped model and the
      ``LoraConfig`` that was used to build it.
    """
    # `disable_exllama` is deprecated (the loader warns about it); the
    # supported spelling is the inverted flag `use_exllama`.
    gptq_config = GPTQConfig(
        bits=self.config.BITS,
        use_exllama=not self.config.DISABLE_EXLLAMA,
        tokenizer=self.tokenizer,
    )
    model = AutoModelForCausalLM.from_pretrained(
        self.config.MODEL_ID,
        quantization_config=gptq_config,
        device_map=self.config.DEVICE_MAP,
    )

    model.config.use_cache = self.config.USE_CACHE
    model.config.pretraining_tp = 1
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    peft_config = LoraConfig(
        r=self.config.LORA_R,
        lora_alpha=self.config.LORA_ALPHA,
        lora_dropout=self.config.LORA_DROPOUT,
        bias=self.config.BIAS,
        task_type=self.config.TASK_TYPE,
        target_modules=self.config.TARGET_MODULES,
    )

    model = get_peft_model(model, peft_config)
    return model, peft_config

  def set_training_arguments(self):
    """Translate the ``Config`` training settings into ``TrainingArguments``.

    Both ``num_train_epochs`` and ``max_steps`` are forwarded; the saved
    training log states that ``max_steps`` takes precedence.
    """
    return TrainingArguments(
        output_dir=self.config.OUTPUT_DIR,
        per_device_train_batch_size=self.config.BATCH_SIZE,
        gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
        optim=self.config.OPTIMIZER,
        learning_rate=self.config.LR,
        lr_scheduler_type=self.config.LR_SCHEDULER,
        save_strategy=self.config.SAVE_STRATEGY,
        logging_steps=self.config.LOGGING_STEPS,
        num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
        max_steps=self.config.MAX_STEPS,
        fp16=self.config.FP16,
        push_to_hub=self.config.PUSH_TO_HUB,
    )

  def train(self):
    """Run the whole pipeline: data, model, training, then push to the Hub."""
    data = self.create_dataset()
    model, peft_config = self.prepare_model()
    training_args = self.set_training_arguments()
    # NOTE(review): `model` is already adapter-wrapped by prepare_model();
    # passing `peft_config` again looks redundant -- confirm against the
    # installed trl version before removing it.
    sft_trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field=self.config.DATASET_TEXT_FIELD,
        args=training_args,
        tokenizer=self.tokenizer,
        packing=self.config.PACKING,
        max_seq_length=self.config.MAX_SEQ_LENGTH,
    )

    sft_trainer.train()

    sft_trainer.push_to_hub()

In [6]:
# Build the pipeline object; its constructor loads the tokenizer for Config.MODEL_ID.
trainer = ZephyrTrainer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/983 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

In [None]:
# Runs ZephyrTrainer.train(): dataset build, model preparation, SFT training, Hub push.
trainer.train()

Downloading readme:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Map:   0%|          | 0/26872 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss


In [None]:
import gc

import torch
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer

In [None]:
def process_data_sample(example):
    """Build the inference prompt for one user instruction.

    Uses the same system prompt and role tags as the training template in
    ZephyrTrainer.process_data_sample, but ends right after the assistant tag
    so the model generates the response.

    Args:
        example: mapping with an "instruction" key holding the user query.

    Returns:
        The prompt string ending in "<|assistant|>\\n".
    """
    processed_example = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n" + example["instruction"] + "\n<|assistant|>\n"

    return processed_example


# The helper must be defined before it is called, otherwise a fresh
# top-to-bottom run fails with NameError.
inp_str = process_data_sample(
    {
        "instruction": "i have a question about cancelling order {{Order Number}}",
    }
)

In [None]:
# Load the tokenizer from the training output directory. The path comes from
# Config.OUTPUT_DIR instead of a hard-coded absolute "/content/..." path, so it
# always points at the directory the training run wrote to.
tokenizer = AutoTokenizer.from_pretrained(Config.OUTPUT_DIR)

In [None]:
# The last saved run of this cell hit a CUDA out-of-memory error because the
# GPU was already almost full. Best effort: drop unreachable Python objects and
# return cached blocks to the driver before loading the fine-tuned model.
# If the OOM persists, restart the runtime and run only the inference cells.
gc.collect()
torch.cuda.empty_cache()

# Load the adapter saved in the training output directory (Config.OUTPUT_DIR)
# in half precision on the GPU.
model = AutoPeftModelForCausalLM.from_pretrained(
    Config.OUTPUT_DIR,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='cuda',
)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 19.06 MiB is free. Process 30904 has 14.73 GiB memory in use. Of the allocated memory 14.20 GiB is allocated by PyTorch, and 402.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Decoding settings for the demo prompt.
# The keyword was misspelled `temerature`, so the intended temperature of 0.1
# was never set on the real `temperature` field.
# NOTE(review): with top_k=1 only the single most likely token can be sampled,
# so decoding is effectively greedy -- confirm that is intended.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.1,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# `inputs` was never defined anywhere in the notebook: tokenize the prompt
# built earlier and move the tensors to the device the model lives on.
inputs = tokenizer(inp_str, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))