## Quantize Llama2-7B model and Fine Tune on QA Datasets
An educational notebook for understanding model quantization and how to fine-tune a quantized model on QA datasets.

In [2]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return ``"UTF-8"`` unconditionally.

    Args:
        do_setlocale: Accepted for signature compatibility with
            ``locale.getpreferredencoding``; the value is not used.

    Returns:
        The string ``"UTF-8"``.
    """
    preferred = "UTF-8"
    return preferred


# Replace the stdlib lookup so every later caller in this kernel gets UTF-8.
# NOTE(review): presumably a workaround for environments whose default locale
# is not UTF-8 -- confirm it is still needed.
setattr(locale, "getpreferredencoding", getpreferredencoding)

Installing Libraries first

In [3]:
!pip install transformers torch datasets trl peft BitsAndBytes accelerate tiktoken

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.14.3-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.5.0-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.1/88.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting BitsAndBytes
  Downloading bitsandbytes-0.41.0-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hC

Pass our Hugging Face token to get access to Llama 2

In [4]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook: it would be saved (and shared)
# with the file. Prefer the HF_TOKEN environment variable and fall back to an
# interactive prompt whose input is not echoed or stored in the notebook.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

# Keep the environment variable populated for later code that reads HF_TOKEN.
os.environ['HF_TOKEN'] = hf_token

login(token=hf_token)


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Imports

In [5]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
# Fetch the "train" split of the tatsu-lab/alpaca dataset from the Hugging Face Hub.
train_dataset = load_dataset(
    path="tatsu-lab/alpaca",
    split="train",
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

- Set tokenizer to llama2 7B

In [6]:
# Load the Llama-2 7B tokenizer, capping sequences at 512 tokens.
#
# `padding` and `truncation` are arguments of the tokenizer *call*, not of
# `from_pretrained`, so they had no effect here and were removed; they are
# passed where the tokenizer is actually invoked. `trust_remote_code=True` was
# also dropped: this cell only loads a tokenizer by its Hub id and does not
# need to opt in to executing code from the Hub.
tokenizer = AutoTokenizer.from_pretrained(
        'meta-llama/Llama-2-7b-hf',
        model_max_length=512,
    )
# Reuse the end-of-sequence token for padding so that padded tokenization
# (padding='max_length') has a pad token to fill with.
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Creating encoding pipeline for dataset

In [8]:
def tokenize_and_encode(examples):
    """Tokenize the ``'instruction'`` column of a batch of examples.

    Args:
        examples: A batch exposing an ``'instruction'`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the instructions,
        truncated and padded to the tokenizer's maximum length.
    """
    instructions = examples['instruction']
    encoded = tokenizer(instructions, truncation=True, padding='max_length')
    return encoded


# Run the tokenizer over the whole training split in batches; the new columns
# are added alongside the original ones.
tokenized_datasets = train_dataset.map(function=tokenize_and_encode, batched=True)
tokenized_datasets

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask'],
    num_rows: 52002
})

- Setting bnb_config for quantization, peft_config for parameter efficient fine tuning

In [14]:
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit bitsandbytes loading pins the whole model to GPU 0 below, so fail early
# with a readable message instead of an opaque RuntimeError from deep inside the loader.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected: loading the model in 4-bit with bitsandbytes "
        "requires a GPU runtime."
    )

# Quantization settings: load weights in 4-bit and additionally quantize the
# quantization constants (nested / double quantization).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    #bnb_4bit_quant_type="nf4",
    #bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # testing nested quantization for more memory efficient inference
)


# Load the base model with the quantization config, placing every module on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        quantization_config=bnb_config,
        device_map={"": 0}
    )
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# The model is loaded in 4-bit, so use the k-bit preparation helper rather than
# the deprecated int8-specific one.
model = prepare_model_for_kbit_training(model)


# LoRA adapter settings for parameter-efficient fine-tuning.
peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        # Must be the upper-case task name: the lower-case "causal_lm" is not a
        # recognised task type, so PEFT would silently fall back to the generic
        # PeftModel wrapper instead of the causal-LM one.
        task_type="CAUSAL_LM",
    )
model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: ignored

In [None]:
#to push quantized model to hub
#model.push_to_hub("Llama-2-7b-hf-4bit")

Training on dataset

In [None]:
# Trainer hyperparameters, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir="./llama2-7b-tuned",
    overwrite_output_dir=True,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation schedule.
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    # Precision and data handling.
    fp16=True,
    remove_unused_columns=False,
)


def formatting_prompts_func(example):
    """Render each row of a batch as a single question/answer prompt string.

    Args:
        example: Mapping with parallel ``'instruction'`` and ``'output'``
            sequences (one entry per row of the batch).

    Returns:
        A list with one formatted string per instruction, pairing it with the
        output found at the same position.
    """
    return [
        f"### Question: {question}\n ### Answer: {example['output'][idx]}"
        for idx, question in enumerate(example['instruction'])
    ]

# Marker that separates the question from the answer in each formatted prompt.
# The completion-only collator uses it so the loss is computed on the answer part.
# NOTE(review): the template must tokenize the same way standalone as it does
# inside the full prompt for the collator to locate it -- verify with this tokenizer.
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Train on full "### Question ... ### Answer ..." prompts. Previously the
# formatting function and collator were built but never handed to the trainer,
# and `dataset_text_field="instruction"` made it train on the questions alone.
# The raw dataset is passed because the trainer tokenizes the formatted text itself.
trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
        max_seq_length=1024,
        peft_config=peft_config,
    )

In [None]:
#trainer.train()
#trainer.save_model()
#trainer.push_to_hub()


https://towardsdatascience.com/4-bit-quantization-with-gptq-36b0f4f02c34

https://huggingface.co/docs/transformers/main_classes/quantization#use-nested-quantization-for-more-memory-efficient-inference