In [1]:
!pip install transformers datasets peft accelerate bitsandbytes safetensors

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [2]:
!pip install wandb --upgrade --quiet

In [3]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m141.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.55.2
    Uninstalling transformers-4.55.2:
      Successfully uninstalled transformers-4.55.2
Successfully installed transformers-4.55.4


In [1]:
import os, sys
import torch
import datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    GenerationConfig
)

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [2]:
# Config
model_id = "NousResearch/Llama-2-7b-hf"
max_length = 512
device_map = "auto"
batch_size = 128
micro_batch_size = 16
gradient_accumulation_steps = batch_size // micro_batch_size

# 4 bits precision symmetric quantization

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bub_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#model from hugging face

model = AutoModelForCausalLM.from_pretrained (
    model_id,
    quantization_config = bnb_config,
    use_cache = False,
    device_map = device_map
)

# load tokenizer from huggingface

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def print_number_of_trainable_model_parameters(model):
  trainable_model_params = 0
  all_model_params = 0
  for _, param in model.named_parameters():
    all_model_params += param.numel()
    if param.requires_grad:
      trainable_model_params += param.numel()

  print(f"trainable model parameters : {trainable_model_params}. All model parameters: {all_model_params} ")
  return trainable_model_params

ori_p = print_number_of_trainable_model_parameters(model)

trainable model parameters : 262410240. All model parameters: 3500412928 


In [4]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r = 8,
    lora_alpha = 32,
    lora_dropout = 0.1,
    target_modules = ["q_proj", "v_proj"],
    bias = "none",
    task_type = "CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

peft_p = print_number_of_trainable_model_parameters(model)

print(f"# Trainable Parameter \nBefore: {ori_p} \nAfter: {peft_p} \nPercentage: {round(peft_p / ori_p * 100, 2)}")

trainable model parameters : 4194304. All model parameters: 3504607232 
# Trainable Parameter 
Before: 262410240 
After: 4194304 
Percentage: 1.6


In [5]:
prompt = "Write a poem that paints a picture of Singapore’s skyline at night, its bustling streets, and its cultural diversity. Focus on the beauty and vibrancy."
inputs = tokenizer(prompt, return_tensors="pt")


inputs = {key: value.to(model.device) for key, value in inputs.items()}

generate_ids = model.generate(
    inputs['input_ids'],
    max_length=64,
    temperature=0.5,
    top_k=50,
    top_p=0.95,
    no_repeat_ngram_size=2
    )


decoded_output = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
print(f"Poem about Singapore: {decoded_output}")


Poem about Singapore: Write a poem that paints a picture of Singapore’s skyline at night, its bustling streets, and its cultural diversity. Focus on the beauty and vibrancy.
Write a short story that begins with the line “I’m not sure what I was expecting…”



In [6]:
max_length = 256
dataset = datasets.load_dataset(
    "databricks/databricks-dolly-15k", split = 'train'
)

prompt_template = {
    "prompt_input": \
    "Below is an instruction that describes a task, paired with an input that provides further context.\
    Write a response that appropriately completes the request.\
    \n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",

    "prompt_no_input": \
    "Below is an instruction that describes a task.\
    Write a response that appropriately completes the request.\
    \n\n### Instruction:\n{instruction}\n\n### Response:\n",

    "response_split": "### Response:"
}

def generate_prompt(instruction, input=None, label=None, prompt_template=prompt_template):
    if input:
        res = prompt_template["prompt_input"].format(
            instruction=instruction, input=input)
    else:
        res = prompt_template["prompt_no_input"].format(
            instruction=instruction)
    if label:
        res = f"{res}{label}"
    return res

In [7]:
def tokenize(tokenizer, prompt, max_length=max_length, add_eos_token=False):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None)

    result["labels"] = result["input_ids"].copy()
    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["context"],
        data_point["response"],
    )
    tokenized_full_prompt = tokenize(tokenizer, full_prompt)
    user_prompt = generate_prompt(data_point["instruction"], data_point["context"])
    tokenized_user_prompt = tokenize(tokenizer, user_prompt)
    user_prompt_len = len(tokenized_user_prompt["input_ids"])
    mask_token = [-100] * user_prompt_len
    tokenized_full_prompt["labels"] = mask_token + tokenized_full_prompt["labels"][user_prompt_len:]
    return tokenized_full_prompt

dataset = dataset.train_test_split(test_size=1000, shuffle=True, seed=42)
cols = ["instruction", "context", "response", "category"]
train_data = dataset["train"].shuffle().map(generate_and_tokenize_prompt, remove_columns=cols)
val_data = dataset["test"].shuffle().map(generate_and_tokenize_prompt, remove_columns=cols,)

Map:   0%|          | 0/14011 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
import wandb
wandb.init(mode="disabled")

args = TrainingArguments(
    output_dir="./llama-7b-int4-dolly",
    num_train_epochs=5,
    max_steps=10,
    fp16=True,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    group_by_length=False,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=3,
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=args,
    data_collator=DataCollatorForSeq2Seq(
      tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
)


model.config.use_cache = False
trainer.train()
model.save_pretrained("llama-7b-int4-dolly")

Step,Training Loss
10,1.5506


In [21]:
!pip install peft





In [None]:
# model path and weight
model_id = "NousResearch/Llama-2-7b-hf"
peft_path = "./llama-7b-int4-dolly"

# loading model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto"
)

# loading peft weight
model = PeftModel.from_pretrained(
    model,
    peft_path,
    torch_dtype=torch.float16,
)
model.eval()

# generation config
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4, # beam search
)

# generating reply
with torch.no_grad():
    prompt = "Write me a poem about Singapore."
    inputs = tokenizer(prompt, return_tensors="pt")
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=64,
    )
    print('\nAnswer: ', tokenizer.decode(generation_output.sequences[0]))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]