In [1]:
import torch
import time
import gc

from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaForCausalLM,
)

from peft import LoraConfig, PeftModel, get_peft_model
import time
from transformers import logging
logging.set_verbosity_error()

In [2]:
# Checkpoint identifier handed to the tokenizer and model `from_pretrained` calls.
model_name: str = "meta-llama/Llama-3.2-1B-Instruct"

In [3]:
# 4-bit "nf4" quantization settings with double quantization and bfloat16 compute.
# NOTE(review): the model load in this notebook passes quantization_config=None,
# so this object appears to be unused here — confirm whether it is still needed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [4]:
# LoRA adapter settings (rank 16, alpha 16, dropout 0.05, no bias terms),
# applied to the modules listed in `target_modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["gate_proj", "down_proj", "up_proj"],
)

In [5]:
def test(
    model,
    runs=10,
    prompt="This is a long story about Alise in wonderland.\n",
    max_new_tokens=1000,
):
    """Measure greedy-generation throughput of ``model`` on a fixed prompt.

    One untimed warm-up generation is performed first, then ``runs`` timed
    generations with identical settings. The prompt is encoded with the
    notebook-level ``tokenizer`` and placed on the CUDA device.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)`` and returning
            a tensor whose last dimension is the full sequence (prompt plus
            continuation).
        runs: Number of timed generations.
        prompt: Text used as the generation prefix.
        max_new_tokens: Upper bound on tokens generated per call.

    Returns:
        dict with keys ``"time_total"`` (seconds spent in the timed loop),
        ``"generated_tokens"`` (new tokens summed over all timed runs) and
        ``"tokens/sec"`` (their ratio, ``0.0`` if no time elapsed).
    """
    # One shared set of decoding settings so warm-up and timed runs cannot drift apart.
    # do_sample=False / num_beams=1 select greedy decoding; temperature and top_p are
    # kept from the original call (presumably unused under greedy decoding).
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1,
        temperature=0,
        top_p=1.0,
    )

    inp = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    len_inp = inp.shape[-1]
    generated_tokens = 0
    out = None
    try:
        with torch.inference_mode():
            # Warm-up pass: excluded from the timing so one-off start-up costs
            # do not count against the measured runs.
            out = model.generate(inp, **gen_kwargs)

            # CUDA kernels are launched asynchronously; drain the queue so the
            # timer brackets exactly the work of the timed runs.
            torch.cuda.synchronize()
            t_start = time.perf_counter()
            for _ in range(runs):
                out = model.generate(inp, **gen_kwargs)
                generated_tokens += out.shape[-1] - len_inp
            torch.cuda.synchronize()
            t_total = time.perf_counter() - t_start
    finally:
        # Drop tensor references first, then collect, then release cached blocks;
        # runs even when generation fails (e.g. out of memory).
        del inp, out
        gc.collect()
        torch.cuda.empty_cache()

    tokens_per_sec = generated_tokens / t_total if t_total > 0 else 0.0
    return {"time_total": t_total, "generated_tokens": generated_tokens, "tokens/sec": tokens_per_sec}

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    add_eos_token=True,
)
# Reuse the existing EOS token as the padding token (no new token is created),
# and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

---
## Load models

In [7]:
# Number of timed generations per benchmarked model variant.
n_runs: int = 10

In [8]:
# Load the checkpoint without quantization, in bfloat16, on the GPU, using SDPA attention.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="sdpa",
    quantization_config=None,
)
# NOTE(review): `generate` called on a torch.compile wrapper may be dispatched to the
# wrapped module and run its uncompiled forward — verify that compilation actually
# takes effect in the benchmark.
model_original = torch.compile(model)

In [9]:
# Smoke run: one warm-up generation plus a single timed run; the result dict is the cell output.
test(model_original, runs=1)



{'time_total': 9.59500615700017,
 'generated_tokens': 866,
 'tokens/sec': 90.2552834078379}

---
# Tests

In [10]:
# Baseline: compiled base model without any adapter.
res_1 = test(model_original, runs=n_runs)
print("Original:\n", res_1)

Original:
 {'time_total': 96.33005910000065, 'generated_tokens': 8660, 'tokens/sec': 89.89924931957134}


In [11]:
# Wrap the base model with LoRA adapters, then compile the wrapped model.
# NOTE(review): PEFT documents get_peft_model as modifying the base model in place,
# so `model` / `model_original` presumably carry the LoRA layers after this cell;
# re-running the "Original" benchmark afterwards would not measure the plain model.
model_w_adapter = torch.compile(get_peft_model(model, peft_config))

In [12]:
# Same benchmark with the (unmerged) LoRA adapter attached.
res_2 = test(model_w_adapter, runs=n_runs)
print("With Adapter:\n", res_2)

With Adapter:
 {'time_total': 149.31412317300055, 'generated_tokens': 8660, 'tokens/sec': 57.998532328828816}


In [13]:
# Merge the adapter via merge_and_unload(), then compile the result.
# NOTE(review): merge_and_unload is reached through the torch.compile wrapper here;
# presumably it is forwarded to the underlying PEFT model — confirm.
model_merged = torch.compile(model_w_adapter.merge_and_unload())

In [14]:
# Same benchmark after merging the adapter.
res_3 = test(model_merged, runs=n_runs)
print("Merged:\n", res_3)

Merged:
 {'time_total': 102.24086965600145, 'generated_tokens': 8660, 'tokens/sec': 84.70193993006265}


---