## Load base model and tokenizer

In [4]:
# Load model directly
from transformers import AutoTokenizer
# dont assign to variable, because it will be larger than memory
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="cuda")

In [39]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="cpu")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# alternatively load tokenizer and model into one variable
from transformers import pipeline
pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
pipe.save_pretrained("llama-2-7b-full-precision-test")

In [5]:
import torch
device = "cpu"
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    device = "cuda"
    print(f"Available cuda device:", torch.cuda.get_device_name(), torch.cuda.get_device_properties(0))

CUDA available: True
Available cuda device: NVIDIA L40S _CudaDeviceProperties(name='NVIDIA L40S', major=8, minor=9, total_memory=45494MB, multi_processor_count=142)


## Check the datatypes of model layers

In [5]:
def check_dtypes(model):
    """
    Return a dictionary of unique dtypes and with the values of the corresponding layers that have this dtype.
    """
    uniq_dtypes = {}
    for layer_name in model.state_dict():
        curr_dtype = str(model.state_dict()[layer_name].dtype)

        if curr_dtype not in uniq_dtypes.keys():
            uniq_dtypes[curr_dtype] = [layer_name]
        else:
            uniq_dtypes[curr_dtype].append(layer_name)

    return uniq_dtypes

dtype_dict = check_dtypes(model)    
dtype_dict.keys()

dict_keys(['torch.float32'])

In [6]:
def memory_footprint(model):
    """
    Return Memory footpring in GB including buffers that do not use gradients
    """
    model.train()
    train_footprint = model.get_memory_footprint() / 1e+9
    model.eval()
    eval_footprint = model.get_memory_footprint() / 1e+9
    return {"train": train_footprint, "eval": eval_footprint}

In [7]:
memory_footprint(model)

{'train': 26.953670912, 'eval': 26.953670912}

## Convert Model Weights to ternary {-1, 0, 1}

In [4]:
from bitmat import convert_hf_model

In [None]:
model = convert_hf_model(model)
model.save_pretrained('llama-2-7b-absmean-ternary-bitmat')

NameError: name 'model' is not defined

In [8]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [9]:
del model
torch.cuda.empty_cache()

In [41]:
from bitmat import Auto158ModelForCausalLM
model = Auto158ModelForCausalLM.from_pretrained("quantized/llama-2-7b-absmean-ternary-bitmat")

In [37]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
# dont assign to variable, because it will be larger than memory
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="cpu")

In [12]:
from transformers import pipeline
quant_pipe = pipeline(tokenizer=tokenizer, model=model)

RuntimeError: Inferring the task automatically requires to check the hub with a model_id defined as a `str`. Llama158ForCausalLM(
  (model): Llama158Model(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x Llama158DecoderLayer(
        (self_attn): Llama158SdpaAttention(
          (q_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (k_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (v_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (o_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (rotary_emb): Llama158RotaryEmbedding()
        )
        (mlp): Llama158MLP(
          (gate_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (up_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (down_proj): BitLinear(
            (norm): RMSLayerNorm()
          )
          (act_fn): SiLU()
        )
      )
    )
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
) is not a valid model_id.

In [5]:
model.device

device(type='cpu')

In [13]:
# model size on CPU
memory_footprint(model)

{'train': 14.139367424, 'eval': 2.806358464}

In [14]:
model_gpu = model.to(device)

In [15]:
# model size on GPU
memory_footprint(model_gpu)

{'train': 14.139367424, 'eval': 2.806358464}

## Check the quantized Model dtypes and quantization weights

In [11]:
import json
qunatized_model_dtypes = check_dtypes(model_gpu)
print(json.dumps(qunatized_model_dtypes, indent=4))

{
    "torch.float32": [
        "model.embed_tokens.weight",
        "model.layers.0.self_attn.q_proj.norm.weight",
        "model.layers.0.self_attn.k_proj.norm.weight",
        "model.layers.0.self_attn.v_proj.norm.weight",
        "model.layers.0.self_attn.o_proj.norm.weight",
        "model.layers.0.mlp.gate_proj.norm.weight",
        "model.layers.0.mlp.up_proj.norm.weight",
        "model.layers.0.mlp.down_proj.norm.weight",
        "model.layers.1.self_attn.q_proj.norm.weight",
        "model.layers.1.self_attn.k_proj.norm.weight",
        "model.layers.1.self_attn.v_proj.norm.weight",
        "model.layers.1.self_attn.o_proj.norm.weight",
        "model.layers.1.mlp.gate_proj.norm.weight",
        "model.layers.1.mlp.up_proj.norm.weight",
        "model.layers.1.mlp.down_proj.norm.weight",
        "model.layers.2.self_attn.q_proj.norm.weight",
        "model.layers.2.self_attn.k_proj.norm.weight",
        "model.layers.2.self_attn.v_proj.norm.weight",
        "model.layers.2.s

In [12]:
int8_layers = len(qunatized_model_dtypes['torch.int8'])
float16_layers = len(qunatized_model_dtypes['torch.float16'])
ratio = float16_layers / int8_layers
print(f"Num quantized layers: \'{int8_layers}\' with \'{float16_layers}\' quantization weight scales. There are '{ratio}' weight scales per quantized layer.")

Num quantized layers: '224' with '224' quantization weight scales. There are '1.0' weight scales per quantized layer.


In [19]:
memory_footprint(model)

{'train': 16.111116288, 'eval': 3.897303488}

## Test the inference time of the model

In [None]:
def measure_latency(model, tokenizer, ttft=False):
    device = "cuda:0"
    
    # input
    input_text = "A test is a"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    # Warm-up
    for _ in range(10):
        _ = model.generate(input_ids, max_new_tokens=1)

    if device.startswith("cuda"):
    # Measure time-to-first-token
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        if ttft:
            start_event.record()
            _ = model.generate(input_ids, max_new_tokens=1)
            end_event.record()
        else:
            start_event.record()
            tokens = model.generate(input_ids)
            end_event.record()
            print(tokens[0])
            print(len(tokens[0]))

            # Wait for GPU operations to complete
        torch.cuda.synchronize()

        latency = start_event.elapsed_time(end_event)  # Latency in milliseconds

    elif device.startswith("cpu"):
        import time
        if ttft:
            start_time = time.perf_counter()
            _ = model.generate(input_ids, max_new_tokens=1)
            end_time = time.perf_counter()
        else:
            start_time = time.perf_counter()
            tokens = model.generate(input_ids)
            end_time = time.perf_counter()
            print(tokens[0])
            print(len(tokens[0]))

        latency = (end_time - start_time) * 1000  # Convert to milliseconds

    else:
        print("Error: wrong device", device)

    if ttft:
        print(f"Time-to-first-token latency: {latency:.2f} ms")
    else:
        latency = latency / len(tokens[0])
        print(f"Latency per Token: {latency:.2f} ms")
    return latency


In [12]:
# meassure latency for the quantized model
from bitmat import Auto158ModelForCausalLM
model = Auto158ModelForCausalLM.from_pretrained("quantized/llama-2-7b-absmean-ternary-bitmat")
model = model.to("cuda:0")
quant_latency_per_token = measure_latency(model, tokenizer, ttft=False)
quant_latency_ttft = measure_latency(model, tokenizer, ttft=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

tensor([    1,   319,  1243,   338,   263,   809, 10579,  4051, 23209, 13056,
         4051,  5015, 16033, 24626, 13305,  3237,  4956, 23393, 16325, 22322],
       device='cuda:0')
20
Latency per Token: 268.40 ms


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

Time-to-first-token latency: 65.86 ms


In [14]:
quant_latencies = {"latency_per_token": 268.40, "latency_ttft": 65.86}
quant_latencies

{'latency_per_token': 268.4, 'latency_ttft': 65.86}

In [17]:
# meassure latency for the baseline model
#del model 
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="cuda:0")

base_latency_ttft = measure_latency(model, tokenizer, ttft=True)
base_latency_per_token = measure_latency(model, tokenizer, ttft=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Time-to-first-token latency: 46.63 ms
tensor([   1,  319, 1243,  ...,  263, 1243,  310], device='cuda:0')
4096
Latency per Token: 52.90 ms


In [18]:
base_latencies = {"latency_per_token": 52.90, "latency_ttft": 46.63}
base_latencies

{'latency_per_token': 52.9, 'latency_ttft': 46.63}

In [19]:
del model