In [1]:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

quant_path = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"

# Load model
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:03<00:00, 10.59it/s]
Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 11.18it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:


# Convert prompt to tokens
prompt_template = """\
<|system|>
</s>
<|user|>
{prompt}</s>
<|assistant|>"""

prompt = "You're standing on the surface of the Earth. "\
        "You walk one mile south, one mile west and one mile north. "\
        "You end up exactly where you started. Where are you?"

tokens = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors='pt'
).input_ids.cuda()

In [3]:
%%time

# Generate output
generation_output = model.generate(
    tokens,
    max_new_tokens = 20,
)

print(generation_output)

tensor([[    1,   523, 28766,  6574, 28766, 28767,    13,     2, 28705,    13,
         28789, 28766,  1838, 28766, 28767,    13,  1976, 28742,   267,  6328,
           356,   272,  5439,   302,   272,  8599, 28723,   995,  2338,   624,
         13677,  6287, 28725,   624, 13677,  7635,   304,   624, 13677,  6120,
         28723,   995,   948,   582,  4668,   970,   368,  2774, 28723,  6926,
           460,   368, 28804,     2, 28705,    13, 28789, 28766,   489, 11143,
         28766, 28767,    13,  1976,   460,   438,   272,  4982,   302,   264,
          9661,   395,   264,   624, 28733, 23881, 13630, 28723,   851,   349,
          1096,   739]], device='cuda:0')
CPU times: user 722 ms, sys: 82.4 ms, total: 804 ms
Wall time: 800 ms


In [4]:
%%time

# Generate output
generation_output = model.generate(
    tokens,
    max_new_tokens = 20,
    num_beams = 2,
)

print(generation_output)

tensor([[    1,   523, 28766,  6574, 28766, 28767,    13,     2, 28705,    13,
         28789, 28766,  1838, 28766, 28767,    13,  1976, 28742,   267,  6328,
           356,   272,  5439,   302,   272,  8599, 28723,   995,  2338,   624,
         13677,  6287, 28725,   624, 13677,  7635,   304,   624, 13677,  6120,
         28723,   995,   948,   582,  4668,   970,   368,  2774, 28723,  6926,
           460,   368, 28804,     2, 28705,    13, 28789, 28766,   489, 11143,
         28766, 28767,    13,  1976,   460,   438,   272,  4982,   302,   264,
          9661,   395,   264,   624, 28733, 23881, 13630, 28723,   851,   349,
          1096,   739]], device='cuda:0')
CPU times: user 459 ms, sys: 27 ms, total: 486 ms
Wall time: 485 ms
