In [3]:
from IPython.display import Markdown

def display_chat(messages):
    """
    Render a list of chat messages in a Jupyter Notebook as Markdown.

    Each message is a dict with a 'role' and a 'content' entry. The roles
    'system', 'user' and 'assistant' each get their own prefix; any other
    role is reported as unrecognized and its content is not shown.

    Args:
        messages: Iterable of dicts, each providing 'role' and 'content'
            (content is expected to be a string).

    Raises:
        ValueError: If a message has no 'role' or no 'content'.
    """

    rendered = []

    for message in messages:
        role = message.get('role')
        content = message.get('content')
        # Validate before using the content: calling .replace() on a missing
        # (None) content would raise AttributeError and mask this error.
        if role is None or content is None:
            raise ValueError("Each message must have 'role' and 'content'.")
        # Two spaces before a newline make Markdown render a hard line break.
        content = content.replace('\n', '  \n')
        if role == 'system':
            rendered.append(f"**System prompt:** {content}\n\n")
        elif role == 'user':
            rendered.append(f"👤: {content}\n\n")
        elif role == 'assistant':
            rendered.append(f"🤖: {content}\n\n")
        else:
            rendered.append(f"Unrecognized role:{role}\n\n")

    # Display formatted markdown
    display(Markdown("".join(rendered)))

# Model inference with HuggingFace transformers 

We will load a model from HuggingFace and run inference on it.

The example models below are all fine-tuned versions of the first one, Mistral 7B, which is a pretrained model.

## Load the model

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Checkpoint to load; the commented-out lines are alternative choices.
#model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "monology/openinstruct-mistral-7b"
model_name = "openchat/openchat_3.5"
#model_name = "HuggingFaceH4/zephyr-7b-beta"

# Request bfloat16 explicitly: without it the weights are loaded as float32,
# which takes twice the memory.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:21<00:00, 10.71s/it]


#### The model we got is "just" a regular PyTorch `nn.Module` with a bunch of layers.

In [86]:
# Printing the model lists its nested submodules (see the output below).
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

## Prepare inputs for the model using the tokenizer

LLMs expect "tokens" as inputs.
A token is the numeric representation of a group of characters according to a "vocabulary".
It might be a letter, a symbol, a word, an emoji...

In [63]:
# Load the tokenizer for the same checkpoint name used for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Mapping from token string to its integer id.
vocab = tokenizer.get_vocab()
display(Markdown("### Vocabulary size: " + str(len(vocab))))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Vocabulary size: 32002

In [64]:
display(Markdown("## First tokens in the vocabulary"))
# Sort by token id and keep only the four lowest ids before formatting,
# instead of formatting every vocabulary entry and discarding most of them.
print("\n".join(
    f"{token}: {token_id}"
    for token, token_id in sorted(vocab.items(), key=lambda entry: entry[1])[:4]
))
display(Markdown("## Tokens for the first letters"))
# Look the six letters up directly rather than sorting the whole vocabulary;
# iterating "abcdef" already yields them in alphabetical order.
print("\n".join(
    f"{letter}: {vocab[letter]}" for letter in "abcdef" if letter in vocab
))

## First tokens in the vocabulary

<unk>: 0
<s>: 1
</s>: 2
<0x00>: 3


## Tokens for the first letters

a: 28708
b: 28726
c: 28717
d: 28715
e: 28706
f: 28722


In [82]:
display(Markdown("### **Original message**:"))
# Conversation so far: a system prompt followed by the user's request.
messages = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content="Translate 'The lazy dog' to spanish."),
]
display_chat(messages)

### **Original message**:

**System prompt:** You are a helpful assistant.

👤: Translate 'The lazy dog' to spanish.



In [83]:
display(Markdown("## Some message is translated into a list of tokens"))
display(Markdown("### **Raw text**"))
# tokenize=False returns the templated prompt as text rather than token ids.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(inputs)

## Some message is translated into a list of tokens

### **Raw text**

<s>GPT4 Correct System: You are a helpful assistant.<|end_of_turn|>GPT4 Correct User: Translate 'The lazy dog' to spanish.<|end_of_turn|>GPT4 Correct Assistant:


In [84]:
display(Markdown("### **Tokens for the model**:"))
# Apply the chat template again, this time asking for token ids as a
# PyTorch tensor ("pt").
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
# Move the ids to the GPU, where the model was loaded.
inputs = inputs.to("cuda")
print(inputs[0])
# Decode each id on its own to show the text piece behind every token.
print([tokenizer.decode(token_id) for token_id in inputs[0]])

### **Tokens for the model**:

tensor([    1,   420,  6316, 28781,  3198,  3123,  2135, 28747,   995,   460,
          264, 10865, 13892, 28723, 32000,   420,  6316, 28781,  3198,  3123,
         1247, 28747,  4335, 10020,   464,  1014, 17898,  3914, 28742,   298,
        12363,   789, 28723, 32000,   420,  6316, 28781,  3198,  3123, 21631,
        28747], device='cuda:0')
['<s>', 'G', 'PT', '4', 'Cor', 'rect', 'System', ':', 'You', 'are', 'a', 'helpful', 'assistant', '.', '<|end_of_turn|>', 'G', 'PT', '4', 'Cor', 'rect', 'User', ':', 'Trans', 'late', "'", 'The', 'lazy', 'dog', "'", 'to', 'span', 'ish', '.', '<|end_of_turn|>', 'G', 'PT', '4', 'Cor', 'rect', 'Assistant', ':']


In [85]:
# Generate from the tokenized prompt and take the first returned sequence.
response_tokens = model.generate(inputs)[0]
# Skip the first inputs.shape[-1] positions (the prompt length) so that only
# the tokens after the prompt are decoded.
response = tokenizer.decode(
    response_tokens[inputs.shape[-1]:],
    skip_special_tokens=True,
)
# NOTE: this mutates `messages`; re-running the cell appends another
# assistant entry to the conversation.
messages.append(dict(role="assistant", content=response))
display_chat(messages)

**System prompt:** You are a helpful assistant.

👤: Translate 'The lazy dog' to spanish.

🤖: The translation of 'The lazy dog' to Spanish is 'El perro perezoso'.

