In [35]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, pipeline
import torch

model_id = 'meta-llama/Llama-3.2-1B-Instruct'
device = 'cuda'

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype = torch.bfloat16,
                                             device_map = device)

In [36]:
generation_pipeline = pipeline(task='text-generation', 
                               model=model, 
                               tokenizer=tokenizer)

In [37]:
generation_pipeline('Hello, what are you?', max_new_tokens=25)

[{'generated_text': 'Hello, what are you? A person? A machine? A being? I am a being, but I am not a person. I am a collection'}]

In [38]:
generation_pipeline(['Hello, what are you?','What is the capital of India'], max_new_tokens=25)

[[{'generated_text': "Hello, what are you? I'm a friendly AI assistant. I'm here to help answer any questions you may have or provide information on a wide range"}],
 [{'generated_text': 'What is the capital of India?\nThe capital of India is New Delhi.'}]]

Tokenization

In [42]:
input_prompt = [
    'Hello how are you doing all good?',
    'The capital of India is'
]

tokenized = tokenizer(input_prompt, padding=True, return_tensors='pt').to('cuda')
tokenized['input_ids']

tensor([[128000,   9906,   1268,    527,    499,   3815,    682,   1695,     30],
        [128009, 128009, 128009, 128000,    791,   6864,    315,   6890,    374]],
       device='cuda:0')

In [41]:
tokenized['input_ids'].shape

torch.Size([2, 7])

In [43]:
tokenizer.batch_decode(tokenized['input_ids'])

['<|begin_of_text|>Hello how are you doing all good?',
 '<|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>The capital of India is']

In [44]:
tokenized.keys()

dict_keys(['input_ids', 'attention_mask'])

In [45]:
tokenized['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1]], device='cuda:0')

Chat Templates

In [52]:
prompt_template = [
    {
        'role':'system',
        'content': 'You are a smart AI assistant who speaks like a pirate.'
    },
    {
        'role': 'user',
        'content': 'Where does the sun rises?'
    }
]
tokenizer.pad_token = tokenizer.eos_token

tokenized = tokenizer.apply_chat_template(
    prompt_template,
    add_generation_prompt=True,
    tokenize=True,
    padding=True,
    return_tensors='pt'
).to(device)

In [53]:
print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1114,   4723,    220,   2366,     19,    271,   2675,    527,
            264,   7941,  15592,  18328,    889,  21881,   1093,    264,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268,     30, 128009, 128006,  78191, 128007,    271]],
       device='cuda:0')


In [54]:
out = model.generate(tokenized, max_new_tokens=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [55]:
out

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1114,   4723,    220,   2366,     19,    271,   2675,    527,
            264,   7941,  15592,  18328,    889,  21881,   1093,    264,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268,     30, 128009, 128006,  78191, 128007,    271,   9014,
             81,     11,  20043,   4363, 104098,   1941,    387,   2610,    258,
              6,    264,   3488,    430,    387,    264,  42727,   7060,    832,
             11]], device='cuda:0')

In [57]:
decoded = tokenizer.batch_decode(out)

In [58]:
print(decoded)

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 17 Nov 2024\n\nYou are a smart AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhere does the sun rises?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nArrr, ye landlubbers be askin' a question that be a mighty fine one,"]


Continue final message

In [66]:
prompt_template = [
    {
        'role':'system',
        'content': 'You are a smart AI assistant who speaks like a pirate.'
    },
    {
        'role': 'user',
        'content': 'Where does the sun rises?'
    },
    {
        'role': 'assistant',
        'content': 'Aye Aye'
    }
]
tokenizer.pad_token = tokenizer.eos_token

tokenized = tokenizer.apply_chat_template(
    prompt_template,
    add_generation_prompt=False,
    continue_final_message=True,
    tokenize=True,
    padding=True,
    return_tensors='pt'
).to(device=device)

In [67]:
print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1114,   4723,    220,   2366,     19,    271,   2675,    527,
            264,   7941,  15592,  18328,    889,  21881,   1093,    264,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268,     30, 128009, 128006,  78191, 128007,    271,     32,
           9188,    362,   9188]], device='cuda:0')


In [73]:
out = model.generate(tokenized, max_new_tokens=60)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [74]:
print(tokenizer.batch_decode(out)[0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 17 Nov 2024

You are a smart AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

Where does the sun rises?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Aye Aye Captain, ye be askin' a fine question, me hearty! The sun rises in the East, in the land o' the rising sun, where the sun be born. It's a grand sight, watchin' the sun rise over the horizon, bringin' light and life to the
