<a href="https://colab.research.google.com/github/Sekyiwaa/llm_course/blob/main/AppledLLMModule_1BatchingInference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# Generation till Completion and Batch Processing

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2

In [27]:
# generation till completion
# so now let's talk about elephant in the room
# llm are talkative, and since we are predicting the next token
# we need to account for the idea that
# 1) llm may finish it's logic or generation of text # with special token <eos>
# 2) we don't care and after specified number of new tokens generated we halt
# the generation, rude not user friendly but necessity

print(tokenizer.eos_token)
print(tokenizer.eos_token_id)

# this should be your first command running after initialization
# so basically we want to keep generating new tokens like before
# and stop generating upon reaching condition 1 or 2

<|endoftext|>
151643


In [28]:
import torch.nn.functional as F

def generate(model,tokenizer,prompt,max_new_tokens=15):
    inputs = tokenizer(prompt, return_tensors="pt")

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    generated_tokens = list()

    for step in range(max_new_tokens):
        with torch.no_grad():
            out = model(input_ids = input_ids,
                        attention_mask = attention_mask)

        logits_last = out.logits[:,-1,:]
        probs = F.softmax(logits_last,dim=-1)

        # so what we do is called greedy decoding, we will pick up the token
        # with highest probability
        next_token_id = torch.argmax(probs,dim=-1)

        token_id_int = next_token_id.item()

        # next token text
        next_token = tokenizer.decode(token_id_int)


        # ok and now pay attention at this moment your next token may say eos
        # or effectively saying I am done predicting next token
        # most important line
        if tokenizer.eos_token_id is not None and token_id_int == tokenizer.eos_token_id:
            break

        generated_tokens.append(next_token)

        # now let's update inputs, since we want to continue generations
        # with newly minted token in context

        # append to context: make shape [1,1], then cat
        next_token_2d = next_token_id.view(1, 1)        # [1, 1]

        input_ids = torch.cat([input_ids, next_token_2d], dim=1)

        attention_mask = torch.cat(
            [attention_mask, torch.ones((1, 1), dtype=attention_mask.dtype)], dim=1
        )

    continuation = "".join(generated_tokens)
    return continuation, prompt + continuation







In [29]:
prompt = "Capital of Texas is "
new_text,full = generate(model,tokenizer,prompt,25)


In [30]:
print(new_text)


100 miles from Austin, Texas. How far is it from Austin, Texas to the capital of Texas?
To determine


In [None]:
# last time you used pipeline
# well now use torch to generate input to help the user understand how next token is generated
# for each token shsow the user other 5 top tokens you skipped or didn't show, not just the best token
# for example input = hello
# output = hello 1st predicted token: world [2nd:john, 3rd:welcome, 4th:good, 5th:buy]

# Example Input = How are you doing?
# Output = 1st predicted token: Hi [2nd: 3rd: 4th: 5th: ]
# Output = 2nd predicted token: I'm [2nd: 3rd: 4th: 5th: ]
# Output = 3rd predicted token: Doing [2nd: 3rd: 4th: 5th: ]
# Output = 4th predicted token: Great [2nd: 3rd: 4th: 5th: ]
# Output = 5th predicted token: ! [2nd: 3rd: 4th: 5th: ]
# Output = 5th predicted token: <eos> [2nd: 3rd: 4th: 5th: ]


In [38]:
import torch.nn.functional as F

def generate(model, tokenizer, prompt): # Removed max_new_tokens parameter
    inputs = tokenizer(prompt, return_tensors="pt")

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    generated_tokens = list()

    print(f"Input: {prompt}")

    # Continue generation until the eos token is generated
    while True:
        with torch.no_grad():
            out = model(input_ids = input_ids,
                        attention_mask = attention_mask)

        logits_last = out.logits[:,-1,:]
        probs = F.softmax(logits_last,dim=-1)

        # Get top 5 tokens and their probabilities
        top_k_probs, top_k_indices = torch.topk(probs, k=5)

        # so what we do is called greedy decoding, we will pick up the token
        # with highest probability
        next_token_id_tensor = top_k_indices[0, 0]
        token_id_int = next_token_id_tensor.item()

        # next token text
        next_token = tokenizer.decode(token_id_int)

        # Decode top 5 tokens and get their probabilities
        top_5_tokens_with_probs = [(tokenizer.decode(idx.item()), prob.item()) for idx, prob in zip(top_k_indices[0], top_k_probs[0])]


        print(f"Step {len(generated_tokens)+1}: Predicted Token: {next_token} [Top 5: {top_5_tokens_with_probs}]")


        # ok and now pay attention at this moment your next token may say eos
        # or effectively saying I am done predicting next token
        # most important line
        if tokenizer.eos_token_id is not None and token_id_int == tokenizer.eos_token_id:
            print("<eos> token generated, stopping.")
            break

        generated_tokens.append(next_token)

        # now let's update inputs, since we want to continue generations
        # with newly minted token in context

        # append to context: make shape [1,1], then cat
        next_token_2d = next_token_id_tensor.view(1, 1)        # [1, 1]

        input_ids = torch.cat([input_ids, next_token_2d], dim=1)

        attention_mask = torch.cat(
            [attention_mask, torch.ones((1, 1), dtype=attention_mask.dtype)], dim=1
        )

    continuation = "".join(generated_tokens)
    return continuation, prompt + continuation

prompt = "Capital of Texas is "
new_text,full = generate(model,tokenizer,prompt) # Removed max_new_tokens argument
print("\nGenerated Text:", new_text)

Input: Capital of Texas is 
Step 1: Predicted Token: 1 [Top 5: [('1', 0.23321795463562012), ('2', 0.15147124230861664), ('3', 0.10503458976745605), ('5', 0.08549241721630096), ('6', 0.08106186240911484)]]
Step 2: Predicted Token: 0 [Top 5: [('0', 0.20056715607643127), ('2', 0.08730699867010117), ('1', 0.08246234804391861), ('5', 0.07479152828454971), ('6', 0.06720951944589615)]]
Step 3: Predicted Token: 0 [Top 5: [('0', 0.3742975890636444), (' miles', 0.10339178144931793), (',', 0.055691592395305634), ('5', 0.050429508090019226), ('2', 0.04000747948884964)]]
Step 4: Predicted Token:  miles [Top 5: [(' miles', 0.3537689447402954), ('0', 0.15094760060310364), (' km', 0.05416981503367424), (' years', 0.045800428837537766), (',', 0.04548215493559837)]]
Step 5: Predicted Token:  from [Top 5: [(' from', 0.21614880859851837), (' north', 0.13268116116523743), (' west', 0.13078369200229645), (' east', 0.1178579181432724), (' away', 0.06823877990245819)]]
Step 6: Predicted Token:  Austin [Top 5:

In [None]:


# Batching Light Version

prompts = [
    "The capital of Texas is Dallas or Austin?",
    "Where McAllen is located",
    "What is a transformer in LLM?"
]

inputs = tokenizer(prompts,return_tensors="pt", padding=True)
# Notice the idea of padding, we have a batch: [batch_size,max_seq_length]
# inputs = tokenizer(prompts,return_tensors="pt") this line is problematic # you can't stack them together
# because of different length
inputs["input_ids"].shape # so we have a 3 prompts which is expected
# but since we have a different length of number of tokens we are getting padding


torch.Size([3, 9])

In [None]:
inputs

{'input_ids': tensor([[   785,   6722,    315,   8257,    374,  18542,    476,  19260,     30],
        [  9064,   4483,  79877,    374,   7407, 151643, 151643, 151643, 151643],
        [  3838,    374,    264,  42578,    304,    444,  10994,     30, 151643]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [None]:
# so notice those zeros in attention mask, so basically,
# with padding = True, we are saying align our inputs
# in the way that we have a matrix, where longest tokenized prompt
# defines it's

In [None]:
tokenizer.padding_side = 'right'


In [None]:
# this brings a problem
# inputs = tokenizer(prompts,return_tensors="pt", padding=True)
# with torch.no_grad():
#     out = model.generate(**inputs)

# out.shape()

In [None]:
#Note:
# by default this one is called right padding but you can change it to left one
tokenizer.padding_side = 'left'  # Change the padding side
inputs = tokenizer(prompts, return_tensors="pt", padding=True)
inputs


{'input_ids': tensor([[   785,   6722,    315,   8257,    374,  18542,    476,  19260,     30],
        [151643, 151643, 151643, 151643,   9064,   4483,  79877,    374,   7407],
        [151643,   3838,    374,    264,  42578,    304,    444,  10994,     30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
tokenizer.padding_side = 'left'
inputs = tokenizer(prompts,return_tensors="pt", padding=True)
with torch.no_grad():
    out = model.generate(**inputs,max_new_tokens=40)

out.shape

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


torch.Size([3, 49])

In [None]:
# how long each prompt is (after left padding)
input_lengths = inputs["attention_mask"].sum(dim=1)  # tensor([L0, L1, L2])
print("input lengths:", input_lengths.tolist())

# how many new tokens each row actually got
gen_lengths = out.size(1) - input_lengths
print("generated lengths:", gen_lengths.tolist())


input lengths: [9, 5, 8]
generated lengths: [40, 44, 41]


In [None]:
# only the new text per row

continuations = []
for i in range(out.size(0)):
    L = int(input_lengths[i])              # length of the i-th prompt
    cont_ids = out[i, L:]                  # tokens generated after the prompt
    cont_ids = cont_ids.tolist()           # ensure CPU list of ints
    text = tokenizer.decode(cont_ids, skip_special_tokens=True).strip() # try to remove skip_special_tokens=True
    continuations.append(text)
    print(f"Row {i} continuation: {text}")

Row 0 continuation: Dallas
Row 1 continuation: McAllen is located, it is a city that is known for its rich history and diverse culture. The city is home to many historic sites, museums, and cultural events that showcase the city's unique heritage. McAllen
Row 2 continuation: ? A transformer is a device that converts alternating current (AC) from one voltage level to another. It is a type of electrical transformer that uses a core made of silicon steel to reduce the magnetic field strength


In [None]:
continuations

['Dallas',
 "McAllen is located, it is a city that is known for its rich history and diverse culture. The city is home to many historic sites, museums, and cultural events that showcase the city's unique heritage. McAllen",
 '? A transformer is a device that converts alternating current (AC) from one voltage level to another. It is a type of electrical transformer that uses a core made of silicon steel to reduce the magnetic field strength']