In [177]:
#npip install accelerate

In [178]:
# pip install transformers

In [179]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

## Loading the LLM

### first load the model and its tokenizer.

For that we will first import the classes:


AutoModelForCausalLM and AutoTokenizer. When we want to process a
sentence, we can apply the tokenizer first and then the model in two
separate steps. Or we can create a pipeline object that wraps the two
steps and then apply the pipeline to the sentence.

This is why you'll also import the pipeline class.



The transformers library has two types of model classes:
AutoModelForCausalLM and AutoModelForMaskedLM.

Causal language models represent the decoder-only models that are used
for text generation.


They are described as causal, because to predict the next token, the
model can only attend to the preceding left tokens.


Masked language models represent the encoder-only models that are used
for rich text representation.


They are described as masked, because they are trained to predict a
masked or hidden token in a sequence.


In [180]:
# import the required classes
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [181]:
import torch

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [182]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("models/microsoft/Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "models/microsoft/Phi-3-mini-4k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 108.67it/s]


In [184]:
# Create a pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    # False means to not include the prompt text in the returned text
    max_new_tokens=50,
    do_sample=False,
    # no randomness in the generated text
)

Device set to use cpu


In [185]:
from transformers import pipeline

generator = pipeline("text-generation", model="models/microsoft/Phi-3-mini-4k-instruct")

out = generator(
    "Write an email apologizing to Sarah for the tragic gardening mishap.",
    max_new_tokens=60,
    do_sample=False,
)

print(out[0]["generated_text"])

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 222.22it/s]
Device set to use mps:0


Write an email apologizing to Sarah for the tragic gardening mishap.


Subject: My Deepest Apologies for the Gardening Mishap


Dear Sarah,


I hope this message finds you in good spirits despite the unfortunate events that have unfolded in our shared garden. I am writing to express my sincerest


### **Model Architecture**

In [186]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLUActivation()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_fea

In [187]:
# Get model output - use the full model, not just model.model
# This handles the cache properly
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(input_ids, use_cache=False)

# Get logits (output before softmax)
logits = outputs.logits
print(f"\nLogits shape: {logits.shape}")


Logits shape: torch.Size([1, 5, 32064])


In [188]:
# Get the last token's logits (predictions for next token)
last_token_logits = logits[0, -1, :]
print(f"Last token logits shape: {last_token_logits.shape}")

Last token logits shape: torch.Size([32064])


In [189]:
model.model.embed_tokens

Embedding(32064, 3072, padding_idx=32000)

In [190]:
model.model

Phi3Model(
  (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
  (embed_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-31): 32 x Phi3DecoderLayer(
      (self_attn): Phi3Attention(
        (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
        (rotary_emb): Phi3RotaryEmbedding()
      )
      (mlp): Phi3MLP(
        (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
        (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
        (activation_fn): SiLUActivation()
      )
      (input_layernorm): Phi3RMSNorm()
      (resid_attn_dropout): Dropout(p=0.0, inplace=False)
      (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      (post_attention_layernorm): Phi3RMSNorm()
    )
  )
  (norm): Phi3RMSNorm()
)

In [191]:
model.model.layers[0]

Phi3DecoderLayer(
  (self_attn): Phi3Attention(
    (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
    (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (mlp): Phi3MLP(
    (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
    (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
    (activation_fn): SiLUActivation()
  )
  (input_layernorm): Phi3RMSNorm()
  (resid_attn_dropout): Dropout(p=0.0, inplace=False)
  (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
  (post_attention_layernorm): Phi3RMSNorm()
)

In [164]:
model.model.embed_tokens

Embedding(32064, 3072, padding_idx=32000)

## Generating a Single Token to a Prompt


earlier used the Pipeline object to generate a text response to a prompt. The pipeline provides an abstraction to the underlying process of text generation. Each token in the text is actually generated one by one.

Let's now give the model a prompt and check the first token it will generate.


In [203]:
prompt = "The capital of France is"

In [193]:
# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input IDs: {input_ids}")
print(f"Input shape: {input_ids.shape}")

Input IDs: tensor([[ 450, 7483,  310, 3444,  338]])
Input shape: torch.Size([1, 5])


In [194]:
# Get model output - use the full model, not just model.model
# This handles the cache properly
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(input_ids, use_cache=False)

In [195]:
# Get the last token's logits (predictions for next token)
last_token_logits = logits[0, -1, :]
print(f"Last token logits shape: {last_token_logits.shape}")

Last token logits shape: torch.Size([32064])


In [196]:
# Get the token ID with highest probability
next_token_id = last_token_logits.argmax(-1)
print(f"\nNext token ID: {next_token_id}")

# Decode the token
next_token = tokenizer.decode(next_token_id)
print(f"Next token: '{next_token}'")

# Show the complete sequence
print(f"\nComplete: {prompt}{next_token}")


Next token ID: 3681
Next token: 'Paris'

Complete: The capital of France isParis


In [197]:
# Access model layers manually
print("\n--- Accessing Model Layers Manually ---")

with torch.no_grad():
    # Get hidden states from the base model
    model_output = model.model(
        input_ids,
        use_cache=False,
        return_dict=True
    )

# Last hidden state before lm_head
hidden_states = model_output.last_hidden_state
print(f"Hidden states shape: {hidden_states.shape}")

# Apply lm_head manually
lm_head_output = model.lm_head(hidden_states)
print(f"LM head output shape: {lm_head_output.shape}")

# Get next token (should match previous result)
token_id = lm_head_output[0, -1].argmax(-1)
print(f"Next token (manual): '{tokenizer.decode(token_id)}'")


--- Accessing Model Layers Manually ---
Hidden states shape: torch.Size([1, 5, 3072])
LM head output shape: torch.Size([1, 5, 32064])
Next token (manual): 'Paris'


In [198]:
# Get the shape the output the model before the lm_head
model_output[0].shape

torch.Size([1, 5, 3072])

The first number represents the batch size, which is 1 in this case since we have one prompt. The second number 5 represents the number of tokens. And finally 3072 represents the embedding size (the size of the vector that corresponds to each token).

Let's now get the output of the LM head.

In [199]:
# Get the output of the lm_head
lm_head_output = model.lm_head(model_output[0])

In [200]:
lm_head_output.shape

torch.Size([1, 5, 32064])

The LM head outputs for each token in the input prompt, a vector of size 32064 (vocabulary size).

So there are 5 vectors, each of size 32064. Each vector can be mapped to a probability distribution,
that shows the probability for each token in the vocabulary to come after the given token in the input prompt.

Since we're interested in generating the output token that comes after the last token in the input prompt ("is"), we'll focus on the last vector.

So in the next cell, lm_head_output[0,-1] is a vector of size 32064 from which you can generate the token that come
after ("is").

we can do that by finding the id of the token that corresponds to the highest value in the vector
lm_head_output[0,-1] (using argmax(-1), -1 means across the last axis here).


In [201]:
token_id = lm_head_output[0,-1].argmax(-1)
token_id

tensor(3681)

In [204]:
tokenizer.decode(token_id)

'Paris'