Simply loading the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_name = "meta-llama/Llama-3.2-1B"   # llama 3.1 8B is a scaled up ( in terms of layers) version of this

Tokeniser

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
input_text = """ Apples are red"""
tokenized_inputs = tokenizer(input_text, return_tensors="pt")
tokenized_inputs

{'input_ids': tensor([[128000,   1883,    645,    527,   2579]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

Model

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.2-1B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 128256
}

In [None]:
model_weights = model.state_dict()
for k, v in model_weights.items():
    print(k, v.shape)

model.embed_tokens.weight torch.Size([128256, 2048])
model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])
model.layers.0.self_attn.k_proj.weight torch.Size([512, 2048])
model.layers.0.self_attn.v_proj.weight torch.Size([512, 2048])
model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])
model.layers.0.mlp.gate_proj.weight torch.Size([8192, 2048])
model.layers.0.mlp.up_proj.weight torch.Size([8192, 2048])
model.layers.0.mlp.down_proj.weight torch.Size([2048, 8192])
model.layers.0.input_layernorm.weight torch.Size([2048])
model.layers.0.post_attention_layernorm.weight torch.Size([2048])
model.layers.1.self_attn.q_proj.weight torch.Size([2048, 2048])
model.layers.1.self_attn.k_proj.weight torch.Size([512, 2048])
model.layers.1.self_attn.v_proj.weight torch.Size([512, 2048])
model.layers.1.self_attn.o_proj.weight torch.Size([2048, 2048])
model.layers.1.mlp.gate_proj.weight torch.Size([8192, 2048])
model.layers.1.mlp.up_proj.weight torch.Size([8192, 2048])
model.layers.1.

Model Remaking

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, ff_dim, num_heads, kv_dim):
        super(TransformerModel, self).__init__()
        self.embed_tokens = model_weights['model.embed_tokens.weight'] # here
        self.layers = nn.ModuleList([
            TransformerLayer(embed_dim, ff_dim, num_heads, kv_dim, i)
            for i in range(num_layers)
        ])
        self.norm = model_weights['model.norm.weight'] # nn.LayerNorm(embed_dim)
        self.lm_head = model_weights['lm_head.weight'].T # nn.Linear(embed_dim, vocab_size, bias=False)
        self.eps = 1e-5

    def forward(self, sentence):
        # Embed tokens

        tokenized_output = tokenizer(sentence, return_tensors="pt")
        input_ids = tokenized_output["input_ids"]

        x = (self.embed_tokens)[input_ids]

        # Pass through each layer
        for layer in self.layers:
            x = layer(x)

        # Normalize output
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(variance + self.eps)  # Layer normalization
        x = x * self.norm  # Scale

        # Project to vocabulary size
        logits = x @ self.lm_head
        return logits


In [None]:

class TransformerLayer(nn.Module):
    def __init__(self, embed_dim, ff_dim, num_heads, kv_dim, i):
        super(TransformerLayer, self).__init__()
        self.self_attn = SelfAttention(embed_dim, num_heads, kv_dim, i)
        self.input_layernorm = model_weights[f"model.layers.{i}.input_layernorm.weight"]  # nn.LayerNorm(embed_dim)
        self.post_attention_layernorm = model_weights[f"model.layers.{i}.post_attention_layernorm.weight"] # nn.LayerNorm(embed_dim)
        self.mlp = FeedForward(embed_dim, ff_dim, i)
        self.eps = 1e-5

    def forward(self, x):

        # Self-attention block
        residual = x

        ## input layer normalization
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(variance + self.eps)
        x = self.input_layernorm * x

        x = self.self_attn(x) + residual


        # Feed-forward block
        residual = x

        ## input layer normalization
        mean = x.mean(dim=-1, keepdim=True)  # this means last dimension layer normalization
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(variance + self.eps)
        x = self.input_layernorm * x

        x = self.mlp(x) + residual
        return x

In [None]:
class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, kv_dim, i):
        super(SelfAttention, self).__init__()
        self.num_heads = num_heads
        self.num_heads_kv = num_heads / 4
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.q_proj = model_weights[f"model.layers.{i}.self_attn.q_proj.weight"] #nn.Linear(embed_dim, num_heads * head_dim)  # Projection for Q
        self.k_proj = model_weights[f"model.layers.{i}.self_attn.k_proj.weight"].T #nn.Linear(embed_dim, num_heads_kv * head_dim)  # Projection for K
        self.v_proj = model_weights[f"model.layers.{i}.self_attn.v_proj.weight"].T #nn.Linear(embed_dim, num_heads_kv * head_dim)  # Projection for V
        self.o_proj = model_weights[f"model.layers.{i}.self_attn.o_proj.weight"] #nn.Linear(embed_dim, num_heads * head_dim)  # Output projection

    def forward(self, x):
        bsz, seq_len, embed_dim = x.size()


        # Project Q, K, and V
        q = torch.matmul(x, self.q_proj)  # (bsz, seq_len, num_heads * head_dim)
        k = torch.matmul(x, self.k_proj)  # (bsz, seq_len, num_heads_kv * head_dim)
        v = torch.matmul(x, self.v_proj)  # (bsz, seq_len, num_heads_kv * head_dim)

        # Transpose to get (batch_size, num_heads, seq_len, head_dim)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Scaled dot-product attention
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_output = attn_weights @ v

        # Combine heads and pass through output projection
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, embed_dim)
        return torch.matmul(attn_output, self.o_proj)

In [None]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim, i):
        super(FeedForward, self).__init__()
        self.gate_proj = model_weights[f"model.layers.{i}.mlp.gate_proj.weight"].T #nn.Linear(embed_dim, ff_dim)
        self.up_proj = model_weights[f"model.layers.{i}.mlp.up_proj.weight"].T #nn.Linear(embed_dim, ff_dim)
        self.down_proj = model_weights[f"model.layers.{i}.mlp.down_proj.weight"].T #nn.Linear(ff_dim, embed_dim)

    def forward(self, x):
        # Gated Linear Unit (GLU)
        gate_x = F.gelu(x @ self.gate_proj)
        up_x = x @ self.up_proj
        x = gate_x * up_x
        x = x @ self.down_proj
        return x




In [None]:
# Define model parameters based on layer structure
vocab_size = 128256
embed_dim = 2048
num_layers = 16
ff_dim = 8192
num_heads = 8  # Based on 2048 / 8 = 256 per head
kv_dim = 512  # Separate projection sizes for k and v

# Initialize model
model = TransformerModel(vocab_size, embed_dim, num_layers, ff_dim, num_heads, kv_dim)

# Example
sentence = "apples are "
logits = model.forward(sentence)
print("Logits shape:", logits.shape)  # Expected: (batch_size, seq_len, vocab_size)


token_ids = torch.argmax(logits, dim=-1)  # Shape: [batch_size, seq_len]
decoded_tokens = [tokenizer.decode(ids, skip_special_tokens=True) for ids in token_ids]

print(f"Token IDs: {token_ids}")
print(f"Decoded Tokens: {decoded_tokens}")

Logits shape: torch.Size([1, 5, 128256])
Token IDs: tensor([[17429, 35178,  8350,  6641,   384]], device='cuda:0')
Decoded Tokens: [' nursullafordRED e']
