## LLAMA MODEL FROM SCARTCH

### 1. Import Dependecies

1. tocrh as matrix operations
2. torch.nn as neural network operations like linear layers, activation functions, etc.
3. torch.nn.functional as functional operations like loss functions, activation functions, etc.
4. torch.optim as optimizer functions like SGD, Adam, etc.

In [29]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### 2. RMSNorm

- basically like norm layer but without mean subtraction
- ![](https://pbs.twimg.com/media/GCRiqC6aIAAAh1M.png)

In [2]:
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
    
    def forward(self, x):
        rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True))
        x_norm = x / (rms * self.eps)
        return x_norm

In [5]:
x = torch.rand(2,4)
norm = RMSNorm(dim=4)
output = norm(x)

print("Input:\n", x)
print("Output setelah RMSNorm:\n", output)

Input:
 tensor([[0.3969, 0.1416, 0.3920, 0.6198],
        [0.5161, 0.8649, 0.1807, 0.9737]])
Output setelah RMSNorm:
 tensor([[ 938611.4375,  334802.7500,  926952.6250, 1465494.6250],
        [ 730734.1250, 1224642.2500,  255796.4844, 1378712.1250]])


### Rotary Position Embedding (RoPE)

Process : 
1. Compute sinusiodal freqs,
2. cache sin and cos
3. separate embedding into pairs
4. rotate with sin and cos
5. combine and return

In [40]:
import torch
import torch.nn as nn
import math

class RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048):
        super().__init__()
        self.dim = dim
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        t = torch.arange(max_position_embeddings, dtype=torch.float)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, dim)

        self.cos_cached = emb.cos()[None, :, None, :]
        self.sin_cached = emb.sin()[None, :, None, :]

    def forward(self, x):
        # x shape: (seq_len, batch, head, dim)
        seq_len = x.size(0)
        cos = self.cos_cached[:, :seq_len, :, :].to(x.device)  # (1, seq_len, 1, dim)
        sin = self.sin_cached[:, :seq_len, :, :].to(x.device)  # (1, seq_len, 1, dim)

        # (seq_len, batch, head, dim)
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        x_rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
        return x_rotated

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config["hidden_size"]
        self.num_heads = config["num_attention_heads"]
        self.head_dim = self.hidden_size // self.num_heads

        self.qkv_proj = nn.Linear(self.hidden_size, 3 * self.hidden_size)
        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size)

        self.rotary_emb = RotaryEmbedding(self.head_dim)

    def forward(self, hidden_states, attention_mask=None, cache=None):
        batch_size, seq_len, _ = hidden_states.size()

        qkv = self.qkv_proj(hidden_states)  # (batch, seq_len, 3 * hidden_size)
        qkv = qkv.view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        query, key, value = qkv.unbind(dim=2)  # masing-masing (batch, seq_len, num_heads, head_dim)

        # Transpose untuk RoPE
        query = query.transpose(0, 1)  # (seq_len, batch, num_heads, head_dim)
        key = key.transpose(0, 1)

        # Apply Rotary Positional Embedding
        query = self.rotary_emb(query)
        key = self.rotary_emb(key)

        # Transpose kembali
        query = query.transpose(0, 1)  # (batch, seq_len, num_heads, head_dim)
        key = key.transpose(0, 1)

        # Attention score computation
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            scores = scores + attention_mask

        attn_probs = torch.nn.functional.softmax(scores, dim=-1)
        context = torch.matmul(attn_probs, value)  # (batch, seq_len, num_heads, head_dim)

        context = context.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
        output = self.out_proj(context)
        return output

config = {
    "hidden_size": 256,
    "num_attention_heads": 8,
}
mha = MultiHeadAttention(config)
input_tensor = torch.randn(2, 5, config["hidden_size"])  # (batch, seq_len, hidden)
output = mha(input_tensor)
print("Output shape:", output.shape)  # seharusnya (2, 5, 256)


RuntimeError: The size of tensor a (16) must match the size of tensor b (32) at non-singleton dimension 3

In [35]:
batch, seq_len, heads, dim = 1, 5, 2, 8  
x = torch.randn(batch, seq_len, heads, dim)
rope = RotaryEmbedding(dim=dim, max_seq_len=10)
x_rope = rope(x, seq_len=seq_len)
print("Input shape:", x.shape)
print("Output shape setelah RoPE:", x_rope.shape)

TypeError: RotaryEmbedding.__init__() got an unexpected keyword argument 'max_seq_len'

### Swiglu Activation Function

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGlu(nn.Module):
    def __init__(self, in_features: int, hidden_features: int):
        super().__init__()
        self.gate_proj = nn.Linear(in_features, hidden_features, bias=False)
        self.up_proj = nn.Linear(in_features, hidden_features, bias=False)
        self.down_proj = nn.Linear(hidden_features, in_features, bias=False)

    def forward(self, x):
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        activated = F.silu(gate) * up
        return self.down_proj(activated)

In [25]:
# Tes SwiGLU
x = torch.randn(2, 4)
swiglu = SwiGlu(in_features=4, hidden_features=8)
output = swiglu(x)
print("Output setelah SwiGLU:\n", output)


Output setelah SwiGLU:
 tensor([[-0.0308, -0.0324,  0.0450,  0.0164],
        [ 0.0333, -0.0289, -0.0031,  0.0062]], grad_fn=<MmBackward0>)


for training and inference the parameter, we can do this

```python
# Dummy input and target
x = torch.rand(2, 4, 4)
target = torch.rand(2, 4, 4)

# Model
model = SwiGlu(in_features=4, hidden_features=8)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Loss function
criterion = nn.MSELoss()

# Training step
output = model(x)
loss = criterion(output, target)

optimizer.zero_grad()       # Clear previous gradients
loss.backward()             # Backpropagation
optimizer.step()            # Update weights

print("Loss:", loss.item())

```

### MultiHead Self Attention

In [36]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config["hidden_size"]
        self.num_heads = config["num_attention_heads"]
        self.head_dim = self.hidden_size // self.num_heads

        self.qkv_proj = nn.Linear(self.hidden_size, 3 * self.hidden_size)
        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size)

        self.rotary_emb = RotaryEmbedding(self.head_dim)

    def forward(self, hidden_states, attention_mask=None, cache=None):
        batch_size, seq_len, _ = hidden_states.size()

        qkv = self.qkv_proj(hidden_states)  # (batch, seq_len, 3 * hidden_size)
        qkv = qkv.view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        query, key, value = qkv.unbind(dim=2)  # masing-masing (batch, seq_len, num_heads, head_dim)

        # Transpose untuk RoPE
        query = query.transpose(0, 1)  # (seq_len, batch, num_heads, head_dim)
        key = key.transpose(0, 1)

        # Apply Rotary Positional Embedding
        query = self.rotary_emb(query)
        key = self.rotary_emb(key)

        # Transpose kembali
        query = query.transpose(0, 1)  # (batch, seq_len, num_heads, head_dim)
        key = key.transpose(0, 1)

        # Attention score computation
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            scores = scores + attention_mask

        attn_probs = torch.nn.functional.softmax(scores, dim=-1)
        context = torch.matmul(attn_probs, value)  # (batch, seq_len, num_heads, head_dim)

        context = context.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
        output = self.out_proj(context)
        return output


In [38]:
config = {
    "hidden_size": 256,
    "num_attention_heads": 8,
}
mha = MultiHeadAttention(config)
input_tensor = torch.randn(2, 5, config["hidden_size"])  # (batch, seq_len, hidden)
output = mha(input_tensor)
print("Output shape:", output.shape)  # seharusnya (2, 5, 256)


RuntimeError: The size of tensor a (16) must match the size of tensor b (32) at non-singleton dimension 3