## LLAMA MODEL FROM SCARTCH

### 1. Import Dependecies

1. tocrh as matrix operations
2. torch.nn as neural network operations like linear layers, activation functions, etc.
3. torch.nn.functional as functional operations like loss functions, activation functions, etc.
4. torch.optim as optimizer functions like SGD, Adam, etc.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### 2. RMSNorm

- basically like norm layer but without mean subtraction
- ![](https://pbs.twimg.com/media/GCRiqC6aIAAAh1M.png)

In [2]:
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
    
    def forward(self, x):
        rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True))
        x_norm = x / (rms * self.eps)
        return x_norm

In [5]:
x = torch.rand(2,4)
norm = RMSNorm(dim=4)
output = norm(x)

print("Input:\n", x)
print("Output setelah RMSNorm:\n", output)

Input:
 tensor([[0.3969, 0.1416, 0.3920, 0.6198],
        [0.5161, 0.8649, 0.1807, 0.9737]])
Output setelah RMSNorm:
 tensor([[ 938611.4375,  334802.7500,  926952.6250, 1465494.6250],
        [ 730734.1250, 1224642.2500,  255796.4844, 1378712.1250]])


In [6]:
class RotaryEmbedding(nn.Module):
    def __init__(self, dim: int, max_seq_len: int = 2048):
        """
        Process : 
        Compute sinusiodal freqs,
        cache sin and cos
        separate embedding into pairs
        rotate with sin and cos
        combine and return

        Args:
        dim: dimension of the embedding, must be even
        max_seq_len: maximum sequence length, sentences
        """

        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len

        # create freq for rope
        freqs = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        position = torch.arange(max_seq_len).float()
        freqs = torch.outer(position, freqs) # (max_seq_len, dim/2)

        # cache cos and sin
        self.register_buffer('cos_cache', torch.cos(freqs))
        self.register_buffer('sin_cache', torch.sin(freqs))
    
    def forward(self, x, seq_len: int):
        # x: (batch_size, seq_len, dim)
        cos = self.cos_cache[:seq_len].view(seq_len, 1,1, self.dim//2)
        sin = self.sin_cache[:seq_len].view(seq_len, 1,1, self.dim//2)

        # reshape x for rotation
        x_reshape = x.view(*x.shape[:-1], -1, 2)
        x1, x2 = x_reshape[..., 0], x_reshape[..., 1]

        # rotate
        rotated_x1 = x1 * cos - x2 * sin
        rotated_x2 = x1 * sin + x2 * cos

        rotated_x = torch.stack([rotated_x1, rotated_x2], dim=-1)
        return rotated_x.flatten(-2)

In [7]:
batch, seq_len, heads, dim = 1, 5, 2, 8  
x = torch.randn(batch, seq_len, heads, dim)
rope = RotaryEmbedding(dim=dim, max_seq_len=10)
x_rope = rope(x, seq_len=seq_len)
print("Input shape:", x.shape)
print("Output shape setelah RoPE:", x_rope.shape)

Input shape: torch.Size([1, 5, 2, 8])
Output shape setelah RoPE: torch.Size([5, 5, 2, 8])


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGlu(nn.Module):
    def __init__(self, in_features: int, hidden_features: int):
        super().__init__()
        self.gate_proj = nn.Linear(in_features, hidden_features, bias=False)
        self.up_proj = nn.Linear(in_features, hidden_features, bias=False)
        self.down_proj = nn.Linear(hidden_features, in_features, bias=False)

    def forward(self, x):
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        activated = F.silu(gate) * up
        return self.down_proj(activated)

In [25]:
# Tes SwiGLU
x = torch.randn(2, 4)
swiglu = SwiGlu(in_features=4, hidden_features=8)
output = swiglu(x)
print("Output setelah SwiGLU:\n", output)


Output setelah SwiGLU:
 tensor([[-0.0308, -0.0324,  0.0450,  0.0164],
        [ 0.0333, -0.0289, -0.0031,  0.0062]], grad_fn=<MmBackward0>)


In [28]:
# Dummy input and target
x = torch.rand(2, 4, 4)
target = torch.rand(2, 4, 4)

# Model
model = SwiGlu(in_features=4, hidden_features=8)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Loss function
criterion = nn.MSELoss()

# Training step
output = model(x)
loss = criterion(output, target)

optimizer.zero_grad()       # Clear previous gradients
loss.backward()             # Backpropagation
optimizer.step()            # Update weights

print("Loss:", loss.item())

Loss: 0.3757893741130829
