In [None]:
import numpy as np

# =================== Hyperparams ====================
# Model sizes. vocab_size is only a nominal default here; the demo at the end
# of this cell sizes the embedding table from len(vocab) instead.
vocab_size, embedding_dim, max_len = 1000, 32, 16

# Token <-> id lookup tables; both are rebuilt by build_vocab().
vocab = dict()
reverse_vocab = list()

# Seed NumPy's global RNG so every randomly initialised weight is reproducible.
np.random.seed(42)


# =================== Tokenizer ====================
def simple_tokenize(text):
    """Lower-case ``text``, delete every '.' and ',' and split on whitespace.

    Returns:
        list[str]: the resulting words (an empty list for blank input).
    """
    cleaned = text.lower()
    for mark in (".", ","):
        cleaned = cleaned.replace(mark, "")
    return cleaned.split()

def build_vocab(sentences):
    """Rebuild the module-level ``vocab`` and ``reverse_vocab`` from ``sentences``.

    Ids 0-3 are reserved for "[PAD]", "[CLS]", "[SEP]" and "[MASK]"; the
    distinct words produced by ``simple_tokenize`` follow in sorted order,
    starting at id 4. ``reverse_vocab[i]`` is the token whose id is ``i``.

    Args:
        sentences: iterable of raw sentence strings.
    """
    global vocab, reverse_vocab
    unique_words = {word for sentence in sentences for word in simple_tokenize(sentence)}
    ordered_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]"] + sorted(unique_words)
    vocab = {token: idx for idx, token in enumerate(ordered_tokens)}
    # The id of a token is its position in ordered_tokens, so the list itself
    # is already the id -> token table.
    reverse_vocab = ordered_tokens

def encode(s1, s2):
    """Encode a sentence pair as ``[CLS] s1 [SEP] s2 [SEP]`` with exactly ``max_len`` ids.

    Short sequences are right-padded with the "[PAD]" id. Long sequences are
    cut to ``max_len`` and the last kept position is overwritten with "[SEP]",
    so a truncated sequence still ends with the separator instead of stopping
    in the middle of a sentence.

    Args:
        s1: first sentence (raw string).
        s2: second sentence (raw string).

    Returns:
        np.ndarray: 1-D array of ``max_len`` token ids.

    Raises:
        KeyError: if a word is missing from ``vocab`` and ``vocab`` has no
            "[UNK]" entry to fall back on.
    """
    unk_id = vocab.get("[UNK]")

    def to_ids(sentence):
        # Map words to ids, falling back to [UNK] only when the vocab defines it.
        ids_for_sentence = []
        for word in simple_tokenize(sentence):
            if word in vocab:
                ids_for_sentence.append(vocab[word])
            elif unk_id is not None:
                ids_for_sentence.append(unk_id)
            else:
                raise KeyError(
                    f"word {word!r} is not in vocab and vocab has no '[UNK]' token"
                )
        return ids_for_sentence

    sep_id = vocab["[SEP]"]
    ids = [vocab["[CLS]"]] + to_ids(s1) + [sep_id] + to_ids(s2) + [sep_id]

    if len(ids) > max_len:
        ids = ids[:max_len]
        if ids:
            # Keep the closing separator that plain slicing would have dropped.
            ids[-1] = sep_id
    else:
        ids += [vocab["[PAD]"]] * (max_len - len(ids))
    return np.array(ids)


# =================== Positional Encoding ====================
def positional_encoding(max_len, dim):
    """Return the fixed sinusoidal position table of shape ``(max_len, dim)``.

    For position ``p`` and column ``c`` the angle is
    ``p / 10000 ** (2 * (c // 2) / dim)``; even columns store its sine and odd
    columns its cosine.
    """
    positions = np.arange(max_len).reshape(-1, 1)
    columns = np.arange(dim).reshape(1, -1)
    exponents = (2 * (columns // 2)) / np.float32(dim)
    angles = positions * (1 / np.power(10000, exponents))

    table = np.empty_like(angles)
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return table


# =================== Layer Norm ====================
def norm(x, eps=1e-6):
    """Standardise ``x`` along its last axis.

    Each vector has its mean subtracted and is divided by ``std + eps``.
    There is no learnable gain or bias.
    """
    last_axis = dict(axis=-1, keepdims=True)
    centered = x - np.mean(x, **last_axis)
    spread = np.std(x, **last_axis) + eps
    return centered / spread


# =================== Attention ====================
def attention(q, k, v):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    The softmax is taken over the key axis and is computed on max-shifted
    scores, so large scores cannot overflow ``np.exp`` and turn the weights
    into NaN. Only the last two axes of ``k`` are transposed, which lets the
    same code handle both ``(seq, dim)`` and batched ``(..., seq, dim)`` inputs.

    Args:
        q: queries, shape ``(..., n_q, d_k)``.
        k: keys, shape ``(..., n_k, d_k)`` (at least 2-D).
        v: values, shape ``(..., n_k, d_v)``.

    Returns:
        np.ndarray of shape ``(..., n_q, d_v)``.
    """
    dk = q.shape[-1]
    scores = q @ np.swapaxes(k, -1, -2) / np.sqrt(dk)
    # Subtracting the row maximum leaves the softmax unchanged but bounds exp() by 1.
    scores = scores - np.max(scores, axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / np.sum(weights, axis=-1, keepdims=True)
    return weights @ v


# =================== Multi-Head Attention ====================
class MiniMHA:
    """Self-attention block with query/key/value/output projections.

    Despite the name, the features are never split into heads: one attention
    pass runs over the full ``dim``-wide projections.
    """

    def __init__(self, dim):
        """Create four ``(dim, dim)`` projections drawn from N(0, 1) * 0.01."""
        # Drawn in the order Wq, Wk, Wv, Wo; with a seeded global RNG this
        # order determines the values.
        for name in ("Wq", "Wk", "Wv", "Wo"):
            setattr(self, name, np.random.randn(dim, dim) * 0.01)

    def forward(self, x):
        """Project ``x`` to q/k/v, attend, and apply the output projection."""
        q, k, v = (x @ weight for weight in (self.Wq, self.Wk, self.Wv))
        return attention(q, k, v) @ self.Wo


# =================== Feed Forward ====================
class MiniFFN:
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    The hidden layer is twice as wide as the input/output dimension.
    """

    def __init__(self, dim):
        """Initialise weights from N(0, 1) * 0.01 and biases to zero."""
        hidden = dim * 2
        self.W1 = np.random.randn(dim, hidden) * 0.01
        self.b1 = np.zeros(hidden)
        self.W2 = np.random.randn(hidden, dim) * 0.01
        self.b2 = np.zeros(dim)

    def forward(self, x):
        """Apply the two-layer network to the last axis of ``x``."""
        pre_activation = x @ self.W1 + self.b1
        activated = np.maximum(0, pre_activation)  # ReLU
        return activated @ self.W2 + self.b2


# =================== Mini BERT Encoder ====================
class MiniBERTEncoder:
    """One encoder layer: embeddings + positions, then attention and FFN sub-layers.

    Each sub-layer is wrapped in a residual connection followed by ``norm``.
    """

    def __init__(self, vocab_size, dim, max_len):
        """Create the embedding table, the position table and both sub-layers."""
        # Random draws happen in this order: embedding, attention, feed-forward.
        self.embedding = np.random.randn(vocab_size, dim) * 0.01
        self.pos_encoding = positional_encoding(max_len, dim)
        self.mha = MiniMHA(dim)
        self.ffn = MiniFFN(dim)

    def forward(self, input_ids):
        """Return one ``dim``-wide hidden vector per entry of ``input_ids``."""
        seq_len = len(input_ids)
        hidden = self.embedding[input_ids] + self.pos_encoding[:seq_len]
        # Post-norm residual blocks: hidden <- norm(hidden + sublayer(hidden)).
        for sublayer in (self.mha, self.ffn):
            hidden = norm(hidden + sublayer.forward(hidden))
        return hidden


# =================== NSP Head ====================
class NSPHead:
    """Two-way softmax classifier applied to the [CLS] vector (next-sentence prediction)."""

    def __init__(self, dim):
        """Initialise a ``(dim, 2)`` weight from N(0, 1) * 0.01 and a zero bias."""
        self.W = np.random.randn(dim, 2) * 0.01
        self.b = np.zeros(2)

    def forward(self, cls_vec):
        """Return class probabilities for ``cls_vec``.

        Args:
            cls_vec: array of shape ``(dim,)`` or ``(..., dim)``.

        Returns:
            np.ndarray of shape ``(2,)`` or ``(..., 2)``; the last axis sums to 1.
        """
        logits = cls_vec @ self.W + self.b
        # Max-shifted softmax over the class axis: avoids exp() overflow and
        # normalises each row separately when a batch of vectors is passed.
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        exp_logits = np.exp(shifted)
        return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)


# =================== Full BERT NSP Model ====================
class MiniBERTForNSP:
    """Encoder plus NSP head: maps token ids to two class probabilities."""

    def __init__(self, vocab_size, dim, max_len):
        """Build the encoder first, then the classification head."""
        self.encoder = MiniBERTEncoder(vocab_size, dim, max_len)
        self.nsp = NSPHead(dim)

    def forward(self, input_ids):
        """Encode ``input_ids`` and classify the hidden vector at position 0."""
        hidden_states = self.encoder.forward(input_ids)
        # Position 0 holds the [CLS] token when the ids come from encode().
        return self.nsp.forward(hidden_states[0])


# =================== Sample Sentences & Testing ====================
# Tiny corpus; the vocabulary is built from exactly these four sentences.
sentences = [
    "Kucing tidur di atas sofa",
    "Anjing bermain di taman",
    "Langit berwarna biru hari ini",
    "Dia sedang belajar matematika",
]
build_vocab(sentences)

# Pair presented as the correct continuation (IsNext).
s1, s2 = sentences[0], sentences[2]

# Pair presented as a random continuation (NotNext).
s3, s4 = sentences[0], sentences[3]

input_ids_true, input_ids_false = encode(s1, s2), encode(s3, s4)

# The model is used right after random initialisation (no training step in
# this cell), so the printed probabilities only exercise the forward pass.
model = MiniBERTForNSP(vocab_size=len(vocab), dim=embedding_dim, max_len=max_len)

print("Input TRUE:", s1, " + ", s2)
print("NSP Output Prob (IsNext vs NotNext):", model.forward(input_ids_true))

print("\nInput FALSE:", s3, " + ", s4)
print("NSP Output Prob (IsNext vs NotNext):", model.forward(input_ids_false))


In [None]:
# Collect every lower-cased, whitespace-split word from both sentences of each
# dataset triple; the third element of a triple is ignored here.
# NOTE(review): `dataset` is not defined in any cell visible here — confirm it
# exists before this cell runs.
tokens = set()
for s1, s2, _ in dataset:
    tokens |= {*s1.lower().split(), *s2.lower().split()}

tokens = sorted(tokens)

# Words take ids 2, 3, ... in sorted order; 0 and 1 are reserved for
# [PAD] and [CLS], and [SEP] takes the next free id after all of them.
vocab = dict(zip(tokens, range(2, len(tokens) + 2)))
vocab.update({"[PAD]": 0, "[CLS]": 1})
vocab["[SEP]"] = len(vocab)


In [None]:
# Example sentence pair, lower-cased and whitespace-split into word lists.
s1 = "Aku pergi ke toko".lower().split()
s2 = "Lalu aku membeli roti".lower().split()

# Layout: [CLS] s1 [SEP] s2 [SEP]
input_ids = [vocab["[CLS]"]]
input_ids += [vocab[w] for w in s1] + [vocab["[SEP]"]]
input_ids += [vocab[w] for w in s2] + [vocab["[SEP]"]]
max_len = 16

# Force the sequence to exactly max_len ids: cut when too long, pad otherwise.
if len(input_ids) >= max_len:
    input_ids = input_ids[:max_len]
else:
    input_ids.extend([vocab["[PAD]"]] * (max_len - len(input_ids)))

input_ids = np.array(input_ids)


In [None]:
# Run the Mini-BERT encoder built earlier on the encoded sentence pair.
# NOTE(review): `mini_bert_forward` is not defined in any cell visible here —
# confirm it exists before this cell runs.
hidden_states = mini_bert_forward(input_ids)

# Take the output vector of the [CLS] token (first position).
cls_vector = hidden_states[0]


In [None]:
# Width of the [CLS] vector (e.g. 32 or 64, depending on the encoder).
hidden_dim = cls_vector.shape[0]

# Randomly initialised two-way classifier on top of the [CLS] vector.
W = np.random.randn(hidden_dim, 2) * 0.01
b = np.zeros(2)

logits = cls_vector @ W + b

# Softmax, shifted by the largest logit so np.exp cannot overflow; the shift
# cancels out in the ratio, so the probabilities are unchanged.
exp_logits = np.exp(logits - np.max(logits))
probs = exp_logits / np.sum(exp_logits)
pred = np.argmax(probs)


In [None]:
# Rotary position embedding, printed step by step.
# Every consecutive feature pair (j, j+1) of the token at position i is rotated
# by the angle
#     theta = i * 10000 ** (-j / d_model)
# i.e. the token position times the frequency of that pair (j == 2 * pair index).
# NOTE(review): assumes `x` and `result` are (num_tokens, d_model) arrays from an
# earlier cell and that d_model is even — confirm.
for i in range(len(x)):  # one iteration per token, not per feature
    print()
    print(f"Token {i}: {x[i]}")
    for j in range(0, d_model, 2):
        xx = x[i][j]
        yy = x[i][j + 1]
        print(f"Before Pos : (x,y) ({j}, {j+1}): ({xx}, {yy})")

        m = i                            # position of the token in the sequence
        wi = 1 / 10000 ** (j / d_model)  # rotation frequency of this feature pair
        theta = m * wi

        # 2-D rotation of (xx, yy) by theta; both outputs mix xx and yy.
        xx_r = xx * np.cos(theta) - yy * np.sin(theta)
        yy_r = xx * np.sin(theta) + yy * np.cos(theta)

        print(f"After Pos : (x,y) ({j}, {j+1}): ({xx_r}, {yy_r})")

        # Store the rotated pair.
        result[i][j] = xx_r
        result[i][j + 1] = yy_r

    print(f"Token {i} with positional encoding: {result[i]}")


In [None]:
# Format: (sentence, label) — label 1 = positive, 0 = negative
data = [
    ("aku suka banget sama film ini", 1),
    ("filmnya bener-bener membosankan", 0),
    ("aktingnya luar biasa", 1),
    ("ngantuk banget pas nonton", 0),
    ("ceritanya bikin terharu", 1),
    ("gak masuk akal dan jelek", 0),
]


In [None]:
# For every batch element and attention head, print the raw score and the
# softmax weight of each (query token, key token) pair.
# Ids missing from id2w are shown as "[UNK]".
for b in range(batch_size):
    print(f"\n[Batch {b}]")
    sentence_ids = input[b]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    print("Kalimat:", " ".join(sentence_words))
    print("Token IDs:", sentence_ids)

    for h in range(num_heads):
        print(f"\n  Head {h}:")
        for q in range(seq_len):
            # Reuse the words already looked up for this sentence.
            q_word = sentence_words[q]
            print(f"    Query Token {q:2d} [{q_word:<10}]")

            for k in range(seq_len):
                k_word = sentence_words[k]
                score, weight = scores[b, h, q, k], att_weights[b, h, q, k]
                print(f"      ↳ Key Token {k:2d} [{k_word:<10}] | Score: {score:>7.4f} | Softmax: {weight:>7.4f}")

In [None]:
# Two example sentences as token-id rows, plus labels (1 = positive, 0 = negative).
# The trailing zeros are presumably [PAD] ids — confirm against the vocab cell.
inputs = np.array([
    [1, 2, 3, 4, 5, 0, 0, 0, 0, 0],  # "aku suka banget film ini"
    [1, 6, 7, 8, 9, 0, 0, 0, 0, 0],  # "aku benci banget endingnya"
])
labels = np.array([1, 0])

np.random.seed(42)

# Model sizes.
vocab_size, max_len = 20, 10
embedding_dim, ff_hidden = 8, 32
num_heads, head_dim = 2, 4

# Every randn call below consumes the seeded global RNG, so the order of these
# statements determines the resulting values.

# Word embedding table and a randomly initialised position table.
W_embed = np.random.randn(vocab_size, embedding_dim)
pos_embed = np.random.randn(max_len, embedding_dim)

# Per-head query/key/value projections and the shared output projection.
W_q, W_k, W_v = (
    np.random.randn(num_heads, embedding_dim, head_dim) for _ in range(3)
)
W_o = np.random.randn(num_heads * head_dim, embedding_dim)

# Feed-forward network.
W1, b1 = np.random.randn(embedding_dim, ff_hidden), np.zeros(ff_hidden)
W2, b2 = np.random.randn(ff_hidden, embedding_dim), np.zeros(embedding_dim)

# Binary classifier head.
W_cls, b_cls = np.random.randn(embedding_dim, 1), 0



## Fine Tuning Sentiment Analysis

1. data -> sentences and labels
2. vocab: reserve index 0 for [PAD] and index 1 for [CLS]
3. tokenize: split each sentence into tokens, then add [CLS] and [PAD]
4. numeric inputs and labels: sentence, numeric token ids, token dimension, and labels
5. embedding: seq_len, dim_token
6. positional encoding: seq_len, dim_token
7. pe = positional encoding + embedding
8. multi-head attention: batch_size, seq_len, dim_token, num_heads, head_dim
9. feed forward: batch_size, seq_len, dim_token, dim_ffn
10. logits: batch_size, seq_len, num_classes
11. loss: cross entropy
12. update parameters: gradient descent



In [None]:
lr = 1e-2


def norm(x, eps=1e-6):
    """Standardise ``x`` along its last axis (no learnable gain or bias)."""
    mean = np.mean(x, axis=-1, keepdims=True)
    std = np.std(x, axis=-1, keepdims=True)
    return (x - mean) / (std + eps)


# One forward pass and one gradient step per training example.
# Only the classifier head (W_cls, b_cls) is updated; the embeddings, attention
# and feed-forward weights stay at their initial values.
for i in range(len(inputs)):
    x = inputs[i]        # token ids of this example
    y = labels[i]        # ground-truth label

    # ----- EMBEDDING -----
    embed = W_embed[x]                       # (seq_len, embedding_dim)
    x_embed = embed + pos_embed[:len(x)]     # (seq_len, embedding_dim)

    # ----- MHA -----
    # Leading axis h indexes the heads. Padding positions are not masked, so
    # they take part in attention like any other token.
    Q = np.einsum('ij,hjk->hik', x_embed, W_q)
    K = np.einsum('ij,hjk->hik', x_embed, W_k)
    V = np.einsum('ij,hjk->hik', x_embed, W_v)
    att_scores = np.einsum('hij,hkj->hik', Q, K) / np.sqrt(head_dim)
    # Max-shifted softmax over the key axis.
    att_weights = np.exp(att_scores - np.max(att_scores, axis=-1, keepdims=True))
    att_weights /= np.sum(att_weights, axis=-1, keepdims=True)
    att_out = np.einsum('hij,hjk->hik', att_weights, V)
    # Concatenate all heads along the feature axis (works for any num_heads).
    att_concat = np.concatenate(list(att_out), axis=-1)
    att_projected = att_concat @ W_o

    # Add & Norm
    res1 = x_embed + att_projected
    norm1 = norm(res1)

    # ----- FFN -----
    ff = np.maximum(0, norm1 @ W1 + b1) @ W2 + b2

    # Add & Norm
    res2 = norm1 + ff
    norm2 = norm(res2)

    # ----- CLS Token -----
    cls_token = norm2[0]
    logit = cls_token @ W_cls + b_cls
    prob = 1 / (1 + np.exp(-logit))  # sigmoid
    # Binary cross-entropy; the 1e-6 keeps log() away from zero.
    loss = -(y * np.log(prob + 1e-6) + (1 - y) * np.log(1 - prob + 1e-6))

    print(f"[{i}] Prob: {prob}, Label: {y}, Loss: {loss}")

    # ----- BACKPROP (manual gradient descent, classifier head only) -----
    dlogit = prob - y  # derivative of sigmoid + BCE with respect to the logit
    dW_cls = cls_token[:, None] * dlogit  # outer product
    db_cls = dlogit

    # Update weights
    W_cls -= lr * dW_cls
    b_cls -= lr * db_cls
