# FINE TUNING - SENTIMENT ANALYSIS

| Step | Penjelasan                           | Status |
| :--: | :----------------------------------- | :----: |
|   1  | Bangun Mini-BERT Stack               |    ✅   |
|   2  | Pretraining (Masked LM + NSP)        |   ✅    |
|   3  | Fine-tuning ke task spesifik         |   NOW   |
|   4  | Buat dataset dummy buat latihan      |   🔜   |
|   5  | Build mindset & intuition level dewa |   🔜   |

In [None]:
## DATA

# sentences -> label 1 = positif, 0 = negatif
sentences = [
    ("aku suka banget sama film ini", 1),
    ("filmnya bener-bener membosankan", 0),
    ("aktingnya luar biasa", 1),
    ("ngantuk banget pas nonton", 0),
    ("ceritanya bikin terharu", 1),
    ("gak masuk akal dan jelek", 0),
]

## Vocab

In [141]:
vocab = {}
vocab['[CLS]'] = 1
vocab['[PAD]'] = 0

vocab

{'[CLS]': 1, '[PAD]': 0}

## W2I and I2W

In [142]:
idx = 2

for sentence, labels in sentences:
    for word in sentence.split():
        vocab[word] = idx
        idx += 1
    

w2id = {word : idx for idx, word in enumerate(vocab)}
id2w = {idx : word for word, idx in vocab.items()}

print(w2id)
print()
print(id2w)

{'[CLS]': 0, '[PAD]': 1, 'aku': 2, 'suka': 3, 'banget': 4, 'sama': 5, 'film': 6, 'ini': 7, 'filmnya': 8, 'bener-bener': 9, 'membosankan': 10, 'aktingnya': 11, 'luar': 12, 'biasa': 13, 'ngantuk': 14, 'pas': 15, 'nonton': 16, 'ceritanya': 17, 'bikin': 18, 'terharu': 19, 'gak': 20, 'masuk': 21, 'akal': 22, 'dan': 23, 'jelek': 24}

{1: '[CLS]', 0: '[PAD]', 2: 'aku', 3: 'suka', 15: 'banget', 5: 'sama', 6: 'film', 7: 'ini', 8: 'filmnya', 9: 'bener-bener', 10: 'membosankan', 11: 'aktingnya', 12: 'luar', 13: 'biasa', 14: 'ngantuk', 16: 'pas', 17: 'nonton', 18: 'ceritanya', 19: 'bikin', 20: 'terharu', 21: 'gak', 22: 'masuk', 23: 'akal', 24: 'dan', 25: 'jelek'}


## TOKENIZE

In [143]:
def tokenize(sentence, max_len=10):
    tokens = sentence.lower().split()
    ids = [vocab["[CLS]"]] + [vocab.get(tok, 0) for tok in tokens]
    # Padding
    if len(ids) < max_len:
        ids += [vocab["[PAD]"]] * (max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids

## Input and Lables

In [144]:
import numpy as np

input = []
lables = []

for sentence, labels in sentences:
    ids = tokenize(sentence)
    input.append(ids)
    lables.append(labels)

input = np.array(input)
labels = np.array(lables)

In [145]:
print(input)
print()
print(labels)
print()
print(f'WORD TO ID : {input[0]} | LABEL : {labels[0]}')
words = [id2w[i] for i in input[0]]
print(f'ID to WORD : {words} | LABEL : {labels[0]}')

[[ 1  2  3 15  5  6  7  0  0  0]
 [ 1  8  9 10  0  0  0  0  0  0]
 [ 1 11 12 13  0  0  0  0  0  0]
 [ 1 14 15 16 17  0  0  0  0  0]
 [ 1 18 19 20  0  0  0  0  0  0]
 [ 1 21 22 23 24 25  0  0  0  0]]

[1 0 1 0 1 0]

WORD TO ID : [ 1  2  3 15  5  6  7  0  0  0] | LABEL : 1
ID to WORD : ['[CLS]', 'aku', 'suka', 'banget', 'sama', 'film', 'ini', '[PAD]', '[PAD]', '[PAD]'] | LABEL : 1


## Embedding

In [146]:
dim = 4

np.random.seed(4)
embedding_matrix = np.random.randn(len(vocab)+1, dim)
embedding_matrix

array([[ 0.0506,  0.5   , -0.9959,  0.6936],
       [-0.4183, -1.5846, -0.6477,  0.5986],
       [ 0.3323, -1.1475,  0.6187, -0.088 ],
       [ 0.4251,  0.3323, -1.1568,  0.351 ],
       [-0.6069,  1.547 ,  0.7233,  0.0461],
       [-0.983 ,  0.0544,  0.1599, -1.2089],
       [ 2.2234,  0.3943,  1.6924, -1.1128],
       [ 1.6357, -1.361 , -0.6512,  0.5425],
       [ 0.048 , -2.3581, -1.1056,  0.8378],
       [ 2.0879,  0.9148, -0.2762,  0.7965],
       [-1.1438,  0.5099, -1.3475, -0.0094],
       [-0.1307,  0.8021, -0.303 ,  1.202 ],
       [-0.1967,  0.8365,  0.7866, -1.8409],
       [ 0.0375,  0.0359, -0.7787,  0.1794],
       [-1.4555,  0.5562,  0.5098,  0.3004],
       [ 2.4766,  0.3523,  0.0675, -0.7323],
       [ 0.2971, -0.9618,  1.2718, -0.6476],
       [ 0.1585,  1.9901,  1.1642,  0.2427],
       [ 1.3799, -0.0546,  0.7952,  0.0191],
       [-0.9054,  0.4303,  0.9347, -0.3461],
       [-1.0971, -0.5282, -2.3798, -0.6077],
       [-1.0753,  2.0224, -0.5649, -1.5429],
       [ 0

In [147]:
input_embedding = embedding_matrix[input]
input_embedding

array([[[-0.4183, -1.5846, -0.6477,  0.5986],
        [ 0.3323, -1.1475,  0.6187, -0.088 ],
        [ 0.4251,  0.3323, -1.1568,  0.351 ],
        [ 2.4766,  0.3523,  0.0675, -0.7323],
        [-0.983 ,  0.0544,  0.1599, -1.2089],
        [ 2.2234,  0.3943,  1.6924, -1.1128],
        [ 1.6357, -1.361 , -0.6512,  0.5425],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936]],

       [[-0.4183, -1.5846, -0.6477,  0.5986],
        [ 0.048 , -2.3581, -1.1056,  0.8378],
        [ 2.0879,  0.9148, -0.2762,  0.7965],
        [-1.1438,  0.5099, -1.3475, -0.0094],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936],
        [ 0.0506,  0.5   , -0.9959,  0.6936]],

       [[-0.4183, -1.5846, -0.6477,  0.5986],
        [-0.1307,  0.8021, -0.

In [148]:
for i, (sentence_ids, label) in enumerate(zip(input, labels)):
    print(f"Kalimat ke-{i+1} (Label: {label}):")
    
    # Ambil kata aslinya dari id
    words = [id2w[id] for id in sentence_ids]

    for j, (token, idx) in enumerate(zip(words, sentence_ids)):
        emb = embedding_matrix[idx]
        token_display = token if token != "[PAD]" else "[PAD]".ljust(10)
        print(f"  Token {j+1:2}: {token_display:<10} | ID: {idx:<2} | Embedding: {np.round(emb, 3)}")

Kalimat ke-1 (Label: 1):
  Token  1: [CLS]      | ID: 1  | Embedding: [-0.418 -1.585 -0.648  0.599]
  Token  2: aku        | ID: 2  | Embedding: [ 0.332 -1.147  0.619 -0.088]
  Token  3: suka       | ID: 3  | Embedding: [ 0.425  0.332 -1.157  0.351]
  Token  4: banget     | ID: 15 | Embedding: [ 2.477  0.352  0.067 -0.732]
  Token  5: sama       | ID: 5  | Embedding: [-0.983  0.054  0.16  -1.209]
  Token  6: film       | ID: 6  | Embedding: [ 2.223  0.394  1.692 -1.113]
  Token  7: ini        | ID: 7  | Embedding: [ 1.636 -1.361 -0.651  0.542]
  Token  8: [PAD]      | ID: 0  | Embedding: [ 0.051  0.5   -0.996  0.694]
  Token  9: [PAD]      | ID: 0  | Embedding: [ 0.051  0.5   -0.996  0.694]
  Token 10: [PAD]      | ID: 0  | Embedding: [ 0.051  0.5   -0.996  0.694]
Kalimat ke-2 (Label: 0):
  Token  1: [CLS]      | ID: 1  | Embedding: [-0.418 -1.585 -0.648  0.599]
  Token  2: filmnya    | ID: 8  | Embedding: [ 0.048 -2.358 -1.106  0.838]
  Token  3: bener-bener | ID: 9  | Embedding: [ 2.

## POSITIONAL ENCODING

In [149]:
max_len = 10

pos = np.arange(max_len)[:, np.newaxis]  # (10, 1)
i = np.arange(dim)[np.newaxis, :]        # (1, 8)

print("Posisi (pos):")
print(pos)
print("\nDimensi (i):")
print(i)

# Hitung angle rates
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / dim)
print("\nAngle Rates (1 / 10000^(2i/dim)):")
np.set_printoptions(precision=4, suppress=True)
print(angle_rates)

# Hitung angle radians
angle_rads = pos * angle_rates
print("\nAngle Radians (pos * angle_rates):")
print(angle_rads)

# Terapkan sin ke index genap dan cos ke index ganjil
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # even
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # odd

print("\nPositional Encoding (final):")
for idx, row in enumerate(angle_rads):
    print(f"Pos {idx:2d}: {row}")

Posisi (pos):
[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]

Dimensi (i):
[[0 1 2 3]]

Angle Rates (1 / 10000^(2i/dim)):
[[1.   1.   0.01 0.01]]

Angle Radians (pos * angle_rates):
[[0.   0.   0.   0.  ]
 [1.   1.   0.01 0.01]
 [2.   2.   0.02 0.02]
 [3.   3.   0.03 0.03]
 [4.   4.   0.04 0.04]
 [5.   5.   0.05 0.05]
 [6.   6.   0.06 0.06]
 [7.   7.   0.07 0.07]
 [8.   8.   0.08 0.08]
 [9.   9.   0.09 0.09]]

Positional Encoding (final):
Pos  0: [0. 1. 0. 1.]
Pos  1: [0.8415 0.5403 0.01   1.    ]
Pos  2: [ 0.9093 -0.4161  0.02    0.9998]
Pos  3: [ 0.1411 -0.99    0.03    0.9996]
Pos  4: [-0.7568 -0.6536  0.04    0.9992]
Pos  5: [-0.9589  0.2837  0.05    0.9988]
Pos  6: [-0.2794  0.9602  0.06    0.9982]
Pos  7: [0.657  0.7539 0.0699 0.9976]
Pos  8: [ 0.9894 -0.1455  0.0799  0.9968]
Pos  9: [ 0.4121 -0.9111  0.0899  0.996 ]


## EMBEDDING + POSITIONAL ENCODING

In [150]:
# Tambahkan positional encoding
pe = input_embedding + angle_rads[np.newaxis, :input_embedding.shape[1], :]

pe

array([[[-0.4183, -0.5846, -0.6477,  1.5986],
        [ 1.1737, -0.6072,  0.6287,  0.912 ],
        [ 1.3344, -0.0839, -1.1368,  1.3508],
        [ 2.6177, -0.6376,  0.0975,  0.2673],
        [-1.7398, -0.5992,  0.1999, -0.2097],
        [ 1.2644,  0.678 ,  1.7423, -0.1141],
        [ 1.3563, -0.4008, -0.5913,  1.5407],
        [ 0.7075,  1.2539, -0.926 ,  1.6911],
        [ 1.0399,  0.3545, -0.916 ,  1.6904],
        [ 0.4627, -0.4112, -0.906 ,  1.6896]],

       [[-0.4183, -0.5846, -0.6477,  1.5986],
        [ 0.8895, -1.8178, -1.0956,  1.8378],
        [ 2.9972,  0.4987, -0.2562,  1.7963],
        [-1.0027, -0.4801, -1.3175,  0.9902],
        [-0.7062, -0.1537, -0.9559,  1.6928],
        [-0.9084,  0.7836, -0.9459,  1.6923],
        [-0.2289,  1.4601, -0.9359,  1.6918],
        [ 0.7075,  1.2539, -0.926 ,  1.6911],
        [ 1.0399,  0.3545, -0.916 ,  1.6904],
        [ 0.4627, -0.4112, -0.906 ,  1.6896]],

       [[-0.4183, -0.5846, -0.6477,  1.5986],
        [ 0.7108,  1.3424, -0.

In [221]:
# Tampilkan hasil
# Tampilkan Projection Input terhadap Q , K dan V untuk setiap token pada setiap batch untuk setiap head

for batch_idx, (sentence_ids, sentence_pe) in enumerate(zip(input, pe)):
    print(f"\n[Batch {batch_idx}]")
    
    # Ambil kalimat asli dari ID
    sentence_ids = input[batch_idx]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    print(f"Kalimat {batch_idx} :", " ".join(sentence_words))
    print("Token IDs :", sentence_ids)
    
    for pos, (token_id, token_pe) in enumerate(zip(sentence_ids, sentence_pe)):
        word = id2word.get(token_id, "[UNK]")
        print(f"Pos {pos:2d} | Word: {word:<10} | ID: {token_id:<2d} | Embedding + PE: {token_pe}")


[Batch 0]
Kalimat 0 : [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Token IDs : [ 1  2  3 15  5  6  7  0  0  0]
Pos  0 | Word: [CLS]      | ID: 1  | Embedding + PE: [-0.4183 -0.5846 -0.6477  1.5986]
Pos  1 | Word: aku        | ID: 2  | Embedding + PE: [ 1.1737 -0.6072  0.6287  0.912 ]
Pos  2 | Word: suka       | ID: 3  | Embedding + PE: [ 1.3344 -0.0839 -1.1368  1.3508]
Pos  3 | Word: banget     | ID: 15 | Embedding + PE: [ 2.6177 -0.6376  0.0975  0.2673]
Pos  4 | Word: sama       | ID: 5  | Embedding + PE: [-1.7398 -0.5992  0.1999 -0.2097]
Pos  5 | Word: film       | ID: 6  | Embedding + PE: [ 1.2644  0.678   1.7423 -0.1141]
Pos  6 | Word: ini        | ID: 7  | Embedding + PE: [ 1.3563 -0.4008 -0.5913  1.5407]
Pos  7 | Word: [PAD]      | ID: 0  | Embedding + PE: [ 0.7075  1.2539 -0.926   1.6911]
Pos  8 | Word: [PAD]      | ID: 0  | Embedding + PE: [ 1.0399  0.3545 -0.916   1.6904]
Pos  9 | Word: [PAD]      | ID: 0  | Embedding + PE: [ 0.4627 -0.4112 -0.906   1.6896]

[Batch 1

## MULTIHEAD SELF ATTENTION

In [152]:
# Parameters
batch_size, seq_len, embed_dim = pe.shape
num_heads = 4
head_dim = embed_dim // num_heads

print("batch_size:", batch_size)
print("seq_len   :", seq_len)
print("embed_dim :", embed_dim)
print("num_heads :", num_heads)
print("head_dim  :", head_dim)

batch_size: 6
seq_len   : 10
embed_dim : 4
num_heads : 4
head_dim  : 1


In [198]:
# Weights
wq = np.random.randn(num_heads, embed_dim, head_dim)
wk = np.random.randn(num_heads, embed_dim, head_dim)
wv = np.random.randn(num_heads, embed_dim, head_dim)
wo = np.random.randn(num_heads * head_dim, embed_dim)

In [199]:
print("wq shape:", wq.shape)
print("wk shape:", wk.shape)
print("wv shape:", wv.shape)
print("wo shape:", wo.shape)

wq shape: (4, 4, 1)
wk shape: (4, 4, 1)
wv shape: (4, 4, 1)
wo shape: (4, 4)


In [200]:
# Proyeksi Q, K, V
# hasil: (batch, head, seq_len, head_dim)
Q = np.einsum('bse,hed->bhsd', pe, wq)
K = np.einsum('bse,hed->bhsd', pe, wk)
V = np.einsum('bse,hed->bhsd', pe, wv)

In [201]:
print("wq shape:", Q.shape)
print("wk shape:", K.shape)
print("wv shape:", V.shape)

wq shape: (6, 4, 10, 1)
wk shape: (6, 4, 10, 1)
wv shape: (6, 4, 10, 1)


In [202]:
# Tampilkan Projection Input terhadap Q , K dan V untuk setiap token pada setiap batch untuk setiap head
for b in range(batch_size):
    print(f"\n[Batch {b}]")
    
    # Ambil kalimat asli dari ID
    sentence_ids = input[b]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    print("Kalimat {b}  :", " ".join(sentence_words))
    print("Token IDs :", sentence_ids)
    
    for h in range(num_heads):
        print(f"\nHead {h} : ")
        for t in range(seq_len):
            word_id = sentence_ids[t]
            word = id2w.get(word_id, "[UNK]")
            att_vector_q = Q[b, h, t]  # shape: (head_dim,)
            att_vector_k = K[b, h, t]  # shape: (head_dim,)
            att_vector_v = V[b, h, t]  # shape: (head_dim,)
            pe_input = pe[b, t]  # shape: (seq_len, head_dim)
            
            print(f"Token {t:2d} | Word: {word:<10} | ID: {word_id:<2d} | PE Input: {pe_input}| Attention Vector Q (dim={head_dim}): {att_vector_q} | K (dim={head_dim}): {att_vector_k} | V (dim={head_dim}): {att_vector_v}")


[Batch 0]
Kalimat {b}  : [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Token IDs : [ 1  2  3 15  5  6  7  0  0  0]

Head 0 : 
Token  0 | Word: [CLS]      | ID: 1  | PE Input: [-0.4183 -0.5846 -0.6477  1.5986]| Attention Vector Q (dim=1): [1.7251] | K (dim=1): [-1.3881] | V (dim=1): [1.9028]
Token  1 | Word: aku        | ID: 2  | PE Input: [ 1.1737 -0.6072  0.6287  0.912 ]| Attention Vector Q (dim=1): [1.3113] | K (dim=1): [-1.9452] | V (dim=1): [1.6186]
Token  2 | Word: suka       | ID: 3  | PE Input: [ 1.3344 -0.0839 -1.1368  1.3508]| Attention Vector Q (dim=1): [1.0459] | K (dim=1): [-1.7432] | V (dim=1): [0.1249]
Token  3 | Word: banget     | ID: 15 | PE Input: [ 2.6177 -0.6376  0.0975  0.2673]| Attention Vector Q (dim=1): [1.1845] | K (dim=1): [-1.4205] | V (dim=1): [-0.7512]
Token  4 | Word: sama       | ID: 5  | PE Input: [-1.7398 -0.5992  0.1999 -0.2097]| Attention Vector Q (dim=1): [0.6314] | K (dim=1): [1.1702] | V (dim=1): [0.6748]
Token  5 | Word: film       | ID: 6

In [203]:
# Scaled dot-product attention
# (b, h, s, s)
scores = np.einsum('bhsd,bhtd->bhst', Q, K) / np.sqrt(head_dim)

# Softmax
scores -= np.max(scores, axis=-1, keepdims=True)
att_weights = np.exp(scores)
att_weights /= np.sum(att_weights, axis=-1, keepdims=True)
print(att_weights.shape)

(6, 4, 10, 10)


In [204]:
for b in range(batch_size):
    print(f"\n[Batch {b}]")
    
    # Ambil kalimat dan ID
    sentence_ids = input[b]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    print("Kalimat:", " ".join(sentence_words))
    print("Token IDs:", sentence_ids)
    
    for h in range(num_heads):
        print(f"\n  Head {h}:")
        for t in range(seq_len):
            word_id = sentence_ids[t]
            word = id2w.get(word_id, "[UNK]")
            pe_input = pe[b, t]  # (embed_dim,)
            att_vector_q = Q[b, h, t]  # (head_dim,)
            att_vector_k = K[b, h, t]
            att_vector_v = V[b, h, t]

            print(f"\n    Token {t:2d} | Word: {word:<10} | ID: {word_id:<2d}")
            print(f"      PE Input       : {pe_input}")
            print(f"      Q (dim={head_dim}): {att_vector_q}")
            print(f"      K (dim={head_dim}): {att_vector_k}")
            print(f"      V (dim={head_dim}): {att_vector_v}")
            print(f"      Attention to:")

            for k in range(seq_len):
                key_word = id2w.get(sentence_ids[k], "[UNK]")
                score = scores[b, h, t, k]
                weight = att_weights[b, h, t, k]
                print(f"        ↳ Token {k:2d} [{key_word:<10}] | Score: {score:>7.4f} | Softmax: {weight:>7.4f}")


[Batch 0]
Kalimat: [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Token IDs: [ 1  2  3 15  5  6  7  0  0  0]

  Head 0:

    Token  0 | Word: [CLS]      | ID: 1 
      PE Input       : [-0.4183 -0.5846 -0.6477  1.5986]
      Q (dim=1): [1.7251]
      K (dim=1): [-1.3881]
      V (dim=1): [1.9028]
      Attention to:
        ↳ Token  0 [[CLS]     ] | Score: -4.4135 | Softmax:  0.0115
        ↳ Token  1 [aku       ] | Score: -5.3745 | Softmax:  0.0044
        ↳ Token  2 [suka      ] | Score: -5.0261 | Softmax:  0.0062
        ↳ Token  3 [banget    ] | Score: -4.4693 | Softmax:  0.0109
        ↳ Token  4 [sama      ] | Score:  0.0000 | Softmax:  0.9481
        ↳ Token  5 [film      ] | Score: -4.9265 | Softmax:  0.0069
        ↳ Token  6 [ini       ] | Score: -5.8603 | Softmax:  0.0027
        ↳ Token  7 [[PAD]     ] | Score: -6.2972 | Softmax:  0.0017
        ↳ Token  8 [[PAD]     ] | Score: -6.0571 | Softmax:  0.0022
        ↳ Token  9 [[PAD]     ] | Score: -5.1659 | Softmax:  0

In [206]:
# Attention output: (b, h, s, d)
att_out = np.einsum('bhst,bhtd->bhsd', att_weights, V)

# Concatenate heads: (b, s, h*d)
att_concat = att_out.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, -1)

# Output projection: (b, s, embed_dim)
att_projected = np.einsum('bsd,df->bsf', att_concat, wo)

print("Multi-Head Attention Output Shape:", att_projected.shape)

Multi-Head Attention Output Shape: (6, 10, 4)


In [208]:
for b in range(batch_size):
    print(f"\n[Batch {b}]")
    sentence_ids = input[b]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    print("Kalimat:", " ".join(sentence_words))
    print("Token IDs:", sentence_ids)

    for t in range(seq_len):
        word_id = sentence_ids[t]
        word = id2w.get(word_id, "[UNK]")
        pe_input = pe[b, t]                      # (embed_dim,)
        concat_output = att_concat[b, t]         # (num_heads * head_dim,)
        final_output = att_projected[b, t]       # (embed_dim,)

        print(f"\nToken {t:2d} | Word: {word:<10}")
        print(f"  ➤ PE Input         : {pe_input}")
        
        # Output dari masing-masing head
        for h in range(num_heads):
            att_out_vector = att_out[b, h, t]    # (head_dim,)
            print(f"  ➤ Head {h:<2d} Output  : {att_out_vector}")
        
        print(f"  ➤ Concat Output    : {concat_output}")
        print(f"  ➤ Final Projection : {final_output}")


[Batch 0]
Kalimat: [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Token IDs: [ 1  2  3 15  5  6  7  0  0  0]

Token  0 | Word: [CLS]     
  ➤ PE Input         : [-0.4183 -0.5846 -0.6477  1.5986]
  ➤ Head 0  Output  : [0.6861]
  ➤ Head 1  Output  : [-0.8746]
  ➤ Head 2  Output  : [0.903]
  ➤ Head 3  Output  : [1.2625]
  ➤ Concat Output    : [ 0.6861 -0.8746  0.903   1.2625]
  ➤ Final Projection : [-1.8126  1.7303  3.55    2.3894]

Token  1 | Word: aku       
  ➤ PE Input         : [ 1.1737 -0.6072  0.6287  0.912 ]
  ➤ Head 0  Output  : [0.7123]
  ➤ Head 1  Output  : [0.5155]
  ➤ Head 2  Output  : [-1.9011]
  ➤ Head 3  Output  : [1.291]
  ➤ Concat Output    : [ 0.7123  0.5155 -1.9011  1.291 ]
  ➤ Final Projection : [ 1.2155 -3.3812 -4.3749  1.7653]

Token  2 | Word: suka      
  ➤ PE Input         : [ 1.3344 -0.0839 -1.1368  1.3508]
  ➤ Head 0  Output  : [0.7493]
  ➤ Head 1  Output  : [1.9482]
  ➤ Head 2  Output  : [-1.884]
  ➤ Head 3  Output  : [0.821]
  ➤ Concat Output    : [ 0

## RESIDUAL CONNECTIONS AND LAYERNORM

In [210]:
att_projected.shape, pe.shape

((6, 10, 4), (6, 10, 4))

In [256]:
def norm(x):
    eps = 1e-6
    mean = np.mean(x, axis=-1, keepdims=True)
    std = np.std(x, axis=-1, keepdims=True)
    return (x - mean) / (std + eps)

# Residual + norm
add = pe + att_projected
norm1 = norm(add)

In [229]:
# Tampilkan hasil

for batch_idx, (sentence_ids, sentence_pe) in enumerate(zip(input, norm1)):
    print(f"\n[Batch {batch_idx}]")
    
    # Ambil kalimat asli dari ID
    sentence_ids = input[batch_idx]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    print(f"Kalimat {batch_idx} :", " ".join(sentence_words))
    print("Token IDs :", sentence_ids)
    
    for pos, (token_id, token_pe) in enumerate(zip(sentence_ids, sentence_pe)):
        word = id2word.get(token_id, "[UNK]")
        print(f"Word: {word:<10} | ID: {token_id:<2d} | ADD AND NORM: {token_pe}")
        


[Batch 0]
Kalimat 0 : [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Token IDs : [ 1  2  3 15  5  6  7  0  0  0]
Word: [CLS]      | ID: 1  | ADD AND NORM: [-1.5633 -0.1297  0.616   1.077 ]
Word: aku        | ID: 2  | ADD AND NORM: [ 0.9542 -1.0369 -0.9613  1.0441]
Word: suka       | ID: 3  | ADD AND NORM: [ 1.1306 -0.5444 -1.3546  0.7684]
Word: banget     | ID: 15 | ADD AND NORM: [ 1.268  -1.0553 -0.8988  0.6861]
Word: sama       | ID: 5  | ADD AND NORM: [-1.6252  0.0776  1.0523  0.4953]
Word: film       | ID: 6  | ADD AND NORM: [ 1.2889 -1.3171 -0.5352  0.5634]
Word: ini        | ID: 7  | ADD AND NORM: [ 0.9943 -0.7198 -1.2453  0.9708]
Word: [PAD]      | ID: 0  | ADD AND NORM: [ 1.0412 -0.4612 -1.4159  0.8358]
Word: [PAD]      | ID: 0  | ADD AND NORM: [ 1.0807 -0.5952 -1.329   0.8435]
Word: [PAD]      | ID: 0  | ADD AND NORM: [-0.2544 -0.7893 -0.6545  1.6982]

[Batch 1]
Kalimat 1 : [CLS] filmnya bener-bener membosankan [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Token IDs : [ 1  8  9 

## FFN

In [230]:
norm1.shape

(6, 10, 4)

In [246]:
batch_size, seq_len, embed_dim = pe.shape
ff_dim = 2 * embed_dim

In [248]:
W1 = np.random.randn(embed_dim, ff_dim)
b1 = np.zeros(ff_dim)
W2 = np.random.randn(ff_dim, embed_dim)
b2 = np.zeros(embed_dim)

# Dense -> ReLU -> Dense

expanded_input = norm1 @ W1 + b1

relu = np.maximum(0, expanded_input) 

compressed_output = relu @ W2 + b2


In [253]:
for b in range(batch_size):
    print(f"\n[Batch {b}]")
    sentence_ids = input[b]
    print("Kalimat:", " ".join([id2w.get(i, "[UNK]") for i in sentence_ids]))

    for t in range(seq_len):
        token_id = sentence_ids[t]
        word = id2w.get(token_id, "[UNK]")
        token_pe = norm1[b, t]
        expand = expanded_input[b, t]
        activated = relu[b, t]
        out = compressed_output[b, t]

        print(f"Word: {word:<10} | ID: {token_id:<2d} | ADD AND NORM: {token_pe} | W1: {expand} | ReLU: {activated} | W2 or Output : {out}")


[Batch 0]
Kalimat: [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Word: [CLS]      | ID: 1  | ADD AND NORM: [-1.5633 -0.1297  0.616   1.077 ] | W1: [-0.5983  0.1633 -3.0538 -0.8817  3.3305  2.9852 -2.3336  1.1677] | ReLU: [0.     0.1633 0.     0.     3.3305 2.9852 0.     1.1677] | W2 or Output : [-5.5483 -0.466   2.5284  0.5151]
Word: aku        | ID: 2  | ADD AND NORM: [ 0.9542 -1.0369 -0.9613  1.0441] | W1: [-0.3712 -5.0422 -0.0992 -2.9728  0.309  -0.5644 -0.4888 -1.4358] | ReLU: [0.    0.    0.    0.    0.309 0.    0.    0.   ] | W2 or Output : [-0.1003 -0.203  -0.3269  0.3114]
Word: suka       | ID: 3  | ADD AND NORM: [ 1.1306 -0.5444 -1.3546  0.7684] | W1: [-0.9566 -3.8432  0.0553 -2.017  -0.6235 -0.4133 -0.4346 -1.2399] | ReLU: [0.     0.     0.0553 0.     0.     0.     0.     0.    ] | W2 or Output : [-0.0664  0.024  -0.0249  0.0264]
Word: banget     | ID: 15 | ADD AND NORM: [ 1.268  -1.0553 -0.8988  0.6861] | W1: [ 0.0726 -4.9436  0.8115 -2.6995 -0.4291 -1.4878  0.2785 

In [259]:
add1 = norm1 + compressed_output
norm2 = norm(add1)

In [261]:
for b in range(batch_size):
    print(f"\n[Batch {b}]")
    sentence_ids = input[b]
    print("Kalimat:", " ".join([id2w.get(i, "[UNK]") for i in sentence_ids]))

    for t in range(seq_len):
        token_id = sentence_ids[t]
        word = id2w.get(token_id, "[UNK]")
        token_pe1 = norm1[b, t]
        token_pe2 = norm2[b, t]


        print(f"Word: {word:<10} | ID: {token_id:<2d} | ADD AND NORM 1: {token_pe1} | FFN Output : {out} | ADD AND NORM 2: {token_pe2}")


[Batch 0]
Kalimat: [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Word: [CLS]      | ID: 1  | ADD AND NORM 1: [-1.5633 -0.1297  0.616   1.077 ] | FFN Output : [-3.235  -0.4608  0.3417  0.8975] | ADD AND NORM 2: [-1.629   0.0376  0.9942  0.5972]
Word: aku        | ID: 2  | ADD AND NORM 1: [ 0.9542 -1.0369 -0.9613  1.0441] | FFN Output : [-3.235  -0.4608  0.3417  0.8975] | ADD AND NORM 2: [ 0.7795 -0.9687 -1.009   1.1983]
Word: suka       | ID: 3  | ADD AND NORM 1: [ 1.1306 -0.5444 -1.3546  0.7684] | FFN Output : [-3.235  -0.4608  0.3417  0.8975] | ADD AND NORM 2: [ 1.0829 -0.5141 -1.3801  0.8113]
Word: banget     | ID: 15 | ADD AND NORM 1: [ 1.268  -1.0553 -0.8988  0.6861] | FFN Output : [-3.235  -0.4608  0.3417  0.8975] | ADD AND NORM 2: [ 0.8124 -0.9112 -1.0697  1.1685]
Word: sama       | ID: 5  | ADD AND NORM 1: [-1.6252  0.0776  1.0523  0.4953] | FFN Output : [-3.235  -0.4608  0.3417  0.8975] | ADD AND NORM 2: [-1.5423 -0.1035  1.1807  0.4652]
Word: film       | ID: 6  | ADD

## CLASSIFICATION HEAD

In [282]:
cls_output = norm2[:, 0, :]
cls_output

array([[-1.629 ,  0.0376,  0.9942,  0.5972],
       [-1.7093,  0.727 ,  0.3076,  0.6746],
       [-1.629 ,  0.922 ,  0.7044,  0.0026],
       [-0.8063, -0.4968, -0.4101,  1.7132],
       [-1.338 ,  0.8559,  1.066 , -0.5839],
       [-1.4978, -0.2378,  0.5566,  1.179 ]])

In [283]:
num_classes = 2
W_cls = np.random.randn(embed_dim, num_classes)
b_cls = np.zeros(num_classes)

logits = cls_output @ W_cls + b_cls  # (batch_size, num_classes)

print(logits)


[[ 3.8097 -1.2682]
 [ 3.6546 -1.3433]
 [ 4.066   0.2226]
 [ 0.5987 -3.9558]
 [ 3.9202  1.5424]
 [ 2.9495 -2.6511]]


In [296]:
from scipy.special import softmax

probs = softmax(logits, axis=-1)
probs

array([[0.9975, 0.0025],
       [0.9975, 0.0025],
       [0.9781, 0.0219],
       [0.9933, 0.0067],
       [0.8455, 0.1545],
       [0.9989, 0.0011]])

In [301]:
import numpy as np

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)

def cross_entropy(pred, true_label):
    return -np.log(pred[true_label] + 1e-9)

for b in range(batch_size):
    sentence_ids = input[b]
    sentence_words = [id2w.get(i, "[UNK]") for i in sentence_ids]
    true_label = labels[b]
    
    # Ambil vektor CLS
    cls_vec = norm2[b, 0]
    
    # Logits & Probabilities
    logit = cls_vec @ W_cls + b_cls
    prob = softmax(logit)
    pred_label = np.argmax(prob)
    loss = cross_entropy(prob, true_label)
    
    # Tampilkan
    print(f"\n[Batch {b}]")
    print(f"Kalimat       : {' '.join(sentence_words)}")
    print(f"Label Asli    : {true_label}")
    print(f"Prediksi      : [{pred_label}]")
    print(f"Logits        : {logit}")
    print(f"Probabilitas  : {prob}")
    print(f"Loss (CE)     : {loss}")


[Batch 0]
Kalimat       : [CLS] aku suka banget sama film ini [PAD] [PAD] [PAD]
Label Asli    : [1]
Prediksi      : [0]
Logits        : [ 3.8097 -1.2682]
Probabilitas  : [0.9938 0.0062]
Loss (CE)     : [5.0841]

[Batch 1]
Kalimat       : [CLS] filmnya bener-bener membosankan [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Label Asli    : [0]
Prediksi      : [0]
Logits        : [ 3.6546 -1.3433]
Probabilitas  : [0.9933 0.0067]
Loss (CE)     : [0.0067]

[Batch 2]
Kalimat       : [CLS] aktingnya luar biasa [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Label Asli    : [1]
Prediksi      : [0]
Logits        : [4.066  0.2226]
Probabilitas  : [0.979 0.021]
Loss (CE)     : [3.8645]

[Batch 3]
Kalimat       : [CLS] ngantuk banget pas nonton [PAD] [PAD] [PAD] [PAD] [PAD]
Label Asli    : [0]
Prediksi      : [0]
Logits        : [ 0.5987 -3.9558]
Probabilitas  : [0.9896 0.0104]
Loss (CE)     : [0.0105]

[Batch 4]
Kalimat       : [CLS] ceritanya bikin terharu [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Label Asli    : [1]
Predik

## Full Program For FINE TUNING BERT to SENTIMENT ANALYSIS

1. Data Processing:

- Sample sentiment analysis dataset (positive/negative movie reviews)
- Custom vocabulary with special tokens ([CLS], [PAD], [UNK])
- Tokenization of sentences


2. Model Architecture:

- Token embeddings
- Positional encoding
- Multi-head attention mechanism (the core of BERT)
- Feed-forward networks
- Layer normalization
- Complete transformer encoder layers
- Classification head for sentiment prediction


3. Training Pipeline:

- Data loading and batching
- Model training with cross-entropy loss
- Evaluation function
- Progress tracking and visualization


4. Inference:

- Function to predict sentiment on new sentences

In [None]:
import numpy as np

class Vocab:
    def __init__(self):
        self.vocab = {
            '[PAD]' : 0,
            '[CLS]' : 1,
        }
        self.w2i = {}
        self.i2w = {}
    
    def build_vocab(self, sentences):
        start_idx = len(self.vocab)
        
        for sentence, _ in sentences:
            for word in sentence.lower().split():
                if word not in self.vocab:
                    self.vocab[word] = start_idx
                    start_idx += 1
        
        # mapping
        self.w2i = {w: i  for i, w in enumerate(self.vocab)}
        self.i2w = {i: w for w, i in self.w2i.items()}

        return self.vocab

    def tokenize(self, sentence, max_len=10):
        tokens = sentence.lower().split()
        ids = [self.vocab["[CLS]"]] + [self.vocab.get(tok, 0) for tok in tokens]

        # padding
        if len(ids) < max_len:
            ids += [self.vocab["[PAD]"]] * (max_len - len(ids))
        else:
            ids = ids[:max_len]
        
        return ids

In [None]:
class EmbeddingLayer:
    def __init__(self, vocab_size, d_model):
        np.random.seed(42)
        self.d_model = d_model
        self.embedding_matrix = np.random.rand(vocab_size, d_model)
        self.cache = {}
    
    def forward(self, input_ids, cache=True):
        embeddings = self.embedding_matrix[input_ids]
        batch_size, seq_len, _ = embeddings.shape
        embeddings = np.zeros((batch_size, seq_len, self.d_model))

        for i in range(batch_size):
            for j in range(seq_len):
                word_id = input_ids[i, j]
                embeddings[i,j] = self.embedding_matrix[word_id]
    
        if cache:
            self.cache['input_ids'] = input_ids
        
        return embeddings
    
    def backward(self, dembeddings):
        input_ids = self.cache['input_ids']
        batch_size, seq_len, _ = input_ids.shape


        # initialize gradients
        dembedding_matrix = np.zeros_like(self.embedding_matrix)

        # for each position in batch, accumulate gradients for corresponding words vectors
        for i in range(batch_size):
            for j in range(seq_len):
                word_id = input_ids[i, j]
                dembedding_matrix[word_id] += dembeddings[i,j]
        
        return dembedding_matrix

In [27]:
class PositionalEncoder:
    def __init__(self, max_len, d_model):
        self.max_len = max_len
        self.d_model = d_model
        self.pe = self._create_postional_encoding()
    
    def _create_postional_encoding(self):
        pos = np.arange(self.max_len)[:, np.newaxis]
        i = np.arange(self.d_model)[np.newaxis, :]

        # calculate angle rates
        angle_rates = 1 / np.power(10000, (2*(i//2) / self.d_model))

        # calculate angle radians
        angle_rads = pos * angle_rates

        # apply sin to even indices and cos to odd idices
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        return angle_rads
    
    def forward(self, embedding):
        batch_size, seq_len, _ = embedding.shape
        return embedding + self.pe[:seq_len, :]


In [79]:
class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
    
        # weight
        self.wq = np.random.rand(d_model, num_heads * self.head_dim) * 0.01
        self.wk = np.random.rand(d_model, num_heads * self.head_dim) * 0.01
        self.wv = np.random.rand(d_model, num_heads * self.head_dim) * 0.01
        self.wo = np.random.rand(d_model, d_model) * 0.01

        self.cache = {}

    def forward(self, x, cache=True):
        batch_size, seq_len, _ = x.shape

        # project to q, k, v
        q = x @ self.wq
        k = x @ self.wk
        v = x @ self.wv

        # reshape to multi-head format
        # (batch_size, seq_len, num_heads, head_dim)
        q = q.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(0,2,1,3)
        k = k.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(0,2,1,3)
        v = v.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(0,2,1,3)

        # scale dot product attention
        # (batch_size, num_heads, seq_len, head_dim)
        scores = np.matmul(q, k.transpose(0,1,3,2)) / np.sqrt(self.head_dim)

        # softmax
        scores -= np.max(scores, axis=-1, keepdims=True)
        attn_weight = np.exp(scores)
        attn_weight /= np.sum(attn_weight, axis=-1, keepdims=True) + 1e-8

        # attention output
        attn_output = np.einsum('bhst,bhtd->bhsd', attn_weight, v)
        
        # concatenate heads and transpose back to original shape
        # (batch_size, seq_len, d_model)
        attn_concat = attn_output.transpose(0,2,1,3).reshape(batch_size, seq_len, -1)

        # output projection
        # (batch_size, seq_lem, d_model)
        attn_projected = attn_concat @ self.wo

        if cache:
            self.cache['x'] = x
            self.cache['q'] = q
            self.cache['k'] = k
            self.cache['v'] = v
            self.cache['scores'] = scores
            self.cache['attn_weight'] = attn_weight
            self.cache['attn_output'] = attn_output
            self.cache['attn_concat'] = attn_concat

        return attn_projected

    def backward(self, dattn_projected):
        # get from cache
        x = self.cache['x']
        q = self.cache['q']
        k = self.cache['k']
        v = self.cache['v']
        attn_weight = self.cache['attn_weight']
        attn_output = self.cache['attn_output']
        attn_concat = self.cache['attn_concat']

        batch_size, seq_len, _ = x.shape

        # gradient wrt output projection wo
        dwo = attn_concat.transpose(0, 2, 1) @ dattn_projected # (d_model, d_model)

        # gradient wrt concatenated attention heads concat
        dattn_concat = dattn_projected @ self.wo.T # (batch_size, seq_len, d_model)

        # reshape back to multihead-format
        dattn_output = dattn_concat.reshape(batch_size, self.num_heads, self.head_dim).transpose(0,2,1,3)

        # gradient wrt v
        dattn_weight = np.matmul(dattn_output, v.transpose(0, 1, 3, 2))
        dv = np.matmul(attn_weight.transpose(0, 1, 3, 2), dattn_output)

        # gradient through softmax
        dscores = attn_weight * (dattn_weight - np.sum(attn_weight * dattn_weight, axis=-1, keepdims=True)) 

        # gradient wrt q and k
        dk_transpose = np.matmul(q.transpose(0, 1, 3, 2), dscores) / np.sqrt(self.d_model)
        dk = dk_transpose.transpose(0, 1, 3, 2)
        dq = np.matmul(dscores, k) / np.sqrt(self.d_model)

        # reshape gradient to original dimention format
        dq = dq.transpose(0,2,1,3).reshape(batch_size, seq_len, -1)
        dk = dk.transpose(0,2,1,3).reshape(batch_size, seq_len, -1)
        dv = dv.transpose(0,2,1,3).reshape(batch_size, seq_len, -1)

        # gradient wrt input x for q
        dx = (dq @ self.wq.T) + (dk @ self.wk.T) + (dv @ self.wv.T)
        
        # gradient wrt weights
        dwq = x.transpose(0, 2, 1) @ dq
        dwq = np.sum(dwq, axis=0)
        dwk = x.transpose(0, 2, 1) @ dk
        dwk = np.sum(dwk, axis=0)
        dwv = x.transpose(0, 2, 1) @ dv
        dwv = np.sum(dwv, axis=0)

        return dwq, dwk, dwv, dwo, dx

In [80]:
class LayerNorm:
    def __init__(self, d_model, eps=1e-6):
        self.eps = eps

        # scale parameters
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)

        # cache
        self.cache = {}
    
    def forward(self, x, cache=True):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        std = np.sqrt(var + self.eps)
        normalized = (x - mean) / std

        # scale and shift
        out = self.gamma * normalized + self.beta 

        if cache:
            self.cache['x'] = x
            self.cache['mean'] = mean
            self.cache['var'] = var
            self.cache['std'] = std
            self.cache['normalized'] = normalized
        
        return out
    
    def backward(self, dout):
        # get from cache
        x = self.cache['x']
        mean = self.cache['mean']
        std = self.cache['std']
        normalized = self.cache['normalized']
        
        _, _, d_model = x.shape
        
        # gradient wrt gamma and beta
        dgamma = np.sum(dout * normalized, axis=(0,1))
        dbeta = np.sum(dout, axis=(0,1))

        # gradient for later normalization
        dx_normalized = dout * self.gamma

        # gradient wrt var
        dvar = np.sum(dx_normalized * (x- mean) * -0.5 * std**(-3), axis=-1, keepdims=True)

        # gradent wrt mean
        dxmean1 = dx_normalized * -0.1/std
        dxmean2 = dvar * -2.0 * np.mean(x-mean, axis=-1, keepdims=True)
    
        # gradient wrt x
        dx = dx_normalized / std + dvar * 2.0 * (x - mean) / (dxmean1 + dxmean2) / d_model

        return dx, dgamma, dbeta

In [81]:
class FeedFordward:
    def __init__(self, d_model, d_ff=None):

        self.d_model = d_model

        # weights
        self.w1 = np.random.rand(d_model, d_ff) * 0.1
        self.b1 = np.zeros(d_ff) * 0.1
        self.w2 = np.random.rand(d_ff, d_model) * 0.1
        self.b2 = np.zeros(d_model) * 0.1
        
        self.cache = {}
    
    
    def forward(self, x, cache=True):

        _, _, d_model = x.shape

        # linear 1 - Expansion
        f1 = x @ self.w1 + self.b1

        # relu
        r = np.maximum(0, f1)

        # Linear 2 - Compression
        f2 = r @ self.w2 + self.b2

        if cache:
            self.cache['x'] = x
            self.cache['f1'] = f1
            self.cache['r'] = r
        
        return f2
    
    def backward(self, doutput):
        # get from cache
        x = self.cache['x']
        f1 = self.cache['f1']
        r = self.cache['r']

        # gradient wrt w2 and b2
        dr = doutput @ self.w2.T
        dw2 = r.reshape(-1, r.shape[-1].T @ doutput.reshape(-1, doutput.shape[-1]))
        db2 = np.sum(doutput, axis=(0,1))

        # gradient through relu
        df1 = dr*(f1>0)

        # gradient wrt w1 and b1
        dw1 = x.reshape(-1, x.shape[-1]).T @ df1.reshape(-1, df1.shape[-1])
        db1 = np.sum(df1, axis=(0,1))

        # gradient wrt x
        dx = df1 @ self.w1.T
        
        return dw1, db1, db2, dx

In [86]:
class BertModel:
    def __init__(self, d_model, num_heads):
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNorm(d_model)
        self.ff = FeedFordward(d_model, 4*d_model)
        self.norm2 = LayerNorm()
        self.cache = {}
    
    def forward(self, x, cache=True):
        # multi-head attention
        attn_output = self.attention.forward(x)

        # add & norm
        add1 = x + attn_output
        norm1 = self.norm1.forward(add1)

        # feed-forward
        ff_output = self.ff.forward(norm1)

        # add & norm
        add2 = norm1 + ff_output
        norm2 = self.norm2.forward(add2)

        if cache:
            self.cache['x'] = x
            self.cache['attn_output'] = attn_output
            self.cache['add1'] = add1
            self.cache['norm1'] = norm1
            self.cache['ff_output'] = ff_output
            self.cache['add2'] = add2

        return norm2
    
    def backward(self, dout):
        # get from cache
        x = self.cache['x']
        attn_output = self.cache['attn_output']
        add1 = self.cache['add1']
        norm1 = self.cache['norm1']
        ff_output = self.cache['ff_output']

        # backward through norm2
        dadd2, dgamma2, dbeta2 = self.norm2.backward(dout)

        # gradient split residual connection
        dnorm1 = dadd2
        dff_output = dadd2

        # backward through feed-forward
        dw1, db1, dw2, db2, dnorm1_ff = self.ff.backward(dff_output)

        # add gradients from residual connection and ff
        dnorm1 += dnorm1_ff

        # backward through norm1
        dadd1 , dgamma1, dbeta1 = self.norm1.backward(dnorm1)

        # gradient split residual connection
        dx = dadd1
        dattn_output = dadd1

        # backward through attention
        dwq, dwk, dwv, dwo, dx_att = self.attention.backward(dattn_output)

        # add gradients from residual connection and attention
        dx += dx_att

        return {
            'wq': dwq,
            'wk': dwk,
            'wv': dwv,
            'wo': dwo,
            'w1': dw1,
            'b1': db1,
            'w2': dw2,
            'b2': db2,
            'dgamma1': dgamma1,
            'dbeta1': dbeta1,
            'dgamma2': dgamma2,
            'dbeta2': dbeta2,
            'dx': dx
        }

In [87]:
class ClassifierHead:
    def __init__(self, d_model , num_classes=2):
        self.w_cls = np.random.rand(d_model, num_classes) * 0.01
        self.b_cls = np.zeros(num_classes)
        self.cache = {}
    
    def forward(self, x, cache=True):
        # take cls per batch
        cls_output = x[:, 0, :]

        # logits of that cls
        logits = cls_output @ self.w_cls + self.b_cls

        if cache:
            self.cache['cls_output'] = cls_output
            self.cache['logits'] = logits
            self.cache['x'] = x

        return logits

    def softmax(self, x):
        x -= np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(x)
        return exp_x/np.sum(exp_x, axis=-1, keepdims=True + 1e-9) # add eps for stability
    
    def predict(self, x):
        logits = self.forward(x)
        softmax = self.softmax(logits)
        return np.argmax(softmax, axis=-1)
    
    def binary_loss_entropy(self, logits, labels):
        batch_size = logits.shape[0]
        probs = self.softmax(logits)

        # store for backward propagation
        self.cache['probs'] = probs
        self.cache['labels'] = labels

        # one hot encoder labels
        y_one_hot = np.zeros(probs)
        for i in range(batch_size):
            y_one_hot[i, labels[i]] = 1
        
        # cross entropy loss
        loss = -np.sum(y_one_hot * np.log(probs + 1e-9)) / batch_size # sum to entire batch
        return loss
    
    def backward(self):
        # get from cache
        probs = self.cache['probs']
        labels = self.cache['labels']
        cls_output = self.cache['cls_output']
        x = self.cache['x']

        batch_size = probs.shape[0]

        # gradient of cross entropy wrt softmax output
        dprobs = probs.copy()
        # substract 1 from true class probability
        for i in range(batch_size):
            dprobs[i, labels[i]] -= 1
        dprobs /= batch_size

        # gradient wrt weights and biases
        dw_cls = cls_output.T @ dprobs
        db_cls = np.sum(dprobs, axis=0)

        # gradient wrt cls output x
        dcls_output = dprobs @ self.w_cls.T

        # gradient wrt full bert output (only cls)
        dx = np.zeros_like(x)
        dx[:, 0, :] = dcls_output

        return dw_cls, db_cls, dx

In [88]:
class BERTSentimentClassifier:
    def __init__(self, max_len=10, d_model=16, num_heads=4):
        # parameters
        self.max_len = max_len
        self.d_model = d_model
        self.num_heads = num_heads
        # vocab
        self.vocab = Vocab()
        # embedding
        self.embedding = None
        # psositional encoding
        self.pos_encoding = PositionalEncoder(max_len, d_model)
        # bert model
        self.model = BertModel(d_model, num_heads)
        # classifier head
        self.classifier = ClassifierHead(d_model)
    
    def intialize(self, sentences):
        # build vocab
        self.vocab.build_vocab(sentences)
        # initialize embedding with vocab size
        vocab_size = len(self.vocab.vocab) + 1
        self.embedding = EmbeddingLayer(vocab_size, self.d_model)
    
    def prepare_input(self, sentences):
        # convert sentences to tokens to indices
        input_ids = [self.vocab.tokenize(sentence[0]) for sentence in sentences]
        input_ids = np.array(input_ids)
        labels = [label for _, label in sentences]
        labels = np.array(labels)
        return input_ids, labels
    
    def forward(self, input_ids, cache=True):
        # Embedding Layer
        embeddings = self.embedding.forward(input_ids, cache)
        # Positional Encoding
        pe = self.pos_encoding.forward(embeddings)
        # Bert Model
        bert_output = self.model.forward(pe, cache)
        # Classifier Head
        logits = self.classifier.forward(bert_output, cache)
        return logits
    
    def backward(self, logits, labels):
        # loss and gradient through classifier
        loss = np.mean(self.classifier.binary_loss_entropy(logits, labels))

        # bakprop through classifier
        dw_cls, db_cls, dbert_output = self.classifier.backward()

        # backprop through bert model
        dbert_grads = self.model.backward(dbert_output)

        # backprop through embedding layer
        dembedding_matrix = self.embedding.backward(dbert_grads['dx'])

        # collect all gradients for return
        grads = {
            'dw_cls': dw_cls,
            'db_cls': db_cls,
            'dembedding_matrix': dembedding_matrix,
            'dwq': dbert_grads['dwq'],
            'dwk': dbert_grads['dwk'],
            'dwv': dbert_grads['dwv'],
            'dwo': dbert_grads['dwo'],
            'dw1': dbert_grads['dw1'],
            'db1': dbert_grads['db1'],
            'dw2': dbert_grads['dw2'],
            'db2': dbert_grads['db2'],
            'dgamma1': dbert_grads['dgamma1'],
            'dbeta1': dbert_grads['dbeta1'],
            'dgamma2': dbert_grads['dgamma2'],
            'dbeta2': dbert_grads['dbeta2']
        }

        return loss, grads

    def update_parameters(self, grads, lr=0.01):
        # update classifier weigth parameter
        self.classifier.w_cls -= lr*grads['dw_cls']
        self.classifier.b_cls -= lr*grads['db_cls']

        # update embedding weight parameter
        self.embedding.embedding_matrix -= lr*grads['dembedding_matrix']

        # update bert parameters
        # update attention weight parameter
        self.model.attention.w_q -= lr*grads['bert']['dw_q']
        self.model.attention.w_k -= lr*grads['bert']['dw_k']
        self.model.attention.w_v -= lr*grads['bert']['dw_v']

        # update layer norm
        self.model.norm1.gamma -= lr * grads['dgamma1']
        self.model.norm1.beta -= lr * grads['dbeta1']
        self.model.norm2.gamma -= lr * grads['dgamma2']
        self.model.norm2.beta -= lr * grads['dbeta2']

        # update ffn weight parameter
        self.model.ff.w1 -= lr*grads['bert']['w1']
        self.model.ff.b1 -= lr*grads['bert']['b1']
        self.model.ff.w2 -= lr*grads['bert']['w2']
        self.model.ff.b2 -= lr*grads['bert']['b2']

    def train_step(self, batch, lr=0.01):
        # prepare batch data
        input_ids, labels = self.prepare_input(batch)

        # forward pass
        logits = self.forward(input_ids)

        # backward pass
        loss, grads = self.backward(logits, labels)

        # update parameters
        self.update_parameters(grads, lr)

        return loss

    def evaluate(self, sentences):
        input_ids, labels = self.prepare_input(sentences)
        logits = self.forward(input_ids, cache=False)
        predictions = np.argmax(self.classifier.softmax(logits), axis=1)
        accuracy = np.mean(predictions == labels)
        return accuracy

In [90]:
# Example
if __name__ == "__main__":
    sentences = [
        ('aku suka film ini', 1),
        ('aku tidak suka film ini', 0),
        ('film ini bagus sekali', 1),
        ('film ini sangat buruk', 0),
    ]

    # Create and initialize model
    model = BERTSentimentClassifier(max_len=10, d_model=16, num_heads=4)
    model.initialize(sentences)

    # Training loop
    num_epochs = 1000
    learning_rate = 0.01
    
    for epoch in range(num_epochs):
        loss = model.train_step(sentences, lr=learning_rate)
        
        if epoch % 100 == 0:
            accuracy = model.evaluate(sentences)
            print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

TypeError: LayerNorm.__init__() missing 1 required positional argument: 'd_model'