In [None]:
import numpy as np

def data_for_nsp(corpus):
    """
    Split a raw corpus into sentences and build the NSP vocabulary.

    Sentences are the non-empty, whitespace-stripped pieces obtained by
    splitting ``corpus`` on '.'. The vocabulary is the sorted set of
    whitespace-separated words found in those sentences, followed by the
    special tokens '[CLS]', '[SEP]' and '[MASK]' (in that order).

    Returns:
        (sentences, vocab): two lists of strings.
    """
    sentences = []
    for piece in corpus.split('.'):
        cleaned = piece.strip()
        if cleaned:
            sentences.append(cleaned)

    # Special tokens go last so that ordinary words keep the lowest indices.
    unique_words = {word for sentence in sentences for word in sentence.split()}
    vocab = sorted(unique_words) + ['[CLS]', '[SEP]', '[MASK]']

    return sentences, vocab

def create_sentence_pairs(sentences, positive_ratio=0.5):
    """
    Create sentence pairs for the NSP task.

    - Positive pairs: every pair of consecutive sentences (label 1, IsNext).
    - Negative pairs: randomly drawn pairs of two *different* sentences
      whose positions are not adjacent (label 0, NotNext).

    Args:
        sentences: ordered list of sentences.
        positive_ratio: desired fraction of positive pairs, in (0, 1].
            The number of negatives is
            ``int(num_positive * (1 - positive_ratio) / positive_ratio)``.

    Returns:
        (pairs, labels): a list of ``(sentence_a, sentence_b)`` tuples with
        all positives first, and the parallel list of integer labels.
        With fewer than three sentences no valid negative pair exists, so
        only the positives are returned.

    Raises:
        ValueError: if ``positive_ratio`` is outside (0, 1].
    """
    # Reject 0 (division by zero below) and other meaningless ratios early.
    if not 0 < positive_ratio <= 1:
        raise ValueError(
            f"positive_ratio must be in the interval (0, 1], got {positive_ratio!r}"
        )

    n_sentences = len(sentences)

    # Positive pairs: sentence i followed by sentence i + 1.
    pairs = [(sentences[i], sentences[i + 1]) for i in range(n_sentences - 1)]
    labels = [1] * len(pairs)

    num_positive = len(pairs)
    num_negative = int(num_positive * (1 - positive_ratio) / positive_ratio)

    # Two distinct, non-adjacent positions only exist from three sentences
    # on; without this guard the rejection loop below could never finish.
    if n_sentences < 3:
        return pairs, labels

    for _ in range(num_negative):
        # Rejection sampling: redraw until the positions are neither equal
        # (a sentence paired with itself) nor adjacent (a true neighbour).
        while True:
            idx1 = np.random.randint(0, n_sentences)
            idx2 = np.random.randint(0, n_sentences)
            if abs(idx1 - idx2) > 1:
                break

        pairs.append((sentences[idx1], sentences[idx2]))
        labels.append(0)  # NotNext

    return pairs, labels

def tokenizer(vocab):
    """
    Build the two lookup tables for a vocabulary.

    Returns:
        (token2idx, idx2token): a token -> position dict and the inverse
        position -> token dict, where a token's position is its index in
        ``vocab``.
    """
    token2idx = {token: position for position, token in enumerate(vocab)}
    idx2token = dict(enumerate(vocab))
    return token2idx, idx2token

def prepare_nsp_input(sentence_pair, token2idx, max_seq_len=128):
    """
    Encode a sentence pair as ``[CLS] sentence1 [SEP] sentence2 [SEP]``.

    Args:
        sentence_pair: indexable holding the two sentences (whitespace
            tokenised here).
        token2idx: token -> id mapping; words missing from it are encoded
            with the id of '[MASK]'.
        max_seq_len: fixed output length; longer inputs are cut, shorter
            ones are right-padded with 0.

    Returns:
        (token_ids, segment_ids, seq_len): two lists of length
        ``max_seq_len`` and the number of real (unpadded) positions.
    """
    first_words = sentence_pair[0].split()
    second_words = sentence_pair[1].split()
    tokens = ['[CLS]', *first_words, '[SEP]', *second_words, '[SEP]']

    # Out-of-vocabulary words fall back to the [MASK] id.
    token_ids = [
        token2idx[tok] if tok in token2idx else token2idx['[MASK]']
        for tok in tokens
    ]

    # Segment 0 covers everything up to and including the first [SEP];
    # every later position belongs to segment 1.
    boundary = tokens.index('[SEP]') + 1
    segment_ids = [0] * boundary + [1] * (len(tokens) - boundary)

    # Truncate to max_seq_len, then right-pad with zeros (the pad list is
    # empty whenever the sequence is already long enough).
    seq_len = len(token_ids)
    padding = [0] * (max_seq_len - seq_len)
    token_ids = token_ids[:max_seq_len] + padding
    segment_ids = segment_ids[:max_seq_len] + padding

    return token_ids, segment_ids, min(seq_len, max_seq_len)

def embedding(scale, d_model, vocab_size):
    """
    Build the token embedding table.

    Reseeds NumPy's global random generator with 42 (so the table is the
    same on every call) and returns a ``(vocab_size, d_model)`` array of
    uniform [0, 1) samples multiplied by ``scale``.
    """
    table_shape = (vocab_size, d_model)
    # NOTE: this reseeds the *global* generator, which also affects any
    # random draws made after this call.
    np.random.seed(42)
    return np.random.rand(*table_shape) * scale

def segment_embedding(scale, d_model, num_segments=2):
    """
    Build the segment (sentence A / sentence B) embedding table.

    Reseeds NumPy's global random generator with 43 and returns a
    ``(num_segments, d_model)`` array of uniform [0, 1) samples multiplied
    by ``scale``.
    """
    table_shape = (num_segments, d_model)
    # NOTE: this reseeds the *global* generator, which also affects any
    # random draws made after this call.
    np.random.seed(43)
    return np.random.rand(*table_shape) * scale

def position_embedding(max_seq_len, d_model):
    """
    Build the fixed sinusoidal position table.

    Even columns hold ``sin(pos / 10000**(2k / d_model))`` and odd columns
    hold the matching ``cos``, where ``k`` is the column index halved
    (integer division).

    Returns:
        A ``(max_seq_len, d_model)`` array.
    """
    positions = np.arange(max_seq_len).reshape(max_seq_len, 1)
    dims = np.arange(d_model)

    # Columns 2k and 2k + 1 share the same frequency.
    exponents = (2 * (dims // 2)) / np.float32(d_model)
    angles = positions * (1 / np.power(10000, exponents))

    table = np.zeros((max_seq_len, d_model))
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return table

def embedding_output(token_ids, segment_ids, token_embed_matrix, segment_embed_matrix, pos_embed_matrix, seq_len):
    """
    Sum token, segment and position embeddings for the real positions.

    Only the first ``seq_len`` entries of ``token_ids`` / ``segment_ids``
    (and the first ``seq_len`` rows of the position table) are used, so
    the result has ``seq_len`` rows and padding is dropped.
    """
    active_tokens = token_ids[:seq_len]
    active_segments = segment_ids[:seq_len]

    combined = token_embed_matrix[active_tokens] + segment_embed_matrix[active_segments]
    return combined + pos_embed_matrix[:seq_len]

def attention_weights(d_model):
    """
    Draw the query, key and value projection matrices.

    Each is a ``(d_model, d_model)`` array of standard-normal samples
    scaled by 0.01, drawn from NumPy's global generator in the order
    query, key, value.

    Returns:
        (Wq, Wk, Wv)
    """
    shape = (d_model, d_model)
    Wq, Wk, Wv = (np.random.randn(*shape) * 0.01 for _ in range(3))
    return Wq, Wk, Wv

def attention_output(x, Wq, Wk, Wv, mask=None):
    """
    Single-head scaled dot-product self-attention.

    Args:
        x: input of shape ``(..., seq_len, d_model)``.
        Wq, Wk, Wv: projection matrices applied as ``x @ W``.
        mask: optional array broadcastable to ``(..., seq_len, seq_len)``;
            positions where it equals 0 are excluded from attention.

    Returns:
        The attention-weighted values, with the same leading shape as ``x``.
    """
    Q = x @ Wq
    K = x @ Wk
    V = x @ Wv
    d_k = Q.shape[-1]

    # Transpose only the last two axes so batched inputs stay aligned
    # (a bare .transpose() would reverse every axis).
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    # Masked positions get a very negative score, i.e. ~0 attention weight.
    if mask is not None:
        scores = np.where(np.asarray(mask) == 0, -1e9, scores)

    # Numerically stable softmax: subtracting the row maximum leaves the
    # result unchanged but keeps np.exp from overflowing to inf/NaN.
    scores = scores - np.max(scores, axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / np.sum(weights, axis=-1, keepdims=True)

    return np.matmul(weights, V)

def add_and_norm(x, attention_output):
    """
    Residual connection followed by layer normalisation.

    Computes ``LayerNorm(x + attention_output)``: the sub-layer output is
    added to its input and the sum is normalised to zero mean and unit
    standard deviation along the last axis (no learned gain or bias).

    Args:
        x: the sub-layer input (residual branch).
        attention_output: the sub-layer output, same shape as ``x``.

    Returns:
        The normalised sum, same shape as ``x``.
    """
    eps = 1e-6  # guards against division by zero for constant rows

    # Add first, then normalise the sum; normalising only ``x`` would let
    # the sub-layer output bypass the normalisation entirely.
    residual = np.asarray(x) + np.asarray(attention_output)
    avg = np.mean(residual, axis=-1, keepdims=True)
    std = np.std(residual, axis=-1, keepdims=True)
    return (residual - avg) / (std + eps)
    
def ffn_weights(d_model, d_ff):
    """
    Draw the two feed-forward projection matrices.

    Returns:
        (W1, W2): standard-normal samples scaled by 0.01, with shapes
        ``(d_model, d_ff)`` and ``(d_ff, d_model)``; the expansion matrix
        is drawn first.
    """
    expand = 0.01 * np.random.randn(d_model, d_ff)
    contract = 0.01 * np.random.randn(d_ff, d_model)
    return expand, contract

def ffn_output(norm_output, W1, W2):
    """
    Position-wise feed-forward network without biases.

    Computes ``relu(norm_output @ W1) @ W2``.
    """
    hidden = np.maximum(0, np.matmul(norm_output, W1))
    return np.matmul(hidden, W2)

def nsp_classifier_weights(d_model):
    """
    Create the parameters of the two-class NSP head.

    Returns:
        (W_nsp, b_nsp): a ``(d_model, 2)`` matrix of standard-normal
        samples scaled by 0.01 (one column per class, IsNext / NotNext)
        and a zero bias vector of length 2.
    """
    num_classes = 2
    weights = 0.01 * np.random.randn(d_model, num_classes)
    bias = np.zeros(num_classes)
    return weights, bias

def initialize_model(d_model, d_ff, vocab_size, max_seq_len):
    """
    Create every parameter of the model and return them in one dict.

    Keys: 'Wq', 'Wk', 'Wv' (attention), 'W1', 'W2' (feed-forward),
    'token_embed_matrix', 'segment_embed_matrix', 'pos_embed_matrix'
    (embeddings) and 'W_nsp', 'b_nsp' (classifier head).
    """
    params = {}

    # The creation order is kept as-is: the embedding helpers reseed the
    # global generator, so reordering would change the drawn values.
    params['Wq'], params['Wk'], params['Wv'] = attention_weights(d_model)
    params['W1'], params['W2'] = ffn_weights(d_model, d_ff)
    params['token_embed_matrix'] = embedding(0.01, d_model, vocab_size)
    params['segment_embed_matrix'] = segment_embedding(0.01, d_model)
    params['pos_embed_matrix'] = position_embedding(max_seq_len, d_model)
    params['W_nsp'], params['b_nsp'] = nsp_classifier_weights(d_model)

    return params

def forward_pass(token_ids, segment_ids, seq_len, model_params, max_seq_len):
    """
    Run one encoded sentence pair through the model and the NSP head.

    Pipeline: embeddings -> self-attention -> add & norm -> feed-forward
    -> add & norm -> linear classifier on the first ([CLS]) position.

    Args:
        token_ids: padded token ids; only the first ``seq_len`` are used.
        segment_ids: padded segment ids aligned with ``token_ids``.
        seq_len: number of real (non-padding) positions.
        model_params: parameter dict as produced by ``initialize_model``.
        max_seq_len: unused; kept so existing callers keep working.

    Returns:
        (nsp_probs, cls_output): softmax probabilities over the two
        classes and the [CLS] vector that was fed to the classifier.
    """
    # Embedding layer (already trimmed to the first seq_len positions)
    embed_output = embedding_output(
        token_ids, segment_ids,
        model_params['token_embed_matrix'],
        model_params['segment_embed_matrix'],
        model_params['pos_embed_matrix'],
        seq_len
    )

    # Padding was dropped by the embedding step, so every remaining
    # position may attend to every other one: the mask is all ones.
    padding_mask = np.ones((seq_len, seq_len))

    # Self-attention layer
    attn_output = attention_output(
        embed_output, model_params['Wq'], model_params['Wk'], model_params['Wv'],
        mask=padding_mask
    )
    norm_output = add_and_norm(embed_output, attn_output)

    # Feed-forward network
    ffn_out = ffn_output(norm_output, model_params['W1'], model_params['W2'])
    ffn_norm_output = add_and_norm(norm_output, ffn_out)

    # NSP classifier - use [CLS] representation
    cls_output = ffn_norm_output[0]  # [CLS] token is the first token
    nsp_logits = np.matmul(cls_output, model_params['W_nsp']) + model_params['b_nsp']

    # Softmax shifted by the max logit so np.exp cannot overflow.
    exp_logits = np.exp(nsp_logits - np.max(nsp_logits))
    nsp_probs = exp_logits / np.sum(exp_logits)

    return nsp_probs, cls_output

def nsp_loss(probs, label):
    """
    Cross-entropy loss for a single NSP example.

    Returns the negative log of the probability assigned to ``label``;
    1e-10 is added inside the log to avoid ``log(0)``.
    """
    target_prob = probs[label]
    return -np.log(target_prob + 1e-10)

def backward_propagation(loss, probs, label, cls_output, model_params, lr=0.01):
    """
    One SGD step on the NSP classifier head.

    Only 'W_nsp' and 'b_nsp' are updated; all other parameters are left
    untouched.

    Args:
        loss: unused; kept so existing callers keep working.
        probs: softmax output of the classifier (length 2).
        label: index of the true class.
        cls_output: the [CLS] vector that was fed to the classifier.
        model_params: parameter dict; 'W_nsp' and 'b_nsp' are modified
            in place.
        lr: learning rate.

    Returns:
        The same ``model_params`` dict, after the update.
    """
    # For softmax followed by cross-entropy, d(loss)/d(logits) is
    # ``probs - one_hot(label)``. Work on a copy so the caller's ``probs``
    # is not modified.
    d_logits = np.array(probs, dtype=float)
    d_logits[label] -= 1.0

    # logits = cls_output @ W_nsp + b_nsp, hence:
    d_W_nsp = np.outer(cls_output, d_logits)
    d_b_nsp = d_logits

    # Update parameters
    model_params['W_nsp'] -= lr * d_W_nsp
    model_params['b_nsp'] -= lr * d_b_nsp

    return model_params

def train_nsp(corpus, n_epoch=1000, max_seq_len=128):
    """
    Train the model on the NSP task and print a short evaluation.

    The corpus is split into sentences, turned into labelled sentence
    pairs, and the model is trained one example at a time for ``n_epoch``
    passes. Progress is printed every 100 epochs, followed by predictions
    for the first (up to) five pairs.

    Args:
        corpus: raw text; sentences are separated by '.'.
        n_epoch: number of passes over all pairs.
        max_seq_len: fixed length of each encoded input.

    Returns:
        (model_params, losses, accuracies): the trained parameter dict and
        the per-epoch average loss and accuracy.

    Raises:
        ValueError: if the corpus yields no sentence pairs (fewer than two
            sentences), since averages over zero pairs are undefined.
    """
    # Hyperparameters
    d_model = 16
    d_ff = 64

    # Prepare data
    sentences, vocab = data_for_nsp(corpus)
    vocab_size = len(vocab)
    token2idx, _ = tokenizer(vocab)

    # Create sentence pairs and labels
    pairs, labels = create_sentence_pairs(sentences)
    if not pairs:
        raise ValueError("corpus must contain at least two sentences to build NSP pairs")

    # Initialize model
    model_params = initialize_model(d_model, d_ff, vocab_size, max_seq_len)

    # Encoding depends only on the pair and the vocabulary, so do it once
    # instead of once per epoch.
    encoded = [prepare_nsp_input(pair, token2idx, max_seq_len) for pair in pairs]

    # Training loop
    losses = []
    accuracies = []

    for epoch in range(n_epoch):
        epoch_loss = 0
        correct = 0

        for (token_ids, segment_ids, seq_len), label in zip(encoded, labels):
            # Forward pass
            nsp_probs, cls_output = forward_pass(token_ids, segment_ids, seq_len, model_params, max_seq_len)

            # Calculate loss
            loss_value = nsp_loss(nsp_probs, label)
            epoch_loss += loss_value

            # Backward propagation
            model_params = backward_propagation(loss_value, nsp_probs, label, cls_output, model_params, lr=0.01)

            # Accuracy uses the prediction made before this example's update
            if np.argmax(nsp_probs) == label:
                correct += 1

        # Average loss and accuracy for the epoch
        avg_loss = epoch_loss / len(pairs)
        accuracy = correct / len(pairs)

        losses.append(avg_loss)
        accuracies.append(accuracy)

        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch+1}/{n_epoch}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

    # Evaluate on a few examples (these are training pairs, not held-out data)
    print("\nEvaluation on sample pairs:")
    for (sent1, sent2), label, (token_ids, segment_ids, seq_len) in zip(pairs[:5], labels, encoded):
        nsp_probs, _ = forward_pass(token_ids, segment_ids, seq_len, model_params, max_seq_len)
        predicted = np.argmax(nsp_probs)

        print(f"Sentence 1: {sent1}")
        print(f"Sentence 2: {sent2}")
        print(f"Prediction: {'IsNext' if predicted == 1 else 'NotNext'}")
        print(f"Ground truth: {'IsNext' if label == 1 else 'NotNext'}")
        print(f"Confidence: {nsp_probs[predicted]:.4f}")
        print("-" * 50)

    return model_params, losses, accuracies

# Example usage
# Runs only when the file is executed as a script: trains the NSP model on
# a small eight-sentence demo corpus for 500 epochs.
if __name__ == "__main__":
    # Sentences are separated by '.', which is what data_for_nsp splits on.
    corpus = """The dog went for a walk. It was a sunny day. The birds were singing in the trees. 
    A cat watched from a window. The flowers were blooming in the garden. 
    Children played in the park. A squirrel ran up a tree. The ice cream truck arrived."""
    
    # Returns the trained parameters plus per-epoch loss and accuracy lists.
    model_params, losses, accuracies = train_nsp(corpus, n_epoch=500)

In [None]:
# Gather the lowercase, whitespace-split words of both sentences of every
# dataset entry (the third field of each entry is ignored).
tokens = set()
for s1, s2, _ in dataset:
    tokens.update(*(text.lower().split() for text in (s1, s2)))
tokens = sorted(tokens)

# Ids 0 and 1 are reserved for [PAD] and [CLS], so words start at 2;
# [SEP] takes the next free id after everything else.
vocab = dict(zip(tokens, range(2, len(tokens) + 2)))
vocab.update({"[PAD]": 0, "[CLS]": 1})
vocab["[SEP]"] = len(vocab)


In [None]:
# Example sentence pair, lowercased and split on whitespace.
s1 = "Aku pergi ke toko".lower().split()
s2 = "Lalu aku membeli roti".lower().split()
max_len = 16

# Layout: [CLS] sentence1 [SEP] sentence2 [SEP], each token mapped to its id.
input_ids = [vocab[tok] for tok in ["[CLS]", *s1, "[SEP]", *s2, "[SEP]"]]

# Force a fixed length: cut long inputs, right-pad short ones with [PAD].
if len(input_ids) >= max_len:
    input_ids = input_ids[:max_len]
else:
    input_ids.extend([vocab["[PAD]"]] * (max_len - len(input_ids)))

input_ids = np.array(input_ids)


In [None]:
# Use the Mini-BERT that was built earlier to encode the token ids.
# NOTE(review): mini_bert_forward is not defined in this part of the
# notebook; presumably it returns one hidden vector per input position —
# confirm against its definition.
hidden_states = mini_bert_forward(input_ids)

# Take the output vector of the [CLS] token (first position).
cls_vector = hidden_states[0]


In [None]:
# Size of the [CLS] vector, e.g. 32 or 64 depending on the BERT used.
hidden_dim = cls_vector.shape[0]

# Randomly initialised (untrained) linear head with two output classes.
W = np.random.randn(hidden_dim, 2) * 0.01
b = np.zeros(2)

logits = cls_vector @ W + b

# Softmax, shifted by the max logit so np.exp cannot overflow.
probs = np.exp(logits - np.max(logits))
probs = probs / np.sum(probs)
pred = np.argmax(probs)
