In [1]:
import re
import numpy as np
from gensim.models import Word2Vec

# Toy corpus: one sentence, lower-cased and split into runs of word characters.
text = "The animal did not cross the street because it was tired."
tokens = re.findall(r"\w+", text.lower())

# Word2Vec takes an iterable of token lists; the corpus here is a single sentence.
sentences = [tokens]

# Skip-gram (sg=1) embeddings of width 64; min_count=1 keeps every token.
# seed + workers=1 make training repeatable across Restart & Run All: with
# several worker threads the update order (and so the vectors) varies per run.
# NOTE(review): gensim's docs say exact cross-interpreter reproducibility may
# additionally need PYTHONHASHSEED to be fixed -- confirm if that matters here.
word2vec_model = Word2Vec(
    sentences,
    vector_size=64,
    window=3,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

# Stack one embedding row per token, in sentence order.
X = np.array([word2vec_model.wv[word] for word in tokens])  # (seq_len, d_model)

def positional_encoding(seq_len, d_model):
    """Build the sinusoidal positional-encoding matrix.

    Even column ``i`` holds ``sin(pos / 10000 ** (i / d_model))`` and the
    following odd column holds the cosine of the same angle.

    Args:
        seq_len: Number of positions (rows).
        d_model: Encoding width (columns). May be odd: the last column is then
            a sine column with no cosine partner (the loop-based version
            raised IndexError in that case).

    Returns:
        Float array of shape ``(seq_len, d_model)``.
    """
    positions = np.arange(seq_len)[:, np.newaxis]   # (seq_len, 1)
    even_dims = np.arange(0, d_model, 2)            # 0, 2, 4, ...
    angles = positions / (10000.0 ** (even_dims / d_model))

    PE = np.zeros((seq_len, d_model))
    PE[:, 0::2] = np.sin(angles)
    # For odd d_model there is one fewer odd column than even columns.
    PE[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return PE

# Hyperparameters are defined before their first use so the positional encoding
# and the projection matrices all read the same d_model (no stray literal 64).
d_model = 64
num_heads = 8
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
d_k = d_v = d_model // num_heads
seq_len = X.shape[0]
assert X.shape[1] == d_model, "embedding width must equal d_model"

# Add position information to the token embeddings.
X = X + positional_encoding(seq_len, d_model)

# Seed the global NumPy RNG so the random weights below are identical on every
# Restart & Run All.
np.random.seed(42)

# Projection weights, scaled by 1/sqrt(d_model).
W_Q = np.random.randn(d_model, d_model) / np.sqrt(d_model)
W_K = np.random.randn(d_model, d_model) / np.sqrt(d_model)
W_V = np.random.randn(d_model, d_model) / np.sqrt(d_model)
W_O = np.random.randn(d_model, d_model) / np.sqrt(d_model)

# Linear projections: each is (seq_len, d_model).
Q = np.matmul(X, W_Q)
K = np.matmul(X, W_K)
V = np.matmul(X, W_V)

# Split the feature axis into heads: (seq_len, d_model) -> (num_heads, seq_len, d_k).
Q = Q.reshape(seq_len, num_heads, d_k).transpose(1,0,2)
K = K.reshape(seq_len, num_heads, d_k).transpose(1,0,2)
V = V.reshape(seq_len, num_heads, d_v).transpose(1,0,2)

def softmax(x):
    """Softmax over the last axis.

    The per-row maximum is subtracted before exponentiating so large inputs
    do not overflow; the result is unchanged by that shift.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

# Scaled dot-product attention, computed one head at a time.
# Note: `h`, `scores` and `weights` stay bound after the loop and are reused
# by the masked-attention cell further down.

heads = []
attention_weights = []

for h in range(num_heads):
    # (seq_len, d_k) @ (d_k, seq_len) -> (seq_len, seq_len), scaled by sqrt(d_k)
    scores = (Q[h] @ K[h].T) / np.sqrt(d_k)
    weights = softmax(scores)
    head = weights @ V[h]
    attention_weights.append(weights)
    heads.append(head)

# (num_heads, seq_len, seq_len)
attention_weights = np.array(attention_weights)

# Join the heads back along the feature axis, then apply the output projection.
concat = np.concatenate(heads, axis=-1)
output = concat @ W_O

# Residual connection around the attention sub-layer.
residual_1 = X + output

def layer_norm(x, eps=1e-6):
    """Normalise each row of ``x`` over its last axis.

    Subtracts the mean and divides by ``std + eps``; ``eps`` is added to the
    standard deviation itself, which keeps constant rows from dividing by zero.
    No learned scale or shift is applied.
    """
    centered = x - np.mean(x, axis=-1, keepdims=True)
    spread = np.std(x, axis=-1, keepdims=True)
    return centered / (spread + eps)

# Layer-normalise the attention sub-layer's residual sum.
Z1 = layer_norm(residual_1)


# Feed-forward network sizes. d_model is re-assigned here to the same value (64)
# it already has above.
d_model = 64  # width of the FFN input and output
d_ff = 256  # width of the FFN hidden layer

# First layer: (d_model, d_ff) weights scaled by 1/sqrt(d_model), zero bias.
W1 = np.random.randn(d_model, d_ff) / np.sqrt(d_model)
b1 = np.zeros(d_ff)

# Second layer: (d_ff, d_model) weights scaled by 1/sqrt(d_ff), zero bias.
W2 = np.random.randn(d_ff, d_model) / np.sqrt(d_ff)
b2 = np.zeros(d_model)

def relu(x):
    """Rectified linear unit: elementwise maximum of ``x`` and zero."""
    return np.maximum(x, 0)

def feed_forward(x):
    """Two-layer feed-forward network: ``relu(x @ W1 + b1) @ W2 + b2``.

    Reads the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``.
    """
    pre_activation = x @ W1 + b1
    activated = relu(pre_activation)
    return activated @ W2 + b2

# Apply the feed-forward network to the normalised attention output.
ffn_output = feed_forward(Z1)

# Residual connection around the feed-forward sub-layer, then layer norm.
residual_2 = Z1 + ffn_output
Z2 = layer_norm(residual_2)

In [5]:
# Causal Mask Matrix

def causal_mask(seq_len):
    """Additive causal mask of shape ``(seq_len, seq_len)``.

    Entries strictly above the diagonal (column > row) are ``-1e9``; all other
    entries are zero. Adding it to attention scores before softmax drives the
    weights on later positions to effectively zero.
    """
    future = np.triu(np.ones((seq_len, seq_len)), k=1)
    return -1e9 * future

In [8]:
# Q is (num_heads, seq_len, d_k) after the head split, so axis 1 is the sequence length.
seq_len = Q.shape[1]   # or len(tokens)
mask = causal_mask(seq_len)

In [9]:
# Sanity check: the mask must be (seq_len, seq_len) to be added to one head's score matrix.
mask.shape == (seq_len, seq_len)

True

In [10]:
# Causally masked attention weights for a single head.
# `h` is not set in this cell: it is the index left over from the per-head loop
# in the first cell, i.e. the last head, assuming the cells are run in order.
scores = np.matmul(Q[h], K[h].T)
scores = scores / np.sqrt(d_k)
scores = scores + mask   # works now: mask and scores are both (seq_len, seq_len)
weights = softmax(scores)