In [1]:
# Imports
import numpy as np

# Hyperparameters shared by the cells below
vocab_size = 100  # number of distinct token ids (rows of the embedding table)

seq_length = 10   # number of tokens in the randomly generated input sequence

dim_model = 64    # width of each embedding vector

In [7]:
# Define the function that looks up token embeddings in a random embedding table
def embedding(input, vocab_size, dim_model):
    """Map token ids to embedding vectors drawn from a fresh random table.

    A new ``(vocab_size, dim_model)`` table of standard-normal values is
    sampled on every call, so embeddings are consistent within one call but
    are not shared between calls.

    Parameters
    ----------
    input : array_like of int
        Token ids, each in ``[0, vocab_size)``. May be empty.
    vocab_size : int
        Number of rows in the embedding table.
    dim_model : int
        Length of each embedding vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``input.shape + (dim_model,)``; an empty sequence
        yields shape ``(0, dim_model)`` so the trailing axis is preserved.

    Raises
    ------
    IndexError
        If the ids are not integers or fall outside ``[0, vocab_size)``.
    """
    # One row per vocabulary token, initialised with normally distributed values
    embed = np.random.randn(vocab_size, dim_model)

    tokens = np.asarray(input)
    if tokens.size == 0:
        # np.asarray([]) defaults to float64, which is not a valid index dtype
        tokens = tokens.astype(np.intp)
    elif not np.issubdtype(tokens.dtype, np.integer):
        raise IndexError("token ids must be integers")
    elif tokens.min() < 0 or tokens.max() >= vocab_size:
        # Negative ids would otherwise wrap around silently to the end of the table
        raise IndexError(
            f"token ids must lie in [0, {vocab_size}); "
            f"got range [{tokens.min()}, {tokens.max()}]"
        )

    # Integer-array indexing gathers all rows in one vectorised step
    return embed[tokens]

In [2]:
# Softmax Activation Function
def softmax(x):
    """Compute the softmax of ``x`` along its last axis.

    Parameters
    ----------
    x : array_like
        Logits with at least one dimension. The last axis is normalised.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)

    # Subtract each row's own maximum before exponentiating. Using a single
    # global maximum would let rows far below it underflow to all zeros and
    # then produce 0/0 = NaN in the division.
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))

    # keepdims=True leaves the reduced axis in place with length 1, so the
    # division broadcasts correctly for 1-D, 2-D and higher-rank inputs
    return e_x / e_x.sum(axis=-1, keepdims=True)

In [3]:
# Scaled dot-product attention: softmax(Q . K^T / sqrt(d_k)) . V
def scaled_dot_product_attention(Q, K, V):
    """Return the attention-weighted combination of ``V``.

    Parameters
    ----------
    Q, K, V : numpy.ndarray
        Query, key and value arrays. The caller in this notebook passes the
        same 2-D embedding array for all three.

    Returns
    -------
    numpy.ndarray
        ``np.dot(softmax(np.dot(Q, K.T) / sqrt(K.shape[-1])), V)``. The
        attention weights themselves are not returned.
    """
    # Similarity of every query row with every key row
    scores = np.dot(Q, K.T)

    # Scale factor is the square root of the key vectors' last dimension
    scale = np.sqrt(K.shape[-1])

    # Normalise the scaled scores into weights, then mix the value rows
    return np.dot(softmax(scores / scale), V)

In [4]:
# Output head: random linear projection onto the vocabulary, followed by softmax
def linear_and_softmax(input):
    """Project ``input`` onto the vocabulary and normalise with softmax.

    A new ``(dim_model, vocab_size)`` weight matrix of standard-normal values
    is sampled on every call; ``dim_model`` and ``vocab_size`` are read from
    the module-level configuration.

    Parameters
    ----------
    input : numpy.ndarray
        Array whose last axis must have length ``dim_model`` for the matrix
        product to be defined.

    Returns
    -------
    numpy.ndarray
        ``softmax`` applied to the projected logits; the last axis has
        length ``vocab_size``.
    """
    # Random projection from model space to one logit per vocabulary token
    projection = np.random.randn(dim_model, vocab_size)

    # Matrix product of the input with the projection, turned into
    # probabilities by softmax
    return softmax(np.dot(input, projection))

In [5]:
# End-to-end toy model: embed -> self-attention -> vocabulary projection -> argmax
def transformer_model(input):
    """Run the full forward pass and return one predicted token id per position.

    Parameters
    ----------
    input : array_like of int
        Token ids to feed through the model.

    Returns
    -------
    numpy.ndarray
        Index of the highest-probability vocabulary entry along the last axis
        of the output probabilities.
    """
    # Look up an embedding vector for every input token
    tokens_embedded = embedding(input, vocab_size, dim_model)

    # Self-attention: the same embeddings serve as queries, keys and values
    context = scaled_dot_product_attention(tokens_embedded, tokens_embedded, tokens_embedded)

    # Project onto the vocabulary and keep the most probable entry per position
    probabilities = linear_and_softmax(context)
    return probabilities.argmax(axis=-1)

In [8]:
# Draw seq_length random token ids from [0, vocab_size) as the demo input.
# No random seed is set in the visible cells, so the values differ between runs.
input_sequence = np.random.randint(low=0, high=vocab_size, size=seq_length)

print("Input Sequence:", input_sequence)

# Feed the sequence through the toy model to get one predicted id per token
output = transformer_model(input_sequence)

print("Model Output:", output)

Input Sequence: [85 60 27 87 46 12  7 61 85 74]
Model Output: [36 18 55 70  1 28 62  5 36 22]
