<a href="https://colab.research.google.com/github/SrihariR2004/Basics_Transformers/blob/main/Transformers_QKV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import math
import string

In [2]:
sentences = [
    "I love coding.",
    "You enjoy AI.",
    "AI is amazing.",
    "Coding is fun.",
    "I enjoy learning."
]

# Parameters


In [43]:
embedding_dim = 4  # Embedding size
head_dim = 2       # Attention head size
np.random.seed(42)  # To make the example reproducible


# Random embeddings for each word

In [42]:
def generate_embeddings(word):
    return np.random.rand(embedding_dim)

In [60]:
vocabulary = [
    "I", "love", "coding", "You", "enjoy", "AI", "is", "amazing", "fun", "learning"
]

#**Step 1: Preprocess the sentences (tokenization & embeddings)**

In [44]:
def preprocess_sentence(sentence):
    words = sentence.split()
    return np.array([generate_embeddings(word) for word in words])

# Convert sentences to embeddings (list of matrices)

In [45]:
X = [preprocess_sentence(sentence) for sentence in sentences]

# Random weight matrices for Q, K, V

In [46]:
W_q = np.random.rand(embedding_dim, head_dim)  # Q matrix
W_k = np.random.rand(embedding_dim, head_dim)  # K matrix
W_v = np.random.rand(embedding_dim, head_dim)  # V matrix

#Output weight matrix

In [47]:
W_vocab = np.random.rand(head_dim, embedding_dim)

# **Step 2: Apply self-attention mechanism to each sentence**

In [49]:
def apply_attention(X):
    Q = X @ W_q  # shape (num_tokens, head_dim)
    K = X @ W_k  # shape (num_tokens, head_dim)
    V = X @ W_v  # shape (num_tokens, head_dim)
    # Compute attention scores (scaled dot-product attention)
    scores = Q @ K.T / math.sqrt(head_dim)  # shape (num_tokens, num_tokens)
    attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)  # softmax

    # Compute the attention output
    attention_output = attention_weights @ V  # shape (num_tokens, head_dim)

    return attention_output

# **Step 3: Perform attention on each sentence and predict next word**

In [52]:
for i, sentence_embeddings in enumerate(X):
    attention_output = apply_attention(sentence_embeddings)
    final_vector = attention_output[-1]  # Use the output of the last word in the sentence

    # Compute logits for the next word prediction (map back to original word space)
    # Original: logits = final_vector @ W_vocab.T  # shape (embedding_dim,)
    # Changed: Transpose final_vector to match W_vocab's dimensions for multiplication

    logits = final_vector.T @ W_vocab # shape (embedding_dim,)
    probs = np.exp(logits) / np.sum(np.exp(logits))  # softmax to get probabilities

    # Show results
    print(f"Sentence: {sentences[i]}")
    print("Next word probabilities (normalized):")
    print(probs)

    # Since we don't have a vocabulary list here, we'll pick the highest-probability word
    predicted_word = f"Predicted Next Word (index {np.argmax(probs)})"
    print(f"Predicted Next Word: {predicted_word}\n")

Sentence: I love coding.
Next word probabilities (normalized):
[0.22524353 0.19621359 0.27253067 0.30601221]
Predicted Next Word: Predicted Next Word (index 3)

Sentence: You enjoy AI.
Next word probabilities (normalized):
[0.229503   0.2066897  0.26997677 0.29383052]
Predicted Next Word: Predicted Next Word (index 3)

Sentence: AI is amazing.
Next word probabilities (normalized):
[0.2345632  0.18701278 0.2558757  0.32254833]
Predicted Next Word: Predicted Next Word (index 3)

Sentence: Coding is fun.
Next word probabilities (normalized):
[0.23592466 0.20990959 0.26207982 0.29208592]
Predicted Next Word: Predicted Next Word (index 3)

Sentence: I enjoy learning.
Next word probabilities (normalized):
[0.2232394  0.18291714 0.26983762 0.32400583]
Predicted Next Word: Predicted Next Word (index 3)



In [58]:
def predict_next_word(sentence_embeddings):
    attention_output = apply_attention(sentence_embeddings)
    final_vector = attention_output[-1]  # Use the output of the last word in the sentence

    # Transpose final_vector instead of W_vocab to match the dimensions
    logits = final_vector.T @ W_vocab  # shape (embedding_dim,)

    probs = np.exp(logits) / np.sum(np.exp(logits))  # softmax to get probabilities

    # Return the index of the word with the highest probability
    return np.argmax(probs)

# **Step 4: Predict for a new sentence**

In [65]:
new_sentence = "I enjoy coding"

# Preprocess the new sentence (convert to embeddings)
new_sentence_embeddings = preprocess_sentence(new_sentence)

# Predict next word for the new sentence
predicted_index = predict_next_word(new_sentence_embeddings)

# Output the result as a word (instead of index)
predicted_word = vocabulary[predicted_index]

# Display the results
print(f"New Sentence: {new_sentence}")
print(f"Predicted Next Word: {predicted_word}")

New Sentence: I enjoy coding
Predicted Next Word: You
