In [26]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dot, Activation, Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import random
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA # For dimensionality reduction
import matplotlib.pyplot as plt # For plotting

In [27]:
# Sample Corpus
corpus = [
    "natural language processing is fun and interesting",
    "implementing skipgram in python is a good exercise",
    "word embeddings capture semantic relationships",
    "skipgram with negative sampling is efficient for large vocabularies",
    "language models are powerful",
    "python is a versatile programming language"
]


In [28]:
# 1. Data Preparation

# Tokenization and Vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
sequences = tokenizer.texts_to_sequences(corpus)

In [29]:
# Flatten the sequences to easily pick target words
flat_sequence = [word for sublist in sequences for word in sublist]

In [30]:
# Generate Target-Context and Negative Samples
def generate_skipgram_pairs(sequence, window_size, num_negative_samples, vocab_size):
    positive_pairs = []
    labels = []
    for i, target_word_id in enumerate(sequence):
        context_window_start = max(0, i - window_size)
        context_window_end = min(len(sequence), i + window_size + 1)
        for j in range(context_window_start, context_window_end):
            if i != j:
                context_word_id = sequence[j]
                # Positive pair
                positive_pairs.append([target_word_id, context_word_id])
                labels.append(1)

                # Negative samples
                for _ in range(num_negative_samples):
                    negative_word_id = random.randint(1, vocab_size - 1) # Exclude padding index 0
                    while negative_word_id == target_word_id or negative_word_id in sequence[context_window_start:context_window_end]:
                         negative_word_id = random.randint(1, vocab_size - 1)
                    positive_pairs.append([target_word_id, negative_word_id])
                    labels.append(0)
    return np.array(positive_pairs), np.array(labels)

In [31]:

window_size = 2
num_negative_samples = 4 # Number of negative samples per positive pair

In [32]:
# Generate training data
X, y = generate_skipgram_pairs(flat_sequence, window_size, num_negative_samples, vocab_size)

In [33]:
# Separate target and context/negative words for model input
X_target = X[:, 0]
X_context = X[:, 1]

In [34]:
# 2. Model Architecture

embed_size = 10  # Dimension of word embeddings

# Input layers for target and context/negative words
input_target = Input(shape=(1,))
input_context = Input(shape=(1,))

In [35]:
# Embedding layer for target words
# Use separate embedding matrices for target and context for better performance
embedding_layer_target = Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=1, name='target_embedding')
embedding_target = embedding_layer_target(input_target)
embedding_target = Reshape((embed_size,))(embedding_target) # Flatten the embedding

In [36]:
# Embedding layer for context/negative words
embedding_layer_context = Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=1, name='context_embedding')
embedding_context = embedding_layer_context(input_context)
embedding_context = Reshape((embed_size,))(embedding_context) # Flatten the embedding


In [37]:
# Compute the dot product of the target and context/negative embeddings
dot_product = Dot(axes=-1)([embedding_target, embedding_context])

In [38]:
# Sigmoid activation to get the probability
output = Activation('sigmoid')(dot_product)

In [39]:
# Define the model
model = Model(inputs=[input_target, input_context], outputs=output)

In [40]:
# 3. Training with Negative Sampling

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

None


In [41]:
# Train the model
# We train on the generated pairs with binary labels (1 for positive, 0 for negative)
model.fit([X_target, X_context], y, epochs=50, verbose=1)

Epoch 1/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5145 - loss: 0.6931
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6768 - loss: 0.6925
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7579 - loss: 0.6918 
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7967 - loss: 0.6907 
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8364 - loss: 0.6882
Epoch 6/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8310 - loss: 0.6855
Epoch 7/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8317 - loss: 0.6802
Epoch 8/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8344 - loss: 0.6727
Epoch 9/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7d3fd63a6690>

In [42]:

# Get Word Embeddings
# You can get the learned embeddings from the target embedding layer
target_embedding_matrix = model.get_layer('target_embedding').get_weights()[0]

In [43]:
# Function to get the embedding for a word
def get_word_embedding(word, word_index, embedding_matrix):
    if word in word_index:
        return embedding_matrix[word_index[word]]
    else:
        return None

In [44]:
# Example Usage (Cosine Similarity) - Requires the function from the previous response
from scipy.spatial.distance import cosine

def calculate_cosine_similarity(word1, word2, word_index, embedding_matrix):
    embedding1 = get_word_embedding(word1, word_index, embedding_matrix)
    embedding2 = get_word_embedding(word2, word_index, embedding_matrix)

    if embedding1 is not None and embedding2 is not None:
        return 1 - cosine(embedding1, embedding2)
    else:
        return None


In [45]:
word1 = 'python'
word2 = 'skipgram'
similarity = calculate_cosine_similarity(word1, word2, word_index, target_embedding_matrix)

if similarity is not None:
    print(f"\nCosine similarity between '{word1}' and '{word2}': {similarity}")
else:
    print(f"\nOne or both words ('{word1}', '{word2}') not in vocabulary.")


Cosine similarity between 'python' and 'skipgram': 0.767221987247467


In [46]:

word3 = 'language'
word4 = 'embeddings'
similarity2 = calculate_cosine_similarity(word3, word4, word_index, target_embedding_matrix)

if similarity2 is not None:
     print(f"Cosine similarity between '{word3}' and '{word4}': {similarity2}")
else:
    print(f"\nOne or both words ('{word3}', '{word4}') not in vocabulary.")

# Note: Similar to CBOW, training on a small corpus will result in
# less meaningful embeddings.

Cosine similarity between 'language' and 'embeddings': 0.19705462455749512


In [47]:
# 5. Evaluation

# Cosine Similarity Examples
print("\n--- Word Similarity Evaluation (Cosine Similarity) ---")
word_pairs = [('python', 'skipgram'), ('language', 'embeddings'), ('processing', 'python'), ('fun', 'exercise')]
for word1, word2 in word_pairs:
    similarity = calculate_cosine_similarity(word1, word2, word_index, target_embedding_matrix)
    if similarity is not None:
        print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")
    else:
        print(f"One or both words ('{word1}', '{word2}') not in vocabulary.")



--- Word Similarity Evaluation (Cosine Similarity) ---
Cosine similarity between 'python' and 'skipgram': 0.7672
Cosine similarity between 'language' and 'embeddings': 0.1971
Cosine similarity between 'processing' and 'python': 0.7097
Cosine similarity between 'fun' and 'exercise': 0.3012


In [48]:
print("\n--- Other Evaluation Methods ---")
print("Beyond cosine similarity and visualization, word embeddings can be evaluated using:")
print("Intrinsic Evaluation:")
print("- Word Similarity Benchmarks: Comparing embedding similarity to human ratings on datasets (e.g., MEN, WS-353).")
print("- Word Analogy Tasks: Testing if vector relationships hold for analogies (e.g., 'king' - 'man' + 'woman' ≈ 'queen').")
print("- Word Categorization/Clustering: Assessing if semantically similar words group together.")
print("\nExtrinsic Evaluation:")
print("Evaluating embedding performance when used as features in downstream NLP tasks such as:")
print("- Text Classification (e.g., sentiment analysis)")
print("- Named Entity Recognition (NER)")
print("- Machine Translation")
print("The performance on these tasks indicates the practical utility of the embeddings.")




--- Other Evaluation Methods ---
Beyond cosine similarity and visualization, word embeddings can be evaluated using:
Intrinsic Evaluation:
- Word Similarity Benchmarks: Comparing embedding similarity to human ratings on datasets (e.g., MEN, WS-353).
- Word Analogy Tasks: Testing if vector relationships hold for analogies (e.g., 'king' - 'man' + 'woman' ≈ 'queen').
- Word Categorization/Clustering: Assessing if semantically similar words group together.

Extrinsic Evaluation:
Evaluating embedding performance when used as features in downstream NLP tasks such as:
- Text Classification (e.g., sentiment analysis)
- Named Entity Recognition (NER)
- Machine Translation
The performance on these tasks indicates the practical utility of the embeddings.
