In [93]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [94]:
# Sample sentence
text = "The quick brown fox jumps over the lazy dog"
words = text.lower().split()

# Initialize Tokenizer to convert words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id) + 1  # Adding 1 for padding index

# Convert sentence to a sequence of integer IDs
word_ids = [word2id[word] for word in words]

In [95]:
word_ids

[1, 2, 3, 4, 5, 6, 1, 7, 8]

In [96]:
# Generate Skip-gram pairs
window_size = 2  # Context window size
pairs, labels = skipgrams(word_ids, vocabulary_size=vocab_size, window_size=window_size)


print("Example pairs:")
for i in range(5):
    # target, context = pairs[i] - pairs[i][0]=> target and pairs[i][1] => context 
    print(f"({id2word[pairs[i][0]]}, {id2word[pairs[i][1]]}) -> {labels[i]}")


Example pairs:
(dog, fox) -> 0
(quick, fox) -> 1
(lazy, over) -> 0
(over, the) -> 0
(brown, jumps) -> 0


In [97]:
embedding_dim = 50  # Dimension of the embedding space
# Define inputs for the target and context words
target_input = Input(shape=(1,), dtype='int32')
context_input = Input(shape=(1,), dtype='int32')

# Embedding layer
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1, name="embedding_layer")
target_embedding = embedding(target_input)
context_embedding = embedding(context_input)




In [98]:
context_embedding.shape

(None, 1, 50)

In [99]:
target_embedding.shape

(None, 1, 50)

In [100]:
# Dot product to measure similarity between target and context embeddings
dot_product = Dot(axes=-1)([target_embedding, context_embedding])
dot_product = Reshape((1,))(dot_product)

# Sigmoid output layer to predict if the pair is a context pair
output = Dense(1, activation='sigmoid')(dot_product)

# Build and compile the model
model = Model(inputs=[target_input, context_input], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')

print(model.summary())


None


In [101]:
# Access the initial weights of the embedding layer
initial_embeddings = model.get_layer("embedding_layer").get_weights()

In [102]:
len(initial_embeddings[0])

9

In [103]:
pairs

[[8, 4],
 [2, 4],
 [7, 6],
 [6, 1],
 [3, 5],
 [1, 3],
 [1, 5],
 [6, 2],
 [6, 1],
 [5, 2],
 [4, 3],
 [4, 4],
 [1, 6],
 [5, 7],
 [5, 3],
 [1, 8],
 [5, 4],
 [4, 7],
 [6, 2],
 [3, 4],
 [4, 5],
 [7, 4],
 [8, 4],
 [3, 4],
 [7, 1],
 [4, 7],
 [8, 1],
 [1, 3],
 [2, 8],
 [6, 5],
 [3, 5],
 [2, 1],
 [8, 7],
 [4, 2],
 [2, 4],
 [6, 4],
 [2, 3],
 [2, 7],
 [1, 2],
 [7, 6],
 [5, 6],
 [1, 6],
 [3, 2],
 [1, 6],
 [3, 1],
 [1, 2],
 [1, 3],
 [1, 7],
 [6, 7],
 [5, 6],
 [7, 8],
 [5, 2],
 [4, 6],
 [6, 2],
 [5, 1],
 [7, 3],
 [1, 6],
 [3, 8],
 [3, 4],
 [4, 5]]

In [104]:
# Prepare the inputs and labels for training
target_words = np.array([pair[0] for pair in pairs], dtype='int32')
context_words = np.array([pair[1] for pair in pairs], dtype='int32')

In [105]:
context_words

array([4, 4, 6, 1, 5, 3, 5, 2, 1, 2, 3, 4, 6, 7, 3, 8, 4, 7, 2, 4, 5, 4,
       4, 4, 1, 7, 1, 3, 8, 5, 5, 1, 7, 2, 4, 4, 3, 7, 2, 6, 6, 6, 2, 6,
       1, 2, 3, 7, 7, 6, 8, 2, 6, 2, 1, 3, 6, 8, 4, 5], dtype=int32)

In [106]:
target_words

array([8, 2, 7, 6, 3, 1, 1, 6, 6, 5, 4, 4, 1, 5, 5, 1, 5, 4, 6, 3, 4, 7,
       8, 3, 7, 4, 8, 1, 2, 6, 3, 2, 8, 4, 2, 6, 2, 2, 1, 7, 5, 1, 3, 1,
       3, 1, 1, 1, 6, 5, 7, 5, 4, 6, 5, 7, 1, 3, 3, 4], dtype=int32)

In [107]:
labels = np.array(labels, dtype='int32')

In [108]:
labels

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int32)

In [109]:
# Train the model
model.fit([target_words, context_words], labels, epochs=100, batch_size=64)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - loss: 0.6932
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.6931
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.6931
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.6930
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.6930
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.6929
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.6929
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.6928
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.6927
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.6927
Epoch 11

<keras.src.callbacks.history.History at 0x15b1e28d0>

In [110]:
# Extract word embeddings
word_embeddings = model.get_layer('embedding_layer').get_weights()[0]


In [111]:

# Display embeddings for each word
for word, idx in word2id.items():
    print(f"{word}: {word_embeddings[idx]}")

# Optionally, save the embeddings for future use
np.save("word_embeddings.npy", word_embeddings)

the: [ 0.22783197  0.12957871  0.21390747 -0.17053804  0.10376602 -0.08613596
  0.20308611  0.12903692  0.03685808  0.00921578 -0.08957214 -0.14974369
  0.06510255  0.25735277 -0.28280315  0.09613296 -0.20916896  0.08487008
 -0.07253682 -0.29879108 -0.01111928  0.07085732  0.1809975   0.12609911
  0.24022052 -0.34855828  0.08611176  0.17029685  0.23318852  0.16632654
 -0.11691743  0.14784802  0.25143316 -0.23180293 -0.17881784 -0.03674853
 -0.08406601  0.10525127 -0.03709362  0.11429724 -0.17650415  0.22251196
 -0.29491422 -0.13888995 -0.2919514   0.2195002   0.25723886  0.06063935
  0.17626472  0.06257423]
quick: [-0.23472063  0.1615324  -0.10270863  0.2051066   0.13953406 -0.1955011
 -0.00847828 -0.31697652 -0.06857249 -0.4158084   0.34350595  0.22592224
 -0.40323812 -0.17187224  0.08873279 -0.33600223  0.03542912  0.3636165
  0.19307414  0.0388247   0.40843257  0.40250927 -0.21180703 -0.03580203
  0.05718663  0.0808517  -0.0363206  -0.15594023 -0.12849638  0.30706027
 -0.05862397  0

In [112]:
word = 'quick'
word_embedding = word_embeddings[word2id[word]].reshape(1, -1)
word_embeddinga = word_embeddings[word2id[word]]

print(word_embedding.shape, word_embeddinga.shape)


(1, 50) (50,)


In [113]:
word_embeddings.shape

(9, 50)

In [114]:
def find_similar_words(word, top_n=3):
    if word not in word2id:
        print(f"'{word}' not in vocabulary.")
        return []
    
    # Get the embedding of the given word
    word_embedding = word_embeddings[word2id[word]].reshape(1, -1)
    
    # Calculate cosine similarities
    similarities = cosine_similarity(word_embedding, word_embeddings)
    print(similarities)
    similarities = similarities.flatten()
    print(similarities)
    # Exclude the word itself and get top N indices
    similar_indices = similarities.argsort()[-top_n-1:-1]  # Sorted in ascending order
    similar_indices = list(similar_indices)  # Convert to list to use reverse()
    similar_indices.reverse()  # Reverse the list in place for descending order

    # Get the similar words and their similarities
    similar_words = [(id2word[i], similarities[i]) for i in similar_indices if i in id2word]
    
    return similar_words


In [115]:
# Test similar words to "dog"
print("Words similar to 'dog':")
print(find_similar_words("dog"))


Words similar to 'quick':
[[-0.12907034  0.8442282  -0.6272927  -0.8342166  -0.95115316  0.1184348
   0.03436682  0.98760486  1.0000001 ]]
[-0.12907034  0.8442282  -0.6272927  -0.8342166  -0.95115316  0.1184348
  0.03436682  0.98760486  1.0000001 ]
[('lazy', np.float32(0.98760486)), ('the', np.float32(0.8442282)), ('jumps', np.float32(0.1184348))]
