We will build the Skip-gram and CBOW models from scratch and train them on a relatively small corpus, namely the BBC news dataset.

In [None]:
import numpy as np
import keras.backend as K
import tensorflow as tf
import operator
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd


In [None]:
# Load the BBC dataset; the printed frame shows two columns, `category` and `text`.
# NOTE(review): the path assumes the CSV sits under /content — adjust for other environments.
df = pd.read_csv('/content/bbc-text.csv')
print(df)

# Raw article bodies as a plain Python list, one string per CSV row.
# (The former `sentences = ''` placeholder was dropped: it was never read and is
# rebound to a list in the corpus-preparation cell.)
articles = df['text'].tolist()

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


## Skip Gram

In [None]:
%%time

# Split the first 80 articles into rough sentences on '.'
sentences = []
for article in articles[:80]:
    sentences.extend(article.split('.'))

# Keep only sentences with at least 3 whitespace-separated words. Counting words
# (instead of spaces) avoids letting through short fragments that merely start
# with a space after the '.' split or contain doubled spaces.
corpus = [sentence for sentence in sentences if len(sentence.split()) >= 3]

# Remove punctuation in text and fit tokenizer on entire corpus. The backslash is
# written as '\\' so the literal holds the same characters as before without
# relying on the invalid escape sequence '\]'.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + "'")
tokenizer.fit_on_texts(corpus)

# Convert each sentence to a list of integer word ids
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus)  # Total number of tokens in the corpus
V = len(tokenizer.word_index) + 1  # Vocabulary size + 1: word ids start at 1, id 0 is used as padding by CBOW

CPU times: user 101 ms, sys: 1.59 ms, total: 103 ms
Wall time: 112 ms


In [None]:
n_samples, V

(29158, 5368)

In [None]:
# Example of how word to integer mapping looks like in the tokenizer
print(list((tokenizer.word_index.items()))[:5])

[('the', 1), ('to', 2), ('of', 3), ('a', 4), ('and', 5)]


In [None]:

# Parameters
window_size = 2  # Number of context words taken on each side of the center word
window_size_corpus = 4  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Set seeds for reproducible results. Seeding NumPy alone is not enough here:
# the models are built and trained with TensorFlow/Keras, which uses its own
# random generator (e.g. for weight initialisation), so seed that as well.
np.random.seed(42)
tf.random.set_seed(42)


In [None]:

# Prepare data for the skipgram model
def generate_data_skipgram(corpus, window_size, V):
    """Build (center word, one-hot context word) training pairs for skip-gram.

    For every position in every sentence, each neighbour within `window_size`
    positions on either side (clipped to the sentence bounds) yields one pair.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of context positions considered on each side.
        V: width of the one-hot vectors; every word id must be < V.

    Returns:
        Tuple `(all_in, all_out)`: `all_in` is an integer array of shape
        `(n_pairs,)` holding center word ids, and `all_out` is a float32
        one-hot array of shape `(n_pairs, V)` holding the context words.
        For an empty corpus the shapes are `(0,)` and `(0, V)`.
    """
    centers = []
    contexts = []
    for words in corpus:
        n_words = len(words)
        for index, word in enumerate(words):
            # Clip the window to the sentence so no bounds check is needed inside
            first = max(0, index - window_size)
            last = min(n_words, index + window_size + 1)
            for i in range(first, last):
                if i != index:
                    centers.append(word)
                    contexts.append(words[i])

    all_in = np.array(centers, dtype=int)
    context_ids = np.array(contexts, dtype=int)

    # Build the whole one-hot matrix in a single vectorised step instead of
    # allocating one small array per pair.
    all_out = np.zeros((len(context_ids), V), dtype=np.float32)
    all_out[np.arange(len(context_ids)), context_ids] = 1.0

    return (all_in, all_out)

In [None]:
%%time

# Create training data
# X_skip holds one center word id per pair; y_skip is a dense one-hot matrix
# with one row per pair and V columns, so its size grows with n_pairs * V.
X_skip, y_skip = generate_data_skipgram(corpus, window_size, V)
X_skip.shape, y_skip.shape

CPU times: user 1.22 s, sys: 1.43 s, total: 2.64 s
Wall time: 2.65 s


((108016,), (108016, 5368))

In [None]:
%%time

# Create skipgram architecture

dim = 100  # Size of each word vector

# Skip-gram network: one word id in, a softmax over the vocabulary out.
skipgram = Sequential([
    # Lookup table mapping each of the V word ids to a `dim`-sized vector
    Embedding(input_dim=V,
              output_dim=dim,
              input_length=1,
              embeddings_initializer='glorot_uniform'),
    # Drop the length-1 sequence axis: (1, dim) -> (dim,)
    Reshape((dim, )),
    # Score every vocabulary entry as a possible context word
    Dense(V, activation='softmax', kernel_initializer='glorot_uniform'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
# NOTE(review): an earlier comment said the paper used Adagrad; Adam is what is configured here.
skipgram.compile(optimizer=keras.optimizers.Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

skipgram.summary()
print("")

# Later cells iterate over this list when training and exporting
skipgram_models = [skipgram]

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 100)            536800    
                                                                 
 reshape (Reshape)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 5368)              542168    
                                                                 
Total params: 1078968 (4.12 MB)
Trainable params: 1078968 (4.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

CPU times: user 646 ms, sys: 235 ms, total: 881 ms
Wall time: 2.73 s


In [None]:
%%time


# Train every model in `skipgram_models` on the (center word, one-hot context) pairs.
# The loop variable rebinds the global name `skipgram`, which later cells read.
for skipgram in skipgram_models:
    skipgram.fit(X_skip, y_skip, batch_size=64, epochs=15, verbose=1)
    print("")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

CPU times: user 2min 12s, sys: 15.5 s, total: 2min 28s
Wall time: 3min 27s


In [None]:

for skipgram in skipgram_models:
    weights = skipgram.get_weights()

    # The first weight array belongs to the Embedding layer: one row per word id
    embedding = weights[0]
    n_dims = embedding.shape[1]

    # Header row: "word" followed by one column name per vector component,
    # which makes the file easy to read back as a dataframe
    columns = ["word"] + [f"value_{j + 1}" for j in range(n_dims)]

    # `with` closes the file even if a write fails; utf-8 matches the encoding
    # used when the file is read back for the nearest-word lookup.
    with open(f"vectors_skipgram_{n_dims}.txt", "w", encoding="utf-8") as f:
        f.write(" ".join(columns))
        f.write("\n")

        # One line per vocabulary word: the word, then its vector components
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, embedding[i, :])))
            f.write("\n")

In [None]:
skipgram.get_weights()[0]

array([[-0.0280514 ,  0.00633277, -0.0297009 , ..., -0.01276754,
        -0.00261297,  0.01973111],
       [ 0.2016724 , -0.34099865,  0.06056718, ...,  0.05937553,
         0.6281242 ,  0.2644408 ],
       [-0.59093523, -0.08562579,  0.26864326, ...,  0.1961121 ,
        -0.18560983,  0.04168729],
       ...,
       [-0.3091724 ,  0.20727606,  0.13891563, ..., -0.49539888,
        -0.28663093,  0.19539887],
       [ 0.27804348, -0.03453066, -0.06964883, ..., -0.13181476,
         0.06794297, -0.14912298],
       [-0.08548996, -0.12734386,  0.01529881, ..., -0.12156113,
         0.501686  ,  0.36864257]], dtype=float32)

In [None]:
len(skipgram.get_weights())

3

In [None]:
len(skipgram.get_weights()[0])

5368

In [None]:
len(skipgram.get_weights()[0][0])

100

In [None]:
skipgram.get_weights()[0][1]

array([ 0.2016724 , -0.34099865,  0.06056718, -0.02997063, -0.2095625 ,
       -0.10585789,  0.12256892,  0.16617462,  0.3222422 , -0.23193647,
       -0.19682544, -0.02064507,  0.14807023,  0.23470451,  0.10948326,
       -0.25430372,  0.21665928,  0.3309034 , -0.21474394, -0.17216541,
        0.1988407 ,  0.2989586 , -0.31570372,  0.1974094 ,  0.32606086,
        0.24848923, -0.0194772 ,  0.33561116, -0.17210004, -0.16591637,
       -0.18304642, -0.01622173, -0.18109913,  0.05863833,  0.07199619,
        0.22383435, -0.09396132,  0.2805166 , -0.2570826 ,  0.19285458,
       -0.01536875, -0.3160898 ,  0.09866587, -0.03627656, -0.09316628,
        0.21988797,  0.04269576, -0.08087586, -0.29026258,  0.06641474,
        0.07396381,  0.1643963 ,  0.4445107 , -0.46609503,  0.2517619 ,
       -0.32623035,  0.2524388 ,  0.5811687 , -0.0214692 , -0.14889832,
        0.04396792, -0.23036313, -0.20201139,  0.00559659, -0.25663364,
        0.16170435,  0.2641098 , -0.11385378,  0.07858003,  0.07

To get the word embedding:

In [None]:
index = tokenizer.word_index['king']

In [None]:
skipgram.get_weights()[0][index]

array([ 0.3329974 , -0.34593946,  0.21893133, -0.00926998,  0.5448755 ,
       -0.25804842, -0.31043002,  0.5402896 ,  0.18649508,  0.19944073,
       -0.125581  , -0.20627189, -0.37291726, -0.06565852, -0.20601559,
       -0.36603346,  0.21348958, -0.00171248, -0.09430709, -0.09572998,
        0.32611454,  0.24519673, -0.16359042,  0.12024733, -0.26432282,
       -0.34060258, -0.07175371,  0.69065845, -0.32866818, -0.04020369,
        0.32348067, -0.74996585,  0.23313628, -0.4581679 ,  0.07766113,
        0.05767743, -0.04669981, -0.29721177,  0.6522683 , -0.04243769,
       -0.5192642 ,  0.36859947,  0.21754432, -0.03036747,  0.41784155,
       -0.17789076,  0.53237957, -0.66123646,  0.18967815,  0.33967313,
       -0.18866128,  0.15688777, -0.16077752, -0.44108018, -0.2192727 ,
        0.22307149,  0.33844784, -0.11983999, -0.03577344, -0.16427666,
        0.5541761 , -0.2896473 , -0.49448028,  0.21642114, -0.40521762,
       -0.71553487, -0.5385699 , -0.29812378, -0.15177771, -0.59

In [None]:
# Find the k nearest words (by cosine similarity) to a target word, using the
# skip-gram vectors exported to disk by the earlier cell.
# `np` and `cosine_similarity` come from the import cell at the top of the notebook.
target_word = "prince"
k = 10  # Number of nearest words you want to find

# Load the exported vectors into a {word: vector} dictionary. The file is read
# from the same relative location the export cell writes to.
word_vectors = {}
with open("vectors_skipgram_100.txt", "r", encoding="utf-8") as file:
    next(file, None)  # Skip the header row with the column names
    for line in file:
        parts = line.strip().split()
        if len(parts) < 2:
            continue  # Ignore blank or malformed lines
        word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])

# Raises KeyError if the target word is not in the vocabulary
target_vector = word_vectors[target_word]

# Score every other word with a single batched call instead of one
# cosine_similarity call per vocabulary entry.
other_words = [word for word in word_vectors if word != target_word]
if other_words:
    other_matrix = np.array([word_vectors[word] for word in other_words])
    scores = cosine_similarity([target_vector], other_matrix)[0]
    similarities = dict(zip(other_words, scores))
else:
    similarities = {}

# Sort the words by their cosine similarity scores in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top-k words as the k-nearest words
nearest_words = [word for word, _ in sorted_similarities[:k]]

# Print the k-nearest words
print(f"The {k} nearest words to '{target_word}' are: ")
for neighbour in nearest_words:
    print(neighbour)

# Keep the skip-gram vectors under their own name; `word_vectors` is reused for CBOW
skipgram_word_emd = word_vectors

The 10 nearest words to 'prince' are: 
grandson
kennedy
princes
camilla
mood
charles
rowntree
ros
becomes
22


## CBOW

In [None]:

from keras.preprocessing import sequence

# Prepare the data for the CBOW model
def generate_data_cbow(corpus, window_size, V):
    """Build (context window, one-hot target word) training pairs for CBOW.

    For every position in every sentence, the context is the `window_size`
    word ids on each side of the target. Positions that fall outside the
    sentence are filled with 0, so every context has `2 * window_size` entries.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of context positions taken on each side.
        V: width of the one-hot vectors; every word id must be < V.

    Returns:
        Tuple `(all_in, all_out)`: `all_in` is an integer array of shape
        `(n_words, 2 * window_size)` with the padded context ids, and
        `all_out` is a float32 one-hot array of shape `(n_words, V)` with the
        target words. For an empty corpus the shapes are
        `(0, 2 * window_size)` and `(0, V)`.
    """
    contexts = []
    targets = []

    for sentence in corpus:
        n_words = len(sentence)
        for index, word in enumerate(sentence):
            # Context ids around `index`, skipping the target itself and
            # padding with 0 where the window runs past the sentence.
            window = [
                sentence[i] if 0 <= i < n_words else 0
                for i in range(index - window_size, index + window_size + 1)
                if i != index
            ]
            contexts.append(window)
            targets.append(word)

    if contexts:
        all_in = np.array(contexts, dtype=int)
    else:
        # Keep the second axis so downstream shape checks still hold
        all_in = np.empty((0, 2 * window_size), dtype=int)

    target_ids = np.array(targets, dtype=int)

    # Build the whole one-hot matrix in a single vectorised step instead of
    # allocating one small array per target.
    all_out = np.zeros((len(target_ids), V), dtype=np.float32)
    all_out[np.arange(len(target_ids)), target_ids] = 1.0

    return (all_in, all_out)

In [None]:
%%time


# Create the training data
# X_cbow holds 2 * window_size context ids per target word (0 where the window
# runs past the sentence); y_cbow is the dense one-hot encoding of each target.
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

CPU times: user 548 ms, sys: 63.6 ms, total: 612 ms
Wall time: 615 ms


((29158, 4), (29158, 5368))

In [None]:
%%time

# Create the CBOW architecture
cbow_models = []
dim = 100  # Size of each word vector
cbow = Sequential()

# Embedding layer: each input row holds 2 * window_size context word ids
cbow.add(Embedding(input_dim=V,
                    output_dim=dim,
                    input_length=window_size*2,
                    embeddings_initializer='glorot_uniform'))

# Average the context embeddings into one `dim`-sized vector. `tf.reduce_mean`
# is used so the layer does not depend on the standalone `keras.backend`
# module while the layers themselves come from `tensorflow.keras`.
# Positions padded with id 0 are included in the mean (no masking is applied).
cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(dim, )))

# Score every vocabulary entry as the possible target word
cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

# Targets are one-hot vectors, hence categorical cross-entropy
cbow.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

cbow.summary()
print("")

# Later cells iterate over this list when training and exporting
cbow_models.append(cbow)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            536800    
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 5368)              542168    
                                                                 
Total params: 1078968 (4.12 MB)
Trainable params: 1078968 (4.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

CPU times: user 257 ms, sys: 91.6 ms, total: 349 ms
Wall time: 364 ms


In [None]:
%%time

# Train every model in `cbow_models` on the (context window, one-hot target) pairs.
# The loop variable rebinds the global name `cbow`.
for cbow in cbow_models:
    cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
    print("")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

CPU times: user 2min 10s, sys: 12.6 s, total: 2min 23s
Wall time: 3min 23s


In [None]:

for cbow in cbow_models:
    weights = cbow.get_weights()

    # The first weight array belongs to the Embedding layer: one row per word id
    embedding = weights[0]
    n_dims = embedding.shape[1]

    # Header row: "word" followed by one column name per vector component,
    # which makes the file easy to read back as a dataframe
    columns = ["word"] + [f"value_{j + 1}" for j in range(n_dims)]

    # `with` closes the file even if a write fails; utf-8 matches the encoding
    # used when the file is read back for the nearest-word lookup.
    with open(f'vectors_cbow_{n_dims}.txt', 'w', encoding='utf-8') as f:
        f.write(" ".join(columns))
        f.write("\n")

        # One line per vocabulary word: the word, then its vector components
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, embedding[i, :])))
            f.write("\n")

In [None]:
# Find the k nearest words (by cosine similarity) to a target word, using the
# CBOW vectors exported to disk by the earlier cell.
# `np` and `cosine_similarity` come from the import cell at the top of the notebook.
target_word = "king"
k = 10  # Number of nearest words you want to find

# Load the exported vectors into a {word: vector} dictionary. The file is read
# from the same relative location the export cell writes to.
word_vectors = {}
with open("vectors_cbow_100.txt", "r", encoding="utf-8") as file:
    next(file, None)  # Skip the header row with the column names
    for line in file:
        parts = line.strip().split()
        if len(parts) < 2:
            continue  # Ignore blank or malformed lines
        word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])

# Raises KeyError if the target word is not in the vocabulary
target_vector = word_vectors[target_word]

# Score every other word with a single batched call instead of one
# cosine_similarity call per vocabulary entry.
other_words = [word for word in word_vectors if word != target_word]
if other_words:
    other_matrix = np.array([word_vectors[word] for word in other_words])
    scores = cosine_similarity([target_vector], other_matrix)[0]
    similarities = dict(zip(other_words, scores))
else:
    similarities = {}

# Sort the words by their cosine similarity scores in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top-k words as the k-nearest words
nearest_words = [word for word, _ in sorted_similarities[:k]]

# Print the k-nearest words
print(f"The {k} nearest words to '{target_word}' are: ")
for neighbour in nearest_words:
    print(neighbour)

# Keep the CBOW vectors under their own name for the comparison cells below
cbow_word_emd = word_vectors

The 10 nearest words to 'king' are: 
thin
restructuing
cream
mallorcan
becomes
violence
became
dance
bundled
baskin


To get the word embedding:

In [None]:
len(skipgram_word_emd),len(cbow_word_emd)

(5367, 5367)

In [None]:
skipgram_word_emd['king']

array([ 0.3329974 , -0.34593946,  0.21893133, -0.00926998,  0.5448755 ,
       -0.25804842, -0.31043002,  0.5402896 ,  0.18649508,  0.19944073,
       -0.125581  , -0.20627189, -0.37291726, -0.06565852, -0.20601559,
       -0.36603346,  0.21348958, -0.00171248, -0.09430709, -0.09572998,
        0.32611454,  0.24519673, -0.16359042,  0.12024733, -0.26432282,
       -0.34060258, -0.07175371,  0.69065845, -0.32866818, -0.04020369,
        0.32348067, -0.74996585,  0.23313628, -0.4581679 ,  0.07766113,
        0.05767743, -0.04669981, -0.29721177,  0.6522683 , -0.04243769,
       -0.5192642 ,  0.36859947,  0.21754432, -0.03036747,  0.41784155,
       -0.17789076,  0.53237957, -0.66123646,  0.18967815,  0.33967313,
       -0.18866128,  0.15688777, -0.16077752, -0.44108018, -0.2192727 ,
        0.22307149,  0.33844784, -0.11983999, -0.03577344, -0.16427666,
        0.5541761 , -0.2896473 , -0.49448028,  0.21642114, -0.40521762,
       -0.71553487, -0.5385699 , -0.29812378, -0.15177771, -0.59

In [None]:
cbow_word_emd['king']

array([ 0.40849736,  0.83842176,  0.76997   , -0.22403689,  1.1738381 ,
       -1.195571  , -0.5294099 , -0.2563375 ,  1.004601  , -1.2284474 ,
       -0.14232187,  0.10904235,  0.5015107 ,  0.48668098, -0.18598354,
        0.9957963 ,  0.04341465, -1.0003383 , -0.52898973,  0.16919012,
        1.0728117 ,  0.04841719, -1.6397748 ,  0.57037044,  0.4790564 ,
        0.20876846, -0.05828866, -0.49795058, -0.32566577,  0.6376983 ,
       -0.23290294,  0.05514616,  1.9663172 ,  0.3662006 ,  0.4217175 ,
       -1.1287767 ,  1.1819786 ,  0.06207106,  0.4526754 , -0.12408181,
       -0.80580086,  0.460728  , -0.72695285, -1.025451  ,  1.1780716 ,
        1.0432673 ,  0.10634957, -0.766422  ,  0.39446223, -0.3177677 ,
        0.6336937 , -0.51412696, -0.26367947, -0.17996639,  0.46548152,
        0.75701135, -0.02343865, -0.45402536,  0.34043965,  0.8864516 ,
       -0.4046189 ,  0.58849335, -1.1570829 ,  0.6935536 ,  0.23517364,
       -0.5019113 ,  0.6715626 ,  0.22422247,  0.61462957,  1.29

In [None]:
cosine_similarity([skipgram_word_emd['king']], [skipgram_word_emd['queen']])

array([[0.35694565]])

In [None]:
cosine_similarity([cbow_word_emd['king']], [cbow_word_emd['queen']])

array([[0.32032359]])