# **Skip-gram model**

In [1]:
import re
import numpy as np

# **Preparing the data and parameters**

The very first step will be the preparation of data, here first using the function called tokenize we will tokenize our raw strings which are part of current news affairs.  

In [8]:
# Function to tokenize the data
def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

In [9]:
# data and tokens
data = 'British Prime Minister Boris Johnson late Tuesday named his Iraqi-born education secretary, Nadhim Zahawi, as finance minister after the shock resignation of Rishi Sunak.'
tokens = tokenize(data)
print(tokens)

['british', 'prime', 'minister', 'boris', 'johnson', 'late', 'tuesday', 'named', 'his', 'iraqi', 'born', 'education', 'secretary', 'nadhim', 'zahawi', 'as', 'finance', 'minister', 'after', 'the', 'shock', 'resignation', 'of', 'rishi', 'sunak']


As we can see using the re.compile() method we have successfully tokenized the given sentence. 

Now let’s obtain the pairs of tokens and respective IDs with the help enumerate method so that by latter by using IDs we can train the model easily and lastly retrieve respective words easily.  

In [10]:
# generating tokens and Ids
id_to_word = {i:x for (i, x) in enumerate(tokens)}
word_to_id = {x:i for (i, x) in enumerate(tokens)}

Below we can see two generated dictionaries. 

In [11]:
# grab the pairs of ID's and words
print(word_to_id)
print(id_to_word)

{'british': 0, 'prime': 1, 'minister': 17, 'boris': 3, 'johnson': 4, 'late': 5, 'tuesday': 6, 'named': 7, 'his': 8, 'iraqi': 9, 'born': 10, 'education': 11, 'secretary': 12, 'nadhim': 13, 'zahawi': 14, 'as': 15, 'finance': 16, 'after': 18, 'the': 19, 'shock': 20, 'resignation': 21, 'of': 22, 'rishi': 23, 'sunak': 24}
{0: 'british', 1: 'prime', 2: 'minister', 3: 'boris', 4: 'johnson', 5: 'late', 6: 'tuesday', 7: 'named', 8: 'his', 9: 'iraqi', 10: 'born', 11: 'education', 12: 'secretary', 13: 'nadhim', 14: 'zahawi', 15: 'as', 16: 'finance', 17: 'minister', 18: 'after', 19: 'the', 20: 'shock', 21: 'resignation', 22: 'of', 23: 'rishi', 24: 'sunak'}


Now we will prepare the training data, which is just a NumPy array of tokens for the whole above data where the x array represents words and the y array represent the context words for the corresponding x word lastly we are also expanding the dimension of the generated data.

In [12]:
# training data
def generate_training_data(tokens, word_to_id, window_size):
    X, Y = [], []

    for i in range(len(tokens)):
        nbr_inds = list(range(max(0, i - window_size), i)) + \
                   list(range(i + 1, min(len(tokens), i + window_size + 1)))
        for j in nbr_inds:
            X.append(word_to_id[tokens[i]])
            Y.append(word_to_id[tokens[j]])
            
    return np.array(X), np.array(Y)

Now let’s generate the data. 

In [13]:
# Expanding dim
def expand_dims(x, y):
    x = np.expand_dims(x, axis=0)
    y = np.expand_dims(y, axis=0)
    return x, y

In [14]:
# generate the x and y pair
x, y = generate_training_data(tokens, word_to_id, 3)
x, y = expand_dims(x, y)

In [15]:
x, y

(array([[ 0,  0,  0,  1,  1,  1,  1, 17, 17, 17, 17, 17,  3,  3,  3,  3,
          3,  3,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,
          6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,
          9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11,
         11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14,
         14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
         17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
         19, 19, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 22, 22,
         22, 22, 22, 23, 23, 23, 23, 24, 24, 24]]),
 array([[ 1, 17,  3,  0, 17,  3,  4,  0,  1,  3,  4,  5,  0,  1, 17,  4,
          5,  6,  1, 17,  3,  5,  6,  7, 17,  3,  4,  6,  7,  8,  3,  4,
          5,  7,  8,  9,  4,  5,  6,  8,  9, 10,  5,  6,  7,  9, 10, 11,
          6,  7,  8, 10, 11, 12,  7,  8,  9, 11, 12, 13,  8,  9, 10, 12,
         13, 14,  9, 10, 11, 13, 14, 15, 10, 11, 12, 14, 15, 16, 11, 12,

Now below we write two functions, the first is to generate the training parameters for the models and the second is just the sigmoid activation function. 

In [16]:
# Parameter initialization 
def init_parameters(vocab_size, emb_size):
    wrd_emb = np.random.randn(vocab_size, emb_size) * 0.01
    w = np.random.randn(vocab_size, emb_size) * 0.01
    
    return wrd_emb, w

In [17]:
# activation function 
def softmax(z):
    return np.divide(np.exp(z), np.sum(np.exp(z), axis=0, keepdims=True) + 0.001)

# **Forward propagation**

From the second step onward we are building the model, the model building is divided into two parts forward propagation and backward propagation. 

In forward propagation, we are using the three functions first is for forwarding pass, second is for obtaining cost function and last is to calculate the difference between predictions. 

In [18]:
# Forward propogation

def forward(inds, params):
    wrd_emb, w = params
    word_vec = wrd_emb[inds.flatten(), :].T
    z = np.dot(w, word_vec)
    out = softmax(z)
    
    cache = inds, word_vec, w, z
    
    return out, cache


def cross_entropy(y, y_hat):
    m = y.shape[1]
    cost = -(1 / m) * np.sum(np.sum(y_hat * np.log(y + 0.001), axis=0, keepdims=True), axis=1)
    return cost


def dsoftmax(y, out):
    dl_dz = out - y
    
    return dl_dz

# **Backward propogation**

The backward propagation is simple propagation where we are going to propagate the error to all states of models and will update the model state.  

In [19]:
# Backword propogation 
def backward(y, out, cache):
    inds, word_vec, w, z = cache
    wrd_emb, w = params
    
    dl_dz = dsoftmax(y, out)
    # deviding by the word_vec length to find the average
    dl_dw = (1/word_vec.shape[1]) * np.dot(dl_dz, word_vec.T)
    dl_dword_vec = np.dot(w.T, dl_dz)
    
    return dl_dz, dl_dw, dl_dword_vec

def update(params, cache, grads, lr=0.03):
    inds, word_vec, w, z = cache
    wrd_emb, w = params
    dl_dz, dl_dw, dl_dword_vec = grads
    
    wrd_emb[inds.flatten(), :] -= dl_dword_vec.T * lr
    w -= dl_dw * lr
    
    return wrd_emb, w

# **Training the model**

So far now we have defined the model and prepared the data, and now we will start training the model.  Below we will first define some parameters vocab_size, batch_size, etc. 

Now we will start training the model, we will train the model for 50k epochs. The procedure is can be simply understood as the first forward propagation will be carried out and later whatever the error obtained that we will transfer it to all states of the model using the backpropagation algorithms.     

In [20]:
# Training the model
vocab_size = len(id_to_word)

m = y.shape[1]
y_one_hot = np.zeros((vocab_size, m))
y_one_hot[y.flatten(), np.arange(m)] = 1

y = y_one_hot


batch_size=256
embed_size = 50

params = init_parameters(vocab_size, 50)

costs = []

for epoch in range(50000):
    epoch_cost = 0
    
    batch_inds = list(range(0, x.shape[1], batch_size))
    np.random.shuffle(batch_inds)
    
    for i in batch_inds:
        x_batch = x[:, i:i+batch_size]
        y_batch = y[:, i:i+batch_size]
        
        pred, cache = forward(x_batch, params)
        grads = backward(y_batch, pred, cache)
        params = update(params, cache, grads, 0.03)
        cost = cross_entropy(pred, y_batch)
        
        epoch_cost += np.squeeze(cost)
        
    costs.append(epoch_cost)
    
    if(epoch % 250 == 0):
        print("Cost after epoch {}: {}".format(epoch, epoch_cost))

Cost after epoch 0: 3.19432228549854
Cost after epoch 250: 3.1832204859407445
Cost after epoch 500: 3.086581163872388
Cost after epoch 750: 2.653024735089586
Cost after epoch 1000: 2.3337815762741427
Cost after epoch 1250: 2.194864087368703
Cost after epoch 1500: 2.1233194073146953
Cost after epoch 1750: 2.08954918637509
Cost after epoch 2000: 2.075744475214754
Cost after epoch 2250: 2.0724681561095806
Cost after epoch 2500: 2.07142851122357
Cost after epoch 2750: 2.070763756286456
Cost after epoch 3000: 2.07450135029756
Cost after epoch 3250: 2.0791779612052506
Cost after epoch 3500: 2.0811465617364187
Cost after epoch 3750: 2.0840210450589645
Cost after epoch 4000: 2.086691384309729
Cost after epoch 4250: 2.087092068990985
Cost after epoch 4500: 2.081731342508219
Cost after epoch 4750: 2.07329892542344
Cost after epoch 5000: 2.0701366687312155
Cost after epoch 5250: 2.072271086662586
Cost after epoch 5500: 2.075905639665428
Cost after epoch 5750: 2.0794101084474717
Cost after epoch 6

# **Predicting the samples**

Now we will obtain the prediction on samples, the test data is simply generated by arranging the token numbers that we are going to obtain the prediction for each token. 



In [21]:
# generating skip grams for Id's 0 to 24
x_test = np.arange(vocab_size)
x_test = np.expand_dims(x_test, axis=0)
softmax_test, _ = forward(x_test, params)
top_sorted_inds = np.argsort(softmax_test, axis=0)[-4:,:]

In [22]:
# visualizing the result 
for input_ind in range(vocab_size):
    input_word = id_to_word[input_ind]
    output_words = [id_to_word[output_ind] for output_ind in top_sorted_inds[::-1, input_ind]]
    print("{}'s skip-grams: {}".format(input_word, output_words))

british's skip-grams: ['boris', 'prime', 'minister', 'education']
prime's skip-grams: ['johnson', 'boris', 'british', 'minister']
minister's skip-grams: ['nadhim', 'his', 'education', 'minister']
boris's skip-grams: ['tuesday', 'late', 'johnson', 'minister']
johnson's skip-grams: ['named', 'tuesday', 'late', 'minister']
late's skip-grams: ['his', 'named', 'tuesday', 'minister']
tuesday's skip-grams: ['iraqi', 'his', 'named', 'late']
named's skip-grams: ['born', 'his', 'iraqi', 'late']
his's skip-grams: ['education', 'iraqi', 'named', 'born']
iraqi's skip-grams: ['secretary', 'education', 'his', 'named']
born's skip-grams: ['nadhim', 'secretary', 'education', 'named']
education's skip-grams: ['zahawi', 'iraqi', 'his', 'born']
secretary's skip-grams: ['as', 'iraqi', 'education', 'zahawi']
nadhim's skip-grams: ['born', 'finance', 'as', 'education']
zahawi's skip-grams: ['minister', 'finance', 'as', 'education']
as's skip-grams: ['after', 'finance', 'zahawi', 'secretary']
finance's skip-gr

# **Continuous bag of words**

This method helps in completing a partial incomplete sentence by predicting the words that can be fitted into the middle of the sentence based on the surrounding context of the words. The context of prediction depends on the few words before and after the predicted word.

The CBOW model can be simply implemented using the python-based library called gensim, below we are importing all the necessary modules and packages. 

In [23]:
# imports needed and logging
import gzip
import gensim 
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [24]:
import tensorflow as tf

The data that we are using for the CBOW is stored in .txt file and which is the first chapter of the famous mythological book Harry Potter.  

In [25]:
get_file = tf.keras.utils.get_file('horror.txt','https://analyticsindiamag.com/wp-content/uploads/2022/07/horror.txt')

Downloading data from https://analyticsindiamag.com/wp-content/uploads/2022/07/horror.txt


The below-defined function is used to preprocess the text data line by line, the function takes the .txt file and returns the tokenized version of each line. 

In [26]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    logging.info("reading file {0}...this may take a while".format(input_file))
    
    with open(input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess(line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list(read_input(get_file))
logging.info("Done reading data file") 

Now below simply we will import the word2vec model from the gensim library and will pass the data by definition some parameters like vocab_size, window, min_count, etc and we are going to train the model for 5 epochs. 

In [27]:
# build vocabulary and train model
model = gensim.models.Word2Vec(documents, size=50, window=50, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=5)



(292162, 394710)

After training the model we will obtain some predictions, now let’s calculate the similar words for some test words as below.  

In [28]:
# similar word for given querry
w1 = 'harry'
model.wv.most_similar(positive=w1)

[('flaming', 0.9916119575500488),
 ('lit', 0.9911414980888367),
 ('leaned', 0.9907791018486023),
 ('voices', 0.9897785782814026),
 ('crowded', 0.9896672964096069),
 ('slid', 0.9892847537994385),
 ('welcome', 0.9892795085906982),
 ('coins', 0.9883919954299927),
 ('clambered', 0.9883610606193542),
 ('sliding', 0.987615168094635)]

In [29]:
w1 = 'neighbors'
model.wv.most_similar(positive=w1)

[('stools', 0.9426444172859192),
 ('fungi', 0.9417978525161743),
 ('baggy', 0.9414395093917847),
 ('unblinkingly', 0.9409927129745483),
 ('knobbly', 0.9407731294631958),
 ('talked', 0.9407579302787781),
 ('halt', 0.9398818016052246),
 ('coats', 0.9396645426750183),
 ('swiftly', 0.9396216869354248),
 ('chair', 0.9391561150550842)]

In [30]:
w1 = 'beans'
model.wv.most_similar(positive=w1)

[('dudleyâ', 0.9992722868919373),
 ('every', 0.999259352684021),
 ('later', 0.9990869164466858),
 ('racing', 0.9989426136016846),
 ('themselves', 0.9989418983459473),
 ('however', 0.998867392539978),
 ('wild', 0.9988646507263184),
 ('leaving', 0.998769998550415),
 ('rather', 0.9987576007843018),
 ('horrible', 0.998711347579956)]

In [31]:
# similarity between two unrelated words
model.wv.similarity(w1="beans",w2="neighbors")

0.8912206