In [1]:
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import nltk
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

##  Positional encoding

In [2]:
# Make sure the NLTK 'punkt' package is available locally; the call reports
# its progress on stdout and returns True (see the output below).
# Presumably required by the nltk.tokenize calls used later in this notebook.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Words whose embeddings are compared, and the word positions associated with them
word1 = 'black'
word2 = 'brown'
pos1 = 2
pos2 = 10

# dimension of the vector for the model
d_model = 512

# Read the corpus from the 'text.txt' file; the context manager closes the
# file handle as soon as the text has been read.
with open("text.txt", "r") as sample:
    s = sample.read()

# processing escape characters: join all lines into one string
f = s.replace("\n", " ")

# sentence parsing: one list of lower-cased word tokens per sentence
data = [
    [token.lower() for token in nltk.tokenize.word_tokenize(sentence)]
    for sentence in nltk.tokenize.sent_tokenize(f)
]

# Train Word2Vec on the tokenized sentences (sg=1 selects the skip-gram variant)
model = Word2Vec(data, min_count=1, vector_size=d_model, window=5, sg=1)

# getting word embeddings with trained model of Word2Vec
a = model.wv[word1]
b = model.wv[word2]
print(a.shape, b.shape)

# cosine_similarity expects 2-D inputs, so turn each vector into a single row.
# Use d_model instead of a literal so the cell keeps working if it is changed.
aa = a.reshape(1, d_model)
ba = b.reshape(1, d_model)
print(aa.shape, ba.shape)

# cosine similarity
cosine_similarity(aa, ba)

(512,) (512,)
(1, 512) (1, 512)


array([[0.99951637]], dtype=float32)

The cosine similarity between the word embeddings of 'black' and 'brown' is 0.9995.

In [4]:
def positional_vector(pos, d_model):
    """Return the sinusoidal positional vector for a single position.

    Every even index ``i`` holds ``sin(pos / 10000**(2*i/d_model))`` and the
    odd index right after it holds the cosine of the same angle.

    Arguments:
        pos -- The position of the word in the sentence/ corpus
        d_model -- Length of the returned vector (dimension of the model)

    Returns:
        A float64 array of shape (1, d_model). If d_model is odd, the last
        (even) index only gets its sine component.
    """
    # NOTE(review): `i` already steps over the even indices, so `2*i` makes the
    # exponent twice as large as the schedule in "Attention Is All You Need"
    # (where the exponent equals the even index divided by d_model). Kept
    # unchanged so the results recorded in this notebook still hold - confirm.
    even_idx = np.arange(0, d_model, 2)
    angles = pos / 10000.0 ** ((2 * even_idx) / d_model)

    # Every slot is assigned below, so start from zeros instead of random noise.
    pv = np.zeros((1, d_model))
    # even for sine function, odd for cosine
    pv[0, 0::2] = np.sin(angles)
    pv[0, 1::2] = np.cos(angles[: d_model // 2])

    return pv

In [5]:
# Scratch check: np.random.randn(1, 512) produces a 2-D array of shape (1, 512),
# i.e. the same shape as the reshaped word embeddings aa / ba.
np.random.randn(1,512).shape

(1, 512)

In [6]:
# getting positional vector
# for the information of the position of the word
# pe1 / pe2 are copies of the word embeddings (same shape); they are not used
# in the similarity computed below.
pe1 = aa.copy()
pe2 = ba.copy()

# Compare the positional vectors of the two word positions defined with the
# words themselves (pos1, pos2) instead of repeating the literals 2 and 10.
pv1 = positional_vector(pos1, d_model)
pv2 = positional_vector(pos2, d_model)
cosine_similarity(pv1, pv2)

array([[0.86000133]])

## Hence, the positional vectors of the two words (positions 2 and 10) are 86.00% similar!


Add the small values of the positional vector to the word embeddings so that the positions of the words are taken into account!

In [7]:
# for final positional encoding
# Work on copies: positional_encoding writes its result into the array it is
# given, so passing aa / ba directly would overwrite the raw word embeddings.
paa1 = aa.copy()
pba2 = ba.copy()

In [8]:
# positional encoding combining both positional vector and word embeddings
def positional_encoding(pos, d_model, paa):
    """Combine a word embedding with the positional vector of its position.

    The first ``d_model`` values of ``paa[0]`` are scaled by ``sqrt(d_model)``
    and the sinusoidal positional vector for ``pos`` is added to them.

    Arguments:
        pos: Position of the word
        d_model: Dimension of the vector required for the model
        paa: Word embedding with one row of at least d_model values. It is
            overwritten IN PLACE with the final positional encoding, so pass
            a copy if the raw embedding is still needed.

    Returns:
        The same ``paa`` object, now holding the positional encoding.
    """
    # Sinusoidal positional vector: sine on even indices, cosine of the same
    # angle on the odd index that follows. Every slot is assigned, so no random
    # initialisation is needed; an odd d_model simply gets no trailing cosine.
    even_idx = np.arange(0, d_model, 2)
    angles = pos / 10000.0 ** ((2 * even_idx) / d_model)
    pv = np.zeros((1, d_model))
    pv[0, 0::2] = np.sin(angles)
    pv[0, 1::2] = np.cos(angles[: d_model // 2])

    # Value of the word embedding is increased by the product with sqrt of d_model,
    # then the positional information is simply added to it. The arithmetic is
    # done in float64 and cast back to paa's dtype on assignment.
    row = paa[0]
    scaled = np.asarray(row[:d_model], dtype=np.float64) * np.sqrt(d_model)
    row[:d_model] = scaled + pv[0]

    return paa

In [9]:
# positional_encoding overwrites the array it is given, so start from fresh
# copies of the word embeddings; otherwise re-running this cell would scale
# and shift the already-encoded values again.
paa1 = aa.copy()
pba2 = ba.copy()

# Similarity between the positional encodings of the two words at their positions
cosine_similarity(
    positional_encoding(pos1, d_model, paa1),
    positional_encoding(pos2, d_model, pba2),
)

array([[0.9622495]], dtype=float32)

Hence, the cosine similarity between the **positional encodings** of the words 'black' and 'brown' is 96.22%.


*Inclusion of word embeddings with positional vector*