# NLP:Assignment 1 Engine Search
Name: Sitthiwat Damrongpreechar <br>
Student ID: st123994

In [1]:
# Check Python version
! python -V

Python 3.10.8


In [2]:
# import libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import math

In [3]:
np.__version__,torch.__version__

('1.24.4', '2.1.0+cu118')

## 1. Load Data

In [4]:
# import nltk and download reuters corpus
import nltk
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\prasi\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prasi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# check the corpus
from nltk.corpus import reuters
#reuters.words()

In [6]:
# check the number of words in the corpus
len(reuters.words())

1720901

### 1.1 Tokenization

In [7]:
# corpus_full =[reuters.words(i).lower() for i in reuters.fileids()] 
corpus_full = reuters.sents()

In [8]:
# limit the number of sentences to 4500
corpus = corpus_full[:4500]
corpus = [[i.lower() for i in sent] for sent in corpus]

In [9]:
print(len(corpus))

4500


### 1.2 Numeralization

In [10]:
# Find the unique words
flatten = lambda l : [item for sublist in l for item in sublist]
# Assign unique integer to each word
vocabs = list(set(flatten(corpus)))
len(vocabs)

9250

In [11]:
# Create handy mapping between integer and word
word2index= {v:idx for idx,v in enumerate(vocabs)}
len(word2index)

9250

In [12]:
# adding the case of unknown word
vocabs.append("<UNK>")
word2index['<UNK>'] = len(word2index)-1
len(word2index)

9251

In [13]:
index2word = {v:k for k,v in word2index.items()}
len(index2word)

9250

## 2. Prepare train data

### 2.1 Word2Vec and Word2Vec(negative)

In [14]:
from collections import Counter

In [15]:
# prepare the data for training
def prepare_sequence(seq,word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [16]:
# create pairs of center word, and outside word
def random_batch(batch_size,corpus, window_size=2):
    skipgram=[]
    # loop each corpus (1 corpus)
    # loop each document 
    for doc in corpus:
        # look from the 3nd word until third last word
        for i in range(window_size, len(doc) - window_size):
            # center word
            center = word2index[doc[i]]
            # outside word = 4 words
            outside = (word2index[doc[i-2]],word2index[doc[i-1]],word2index[doc[i+1]],word2index[doc[i+2]]) 
            # for each of these two outside words, we gonna append to alist
            for each_out in outside:
                # center, outside1; center outside2; center outside3; center outside4
                skipgram.append([center,each_out])

    random_index = np.random.choice(range(len(skipgram)),batch_size, replace=False)
    input, label = [],[]
    for index in random_index:
        input.append([skipgram[index][0]])
        label.append([skipgram[index][1]])

    return np.array(input),np.array(label)    

In [17]:
# create the unigram table for negative sampling
def unigramtable(corpus):
    z=0.001
    vocabs = list(set(flatten(corpus)))
    # count the frequency of each word
    word_count = Counter(flatten(corpus))
    # get the total of words
    num_total_words = sum([c for w,c in word_count.items()])
    unigram_table = []
    for word in vocabs:
        uw = word_count[word] / num_total_words
        uw_alpha = (uw **0.75) / z
        unigram_table.extend([word] * int(uw_alpha))

    return unigram_table
    

### 2.2 Glove

In [18]:
# create the ramdom batch for glove
def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    random_inputs, random_labels, random_coocs, random_weightings = [],[],[],[]
    
    # conver our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]],word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    # randomply choose indexes based on batch size 
    random_index = np.random.choice(range(len(skip_grams_id)),batch_size,replace=False)
    
    # get the random in input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])

        # coos
        pair = skip_grams[index] # e.g., ('banana', 'fruit')
        try: # if the co-occurences is available
            cooc = X_ik[pair] # e.g., 3
        except: # if not available, set to 1
            cooc = 1
        random_coocs.append([math.log(cooc)])

        # weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])

    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

#### 2.2.1 Build Co-occurance Matrix X

In [20]:
# create the skipgram for glove
from collections import Counter
window_size=2
X_i = Counter (flatten(corpus))
skipgrams_glove = []

for doc in corpus:
    for i in range(2,len(doc)- window_size):
        center = doc[i]
        outside = [doc[i-2],doc[i-1],doc[i+1],doc[i+2]]
        for each_out in outside:
            skipgrams_glove.append((center,each_out))

X_ik_skipgram_glove = Counter(skipgrams_glove)

#### 2.2.2 Weighting Function

In [21]:
# weighting function
def weighting(w_i,w_j, X_ik):
    # check whether the co-occurences between w_i and w_j is available.
    try:
        x_ij = X_ik[(w_i,w_j)]
    # if not exist, then set to 1 ("laplace smoothing")
    except:
        x_ij = 1
    # set xmax = 100
    x_max = 100
    # set alpha = 0.75
    alpha = 0.75
    # if co-occurences does not exceed xmax, then just multiply with some alpha.
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha 
    # otherwise, set to 1.
    else:
        result = 1
    return result #weighting_function

In [22]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #keeping the weighting function result

for bigram in combinations_with_replacement(vocabs,2):
    # if the pair exist in our copus
    if X_ik_skipgram_glove.get(bigram):
        co = X_ik_skipgram_glove[bigram]
        X_ik[bigram] = co + 1 # for stability
        X_ik[(bigram[1],bigram[0])] = co + 1 # for symmetry ex. apple,banana = banana,apple
    else:
        pass

    weighting_dic[bigram] = weighting(bigram[0],bigram[1],X_ik)
    weighting_dic[(bigram[1],bigram[0])] = weighting(bigram[1],bigram[0],X_ik)

## 3. Model

#### 3.1 Skipgram Model

In [23]:
import torch.nn as nn
# Word2Vec Model
class Skipgram(nn.Module):
    def __init__(self,voc_size,emb_size):
        super(Skipgram,self).__init__()
        self.embedding_center =nn.Embedding(voc_size, emb_size)
        self.embedding_outside =nn.Embedding(voc_size, emb_size)

    def forward (self, center, outside, all_vocabs):
        center_embedding = self.embedding_center(center) # (batch_size, 1, emb_size)
        outside_embedding = self.embedding_outside(outside) # (batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_outside(all_vocabs) # (batch_size, voc_size, emb_size)
        assert center.size(0) == outside.size(0) == all_vocabs.size(0), "Batch size mismatch"
        top_term =torch.exp(outside_embedding.bmm(center_embedding.transpose(1,2)).squeeze(2))
        # (batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) 
        # => (batch_size, 1, 1).squeeze(2) = (batch_size, 1)

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1,2)).squeeze(2)
        # (batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) 
        # => (batch_size, voc_size, 1).squeeze(2) = (batch_size, voc_size)

        lower_term_sum = torch.sum(torch.exp(lower_term), 1) #(batch_size,1)

        loss = -torch.mean(torch.log(top_term/lower_term_sum)) #scalar

        return loss

Testing Skipgram model and method

In [24]:
test_x,test_y=random_batch(2,corpus)
model = Skipgram(len(vocabs),2)
print("Skipgram:",model)
test_input_tensor = torch.LongTensor(test_x)
test_label_tensor = torch.LongTensor(test_y)
assert torch.max(test_input_tensor) < len(vocabs), "Invalid index in input_tensor"
assert torch.max(test_label_tensor) < len(vocabs), "Invalid index in label_tensor"
print(f"Input: {test_input_tensor.shape}, Output: {test_label_tensor.shape}")


Skipgram: Skipgram(
  (embedding_center): Embedding(9251, 2)
  (embedding_outside): Embedding(9251, 2)
)
Input: torch.Size([2, 1]), Output: torch.Size([2, 1])


#### 3.2 Skipgram Negative

In [25]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    negative_sampling =[]
    for i in range(batch_size): # (1,k)
        targets_index = targets[i].item()
        nsample = []
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            if word2index[neg] == targets_index:
                continue
            nsample.append(neg)
        negative_sampling.append(prepare_sequence(nsample,word2index).reshape(1,-1))

    return torch.cat(negative_sampling) #batch_size, k

In [26]:
class SkipgramNeg(nn.Module):
    def __init__(self,vocab_size,emb_size):
        super(SkipgramNeg,self).__init__()
        self.embedding_center = nn.Embedding(vocab_size,emb_size)
        self.embedding_outside = nn.Embedding(vocab_size,emb_size)
        self.log_sigmoid = nn.LogSigmoid()      

    def forward(self,center,outside,negative_words):
        #center, outside : (bs,1)
        #negative_words : (bs,k)
        center_emb = self.embedding_center(center) # (bs,1,emb_size)
        outside_emb = self.embedding_outside(outside) # (bs,1,emb_size)
        negative_emb = self.embedding_outside(negative_words) # (bs,k,emb_size)

        uovc = outside_emb.bmm(center_emb.transpose(1,2)).squeeze(2) # (bs,1)
        ukvc = -negative_emb.bmm(center_emb.transpose(1,2)).squeeze(2) # (bs,k)
        ukvc_sum = torch.sum(ukvc,1).reshape(-1,1) # (bs,1)

        loss = self.log_sigmoid(uovc) + self.log_sigmoid(ukvc_sum)
        
        return -torch.mean(loss)

Testing Skipgram Negative model and method

In [27]:
batch_size = 2
x,y = random_batch(batch_size,corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
k = 5
unigram_table = unigramtable(corpus)
neg_samples = negative_sampling(y_tensor,unigram_table,k)
y_tensor[1],neg_samples[1]

(tensor([7569]), tensor([1345, 5720, 3177, 5153, 6515]))

In [28]:
# test your model 
emb_size = 2
vocab_size = len(vocabs)
model = SkipgramNeg(vocab_size,emb_size)
loss = model(x_tensor,y_tensor,neg_samples)
loss

tensor(1.3362, grad_fn=<NegBackward0>)

#### 3.3 GloVe

In [29]:
class Glove(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

        self.center_bias = nn.Embedding(voc_size, 1)
        self.outside_bias = nn.Embedding(voc_size, 1)
        

    def forward(self,center,outside,cooc,weighting):
        center_embeds = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)

        center_bias = self.center_bias(center).squeeze(1) #(batch_size, k)
        target_bias = self.outside_bias(outside).squeeze(1) #(batch_size, k)

        # inner-product
        inner_product = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) #(batch_size, 1)
        # (batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) -> (batch_size, 1)

        loss = weighting * torch.pow(inner_product + center_bias + target_bias - cooc, 2)

        return torch.sum(loss)

Testing GloVe model and method

In [30]:
batch_size = 2
x, y, cooc, weighting = random_batch_glove(batch_size, corpus, skipgrams_glove, X_ik, weighting_dic)
x,y,cooc,weighting

(array([[7647],
        [9233]]),
 array([[3259],
        [6132]]),
 array([[1.38629436],
        [2.30258509]]),
 array([[0.08944272],
        [0.17782794]]))

In [31]:
# Test our system
model = Glove(voc_size=len(vocabs), emb_size=2)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)
loss

tensor(1.5232, grad_fn=<SumBackward0>)

#### 3.4 GloVe (Gensim)

In [32]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# you have to put this file in some python/gensime directory; just run it and it will inform where to put the file....
glove_file = datapath('glove.6B.100d.txt') # search for this file in google and download it
GloveGensimmodel = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [33]:
# Gensim model already has most_similar function
GloveGensimmodel.most_similar('tokyo',topn=1)

[('osaka', 0.7667967677116394)]

## 4. Training

In [34]:
# prepare the parameters for training
batch_size = 2 
emb_size = 2 #50, 100, 200, 300
voc_size = len(vocabs)
k=5

Skipgrammodel = Skipgram(voc_size,emb_size)
SkipgramNegmodel = SkipgramNeg(vocab_size,emb_size)
Glovemodel = Glove(voc_size,emb_size)

critrion = nn.CrossEntropyLoss()

optimizer_skipgram = optim.Adam(Skipgrammodel.parameters(),lr=0.001)
optimizer_skipgramneg = optim.Adam(SkipgramNegmodel.parameters(),lr=0.001)
optimizer_glove = optim.Adam(Glovemodel.parameters(),lr=0.001)

num_epochs = 100

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# output shape must be like (batch_size, voc_size)
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs.shape

torch.Size([2, 9251])

In [36]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#### 4.1 Training Skipgram

In [38]:
starttime = time.time()

for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    loss = Skipgrammodel(input_batch, target_batch, all_vocabs)
    
    optimizer_skipgram.zero_grad()
    loss.backward()

    optimizer_skipgram.step()
    
    
    
    
    if (epoch + 1)% 10==0:
        end = time.time()
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | Cumulative time: {end-starttime :.4}s")
    
print(f"Training Skipgram Done!: {time.time()-starttime:.4}s")

Epoch: 10 | cost: 8.888941 | Cumulative time: 5.731s
Epoch: 20 | cost: 10.527663 | Cumulative time: 10.98s
Epoch: 30 | cost: 10.591673 | Cumulative time: 16.29s
Epoch: 40 | cost: 12.290310 | Cumulative time: 21.63s
Epoch: 50 | cost: 10.649212 | Cumulative time: 27.01s
Epoch: 60 | cost: 11.083242 | Cumulative time: 32.23s
Epoch: 70 | cost: 10.446486 | Cumulative time: 37.45s
Epoch: 80 | cost: 12.060201 | Cumulative time: 43.22s
Epoch: 90 | cost: 7.732903 | Cumulative time: 49.44s
Epoch: 100 | cost: 10.469149 | Cumulative time: 56.19s
Training Skipgram Done!: 56.19s


#### 4.2 Training Skipgram with Negative Sampling

In [39]:
starttime = time.time()
for epoch in range(num_epochs):
    
    # get batch
    input_batch, target_batch = random_batch(batch_size,corpus)
    input_tensor = torch.LongTensor(input_batch)
    target_tensor = torch.LongTensor(target_batch)

    # predict
    neg_samples = negative_sampling(target_tensor,unigram_table,k)
    loss = SkipgramNegmodel(input_tensor,target_tensor,neg_samples)

    # backpropagate
    optimizer_skipgramneg.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer_skipgramneg.step()
    
    if (epoch + 1)% 10==0:
        end = time.time()
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | Cumulative time: {end-starttime :.4}s")
    
print(f"Training Skipgram Negative Done!: {time.time()-starttime:.4}s")

Epoch: 10 | cost: 1.919172 | Cumulative time: 7.014s
Epoch: 20 | cost: 1.771240 | Cumulative time: 13.05s
Epoch: 30 | cost: 2.802083 | Cumulative time: 19.07s
Epoch: 40 | cost: 1.789256 | Cumulative time: 25.4s
Epoch: 50 | cost: 2.307091 | Cumulative time: 30.87s
Epoch: 60 | cost: 3.047858 | Cumulative time: 36.71s
Epoch: 70 | cost: 2.592919 | Cumulative time: 41.73s
Epoch: 80 | cost: 1.216159 | Cumulative time: 47.05s
Epoch: 90 | cost: 1.233761 | Cumulative time: 53.08s
Epoch: 100 | cost: 1.330040 | Cumulative time: 58.31s
Training Skipgram Negative Done!: 58.31s


#### 4.3 Training GloVe

In [55]:
starttime = time.time()
for epoch in range(num_epochs):

    input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(batch_size, corpus, skipgrams_glove, X_ik, weighting_dic)
    input_batch = torch.LongTensor(input_batch)          #(batch_size, 1)
    target_batch = torch.LongTensor(target_batch)        #(batch_size, 1)
    cooc_batch = torch.FloatTensor(cooc_batch)           #(batch_size, 1)
    weighting_batch = torch.FloatTensor(weighting_batch) #(batch_size, 1)

    optimizer_glove.zero_grad()
    loss = Glovemodel(input_batch, target_batch, cooc_batch, weighting_batch)

    loss.backward()
    optimizer_glove.step()
    
    if(epoch + 1)% 10==0:
        end = time.time()  
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | Cumulative time: {end-starttime :.4}s")
    
print(f"Training GloVe Done!: {time.time()-starttime:.4}s")

Epoch: 10 | cost: 47.381474 | Cumulative time: 1.522s
Epoch: 20 | cost: 12.463377 | Cumulative time: 3.022s
Epoch: 30 | cost: 0.662771 | Cumulative time: 4.591s
Epoch: 40 | cost: 3.059803 | Cumulative time: 6.57s
Epoch: 50 | cost: 18.158466 | Cumulative time: 8.425s
Epoch: 60 | cost: 1.316744 | Cumulative time: 10.34s
Epoch: 70 | cost: 0.876961 | Cumulative time: 12.37s
Epoch: 80 | cost: 0.101010 | Cumulative time: 14.28s
Epoch: 90 | cost: 3.905553 | Cumulative time: 16.22s
Epoch: 100 | cost: 0.721500 | Cumulative time: 18.06s
Training GloVe Done!: 18.06s


# 5. Comparison and Analysis

Compare Skip-gram, Skip-gram negative sampling, GloVe models on training loss, training time.
| Model | Training Loss | Training Time |
|----------|----------|----------|
| Skip-gram | 10.469149 | 56.19s |
|  Skip-gram negative sampling | 1.330040 | 58.31s |
| GloVe | 0.721500  | 18.06s |

#### 5.1 Load Dataset

Use Word analogies dataset to calucalte between syntactic and semantic accuracy, similar to the methods in the Word2Vec and GloVe paper.

In [41]:
# Download World Anology Dataset
import requests

url = "https://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt"
response = requests.get(url)

if response.status_code == 200:
    # Save the content to a file
    with open("word-test.v1.txt", "w", encoding="utf-8") as file:
        file.write(response.text)
    print("Dataset downloaded successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

Dataset downloaded successfully.


In [42]:
# Read the dataset
with open("word-test.v1.txt", "r") as file:
    dataset = file.read() 

In [43]:
# Preprocess the dataset
dataset = dataset.replace("\t", "")
dataset = dataset.split(": ")
dataset

['// Copyright 2013 Google Inc. All Rights Reserved.\n',
 'capital-common-countries\nAthens Greece Baghdad Iraq\nAthens Greece Bangkok Thailand\nAthens Greece Beijing China\nAthens Greece Berlin Germany\nAthens Greece Bern Switzerland\nAthens Greece Cairo Egypt\nAthens Greece Canberra Australia\nAthens Greece Hanoi Vietnam\nAthens Greece Havana Cuba\nAthens Greece Helsinki Finland\nAthens Greece Islamabad Pakistan\nAthens Greece Kabul Afghanistan\nAthens Greece London England\nAthens Greece Madrid Spain\nAthens Greece Moscow Russia\nAthens Greece Oslo Norway\nAthens Greece Ottawa Canada\nAthens Greece Paris France\nAthens Greece Rome Italy\nAthens Greece Stockholm Sweden\nAthens Greece Tehran Iran\nAthens Greece Tokyo Japan\nBaghdad Iraq Bangkok Thailand\nBaghdad Iraq Beijing China\nBaghdad Iraq Berlin Germany\nBaghdad Iraq Bern Switzerland\nBaghdad Iraq Cairo Egypt\nBaghdad Iraq Canberra Australia\nBaghdad Iraq Hanoi Vietnam\nBaghdad Iraq Havana Cuba\nBaghdad Iraq Helsinki Finland\nBa

In [44]:
# Extract the categories
capital_common_countries = [line.split(" ") for line in dataset[1].split("\n")[1:-1]]
past_tense = [line.split(" ") for line in dataset[12].split("\n")[1:-1]]
capital_common_countries[:5], past_tense[:5]

([['Athens', 'Greece', 'Baghdad', 'Iraq'],
  ['Athens', 'Greece', 'Bangkok', 'Thailand'],
  ['Athens', 'Greece', 'Beijing', 'China'],
  ['Athens', 'Greece', 'Berlin', 'Germany'],
  ['Athens', 'Greece', 'Bern', 'Switzerland']],
 [['dancing', 'danced', 'decreasing', 'decreased'],
  ['dancing', 'danced', 'describing', 'described'],
  ['dancing', 'danced', 'enhancing', 'enhanced'],
  ['dancing', 'danced', 'falling', 'fell'],
  ['dancing', 'danced', 'feeding', 'fed']])

#### 5.2 Syntactic and Semantic Accuracy

In [45]:
# Define the function to get embedding of all words in vocabs
def get_vocabs_embed(model, vocabs):
    embeddings = {}

    for word in vocabs:
        try:
            index = word2index[word]
        except:
            index = word2index["<UNK>"]

        word_tensor = torch.LongTensor([index])

        emb_center = model.embedding_center(word_tensor)
        emb_outside = model.embedding_outside(word_tensor)
        emb = (emb_center + emb_outside) / 2

        embeddings[word] = (emb[0][0].item(), emb[0][1].item())

    return embeddings

In [46]:
# Embedding of Skipgram, Skipgram Negative, Glove
Skipgram_embeddings = get_vocabs_embed(Skipgrammodel, vocabs)
SkipgramNeg_embeddings = get_vocabs_embed(SkipgramNegmodel, vocabs)
Glove_embeddings = get_vocabs_embed(Glovemodel, vocabs)
print(len(Skipgram_embeddings), len(SkipgramNeg_embeddings), len(Glove_embeddings))

9251 9251 9251


In [47]:
# Define the function to get the cosine similarity
def cosine_similarity(A,B):
    dot_product = np.dot(A,B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B) 
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [48]:
# Define the function to get the most similar word (Later used in Web App)
def get_most_similar_word(embeddings, word_vec, type, topn=10):
    similarities = {}

    if type == "cosine":
        for vocab, emb in embeddings.items():
            similarities[vocab] = cosine_similarity(word_vec, np.array(emb))
    elif type == "dot":
        for vocab, emb in embeddings.items():
            similarities[vocab] = np.dot(word_vec, np.array(emb))
    else:
        raise ValueError("Type must be either 'cosine' or 'dot'")

    similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    return similarities[1:topn+1]

In [49]:
# Define the function to get embedding of a word
def get_single_embed(embbeding_model,word):
    word = word.lower()
    try: 
        index = word2index[word]
    except: # for unknown word
        word = '<UNK>'

    return embbeding_model[word]

In [69]:
# Test the unknown word
word_vec_bkk = get_single_embed(Glove_embeddings,'Bangkok')
word_vec_bkk

(1.821340799331665, -0.5290414690971375)

In [70]:
# Test the unknown word
get_most_similar_word(Glove_embeddings, word_vec_bkk, "cosine", topn=10)

[('<UNK>', 0.9999999999999999),
 ('unsettled', 0.9999993256071421),
 ('nwor', 0.9999991022325584),
 ('winterhalter', 0.9999990348383608),
 ('picco', 0.9999990202466155),
 ('suggest', 0.9999984970948746),
 ('when', 0.9999983896774476),
 ('ownership', 0.9999982659472577),
 ('jobless', 0.9999979350807402),
 ('915', 0.9999971155957721)]

In [51]:
# Define the function to get the accuracy
def get_accuracy(embeddings, dataset, model="normal"):
    correct = 0
    total = 0
    
    for category in dataset:
        total += 1
        word_a, word_b, word_c, word_d = category[0].lower(), category[1].lower(), category[2].lower(), category[3].lower()
        if model == "normal":
            prediction = np.array(get_single_embed(embeddings, word_b)) - np.array(get_single_embed(embeddings, word_a)) + np.array(get_single_embed(embeddings, word_c))
            predicted_word = get_most_similar_word(embeddings, prediction, "cosine", 1)[0][0]
        elif model == "gensim":
            predicted_word=embeddings.most_similar(positive=[word_b, word_c], negative=[word_a], topn=1)[0][0]

        if predicted_word == word_d:
            correct += 1

    return correct / total
  

In [52]:
# Test the accuracy of Glove Gensim
get_accuracy(GloveGensimmodel, capital_common_countries, "gensim")

0.9387351778656127

Accuracy

In [71]:
# Test all the models accuracy
skipgram_syntatic = get_accuracy(Skipgram_embeddings, capital_common_countries)
skipgram_semantic = get_accuracy(Skipgram_embeddings, past_tense)
skipgramneg_syntatic = get_accuracy(SkipgramNeg_embeddings, capital_common_countries)
skipgramneg_semantic = get_accuracy(SkipgramNeg_embeddings, past_tense)
glove_syntatic = get_accuracy(Glove_embeddings, capital_common_countries)
glove_semantic = get_accuracy(Glove_embeddings, past_tense)
gensim_syntatic = get_accuracy(GloveGensimmodel, capital_common_countries, "gensim")
gensim_semantic = get_accuracy(GloveGensimmodel, past_tense, "gensim")

In [72]:
# Print the result
print("Skipgram Syntatic: ", skipgram_syntatic)
print("Skipgram Semantic: ", skipgram_semantic)
print("--------------------")
print("Skipgram Negative Syntatic: ", skipgramneg_syntatic)
print("Skipgram Negative Semantic: ", skipgramneg_semantic)
print("--------------------")
print("Glove Syntatic: ", glove_syntatic)
print("Glove Semantic: ", glove_semantic)
print("--------------------")
print("Gensim Syntatic: ", gensim_syntatic)
print("Gensim Semantic: ", gensim_semantic)


Skipgram Syntatic:  0.0
Skipgram Semantic:  0.000641025641025641
--------------------
Skipgram Negative Syntatic:  0.0
Skipgram Negative Semantic:  0.0
--------------------
Glove Syntatic:  0.0
Glove Semantic:  0.0
--------------------
Gensim Syntatic:  0.9387351778656127
Gensim Semantic:  0.5544871794871795


| Model               | Window Size | Training Loss | Syntactic Accuracy (%) | Semantic Accuracy (%) |
|---------------------|-------------|---------------|---------------------|-------------------|
| Skipgram            | 2           |  10.469149    |         0           |         0.000641025641025641         |
| Skipgram (NEG)      | 2           |  1.330040     |         0           |         0         |
| Glove               | 2           |   0.721500    |         0           |           0       |
| Glove (Gensim)      | -           |        -      |        93.8735             |       55.4487            |


#### 5.3 Correlation Similarity
 

In [73]:
# Open the wordsim_similarity_goldstandard.txt
import pandas as pd
with open("./wordsim_similarity_goldstandard.txt", "r") as file:
    wsg = pd.read_csv(file, sep="\t", names=["word1","word2","similarity_score"])
wsg

Unnamed: 0,word1,word2,similarity_score
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


Use the similarity dataset to find the correlation between models's dot product and the provided similarity metrics.

In [74]:
from scipy.stats import spearmanr

In [75]:
# Change all embeddings to numpy array (for more efficient computation)
Skipgram_embeddings_np = {key: np.array(value) for key, value in Skipgram_embeddings.items()}
SkipgramNeg_embeddings_np = {key: np.array(value) for key, value in SkipgramNeg_embeddings.items()}
Glove_embeddings_np = {key: np.array(value) for key, value in Glove_embeddings.items()}

In [76]:
# Check the uppercase in the wordsim_similarity_goldstandard.txt columns
print("before")
print("word1: ",any(wsg["word1"].str.isupper()))
print("word2: ",any(wsg["word2"].str.isupper()))

wsg['word1'] = wsg['word1'].str.lower()
wsg['word2'] = wsg['word2'].str.lower()
print("after")
print("word1: ",any(wsg["word1"].str.isupper()))
print("word2: ",any(wsg["word2"].str.isupper()))

before
word1:  False
word2:  True
after
word1:  False
word2:  False


Skip-Gram

In [77]:
# add the skipgram similarity score to the dataframe
wsg["Skipgram"] = wsg.apply(lambda x: np.dot(get_single_embed(Skipgram_embeddings,x['word1']),get_single_embed(SkipgramNeg_embeddings,x['word2'])), axis=1)

Skip-Gram with negative sampling

In [78]:
# add the skipgram negative similarity score to the dataframe
wsg["NEG"] = wsg.apply(lambda x: np.dot(get_single_embed(SkipgramNeg_embeddings,x['word1']),get_single_embed(SkipgramNeg_embeddings,x['word2'])), axis=1)

GloVe

In [79]:
# add the glove similarity score to the dataframe
wsg["GloVe"] = wsg.apply(lambda x: np.dot(get_single_embed(Glove_embeddings,x['word1']),get_single_embed(Glove_embeddings,x['word2'])), axis=1)

GloVe Gensim

In [80]:
# Test Glove Gensim Embedding
GloveGensimmodel['tiger']

array([-0.82157 ,  0.7952  ,  0.80816 , -0.09699 , -0.11838 ,  0.021151,
        0.62663 , -0.62621 , -0.602   , -0.76326 , -0.29244 ,  1.0718  ,
        1.0264  ,  0.51485 , -0.26717 ,  0.98914 , -0.13989 , -0.14031 ,
        0.068456,  0.64535 ,  0.25223 , -0.32911 ,  0.16927 ,  0.38475 ,
        0.41674 ,  1.097   , -0.66572 , -0.64471 , -0.12801 , -0.55854 ,
       -0.36549 ,  0.079815,  0.17361 ,  0.13237 ,  1.0031  , -0.50629 ,
       -0.84352 ,  0.8185  ,  0.41386 , -0.17179 , -0.49508 ,  0.61723 ,
        0.55838 ,  0.36077 ,  0.34123 ,  0.13034 , -0.11284 ,  0.4226  ,
       -0.67964 , -0.66582 , -1.1275  ,  0.16545 ,  0.028074,  0.59515 ,
        0.40081 , -1.5653  , -0.38159 ,  0.16342 ,  0.81675 ,  0.43633 ,
       -0.051347,  0.68191 , -0.66771 ,  0.94923 ,  0.24136 ,  0.64997 ,
        0.24963 ,  0.41191 ,  0.19212 ,  1.1599  , -0.089726, -0.55557 ,
       -0.64921 , -0.39078 , -0.65487 ,  0.031186,  0.073248,  0.45036 ,
       -0.83119 ,  1.1082  ,  0.29897 , -0.83352 , 

In [81]:
# add the glove(gensim) similarity score to the dataframe
wsg["GloVe(gensim)"] = wsg.apply(lambda x: np.dot(GloveGensimmodel[x['word1']],GloveGensimmodel[x['word2']]), axis=1)

In [82]:
# check the dataframe
wsg

Unnamed: 0,word1,word2,similarity_score,Skipgram,NEG,GloVe,GloVe(gensim)
0,tiger,cat,7.35,-0.013500,1.046953,-0.347262,15.629376
1,tiger,tiger,10.00,-0.243499,4.224055,3.597167,32.800144
2,plane,car,5.77,0.001743,0.218328,1.981475,24.047297
3,train,car,6.31,0.001743,0.218328,1.981475,25.472925
4,television,radio,6.77,0.371809,0.166541,0.189803,34.689987
...,...,...,...,...,...,...,...
198,rooster,voyage,0.62,-0.209303,3.022633,-0.921982,1.683646
199,noon,string,0.54,-0.243499,4.224055,3.597167,1.070592
200,chord,smile,0.54,-0.243499,4.224055,3.597167,6.762520
201,professor,cucumber,0.31,-1.292310,0.294015,1.441104,-0.230552


Calculate the spearman correlation

In [83]:
skipgram_correlation, p_value_s = spearmanr(wsg['similarity_score'], wsg['Skipgram'])
skipgramneg_correlation, p_value_sn = spearmanr(wsg['similarity_score'], wsg['NEG'])
glove_correlation, p_value_g = spearmanr(wsg['similarity_score'], wsg['GloVe'])
glovegensim_correlation, p_value_gs= spearmanr(wsg['similarity_score'], wsg['GloVe(gensim)'])
ytruecorrelation, p_value_yt = spearmanr(wsg['similarity_score'], wsg['similarity_score'])

In [84]:
# print the result
print("Skipgram: ", skipgram_correlation)
print("Skipgram Negative: ", skipgramneg_correlation)
print("Glove: ", glove_correlation)
print("Glove Gensim: ", glovegensim_correlation)
print("Ytrue: ", ytruecorrelation)

Skipgram:  0.16847053383501415
Skipgram Negative:  0.08786851950031294
Glove:  -0.012886622587326878
Glove Gensim:  0.5430870624672256
Ytrue:  1.0


| Model          | Skipgram |NEG|GloVe|GloVe(Gensim)|Y_true|
|----------------|----------------------|---------------|---------------|---------------|---------------|
| Spearman Correlation   | -0.06504503887196321 |0.04463185808838047 |-0.019138662738073235   |  0.5430870624672256  | 1.0  |


## 6. Inference

In [85]:
# Export all model as the pickle file
import pickle
with open('./A1_Engine_Search/code/SkipgramNeg_embeddings.pickle', 'wb') as handle:
    pickle.dump(SkipgramNeg_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./A1_Engine_Search/code/Skipgram_embeddings.pickle', 'wb') as handle:
    pickle.dump(Skipgram_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./A1_Engine_Search/code/Glove_embeddings.pickle', 'wb') as handle:
    pickle.dump(Glove_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [86]:
# Check all the pickle files
with open('./A1_Engine_Search/code/SkipgramNeg_embeddings.pickle', 'rb') as handle:
    a = pickle.load(handle)
with open('./A1_Engine_Search/code/Skipgram_embeddings.pickle', 'rb') as handle:
    b = pickle.load(handle)
with open('./A1_Engine_Search/code/Glove_embeddings.pickle', 'rb') as handle:
    c = pickle.load(handle)
print(len(a), len(b), len(c))   

9251 9251 9251
