In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from collections import Counter

## Load data

## 1. Load corpus

In [2]:
# import nltk
# # download nltk corpus
# nltk.download()

In [3]:
from nltk.corpus import brown
corpus = brown.sents()
# select only the first 1000 stories
corpus = corpus[:1000]
corpus = [[word.lower() for word in sent] for sent in corpus]

## 2. Numeralization

### find unique words

In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>
vocabs.append('<UNK>') #append unknown token to vocab

In [5]:
len(vocabs)

4273

In [6]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['dog']

1249

In [7]:
index2word = {v:k for k, v in word2index.items()}
index2word[10]

'can'

## Word2Vec

### 1. Prepare train data

In [8]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc)-window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = []
            for j in range(i-window_size, i+window_size+1):
                outside.append(word2index[doc[j]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

### 3. Negative Sampling

#### Unigram distribution

In [9]:
z = 0.001

In [10]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

22079

In [11]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

### 4. Model

In [12]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [13]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [14]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

In [15]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

### 3. Training

In [16]:
# if torch.cuda.is_available():  
#   dev = "cuda:0" 
# else:  
#   dev = "cpu"  
# device = torch.device(dev) 
device = torch.device("cpu")
device

device(type='cpu')

In [17]:
torch.manual_seed(42)
atch_size = 2
emb_size   = 2
batch_size = 2
window_size = 2
voc_size   = len(vocabs)

In [18]:
#prepare all vocabs
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size).to(device)
all_vocabs

tensor([[   0,    1,    2,  ..., 4270, 4271, 4272],
        [   0,    1,    2,  ..., 4270, 4271, 4272]])

#### Skipgram Training

In [19]:
import time

In [20]:
skipgram_model  = Skipgram(voc_size, emb_size).to(device)
optimizer  = optim.Adam(skipgram_model.parameters(), lr=0.001)
num_epochs = 10
start_time = time.time()

for epoch in range(num_epochs):
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    loss = skipgram_model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

print(f"Training time: {time.time()-start_time}")

  from .autonotebook import tqdm as notebook_tqdm


Epoch      1 | Loss: 9.164570
Epoch      2 | Loss: 10.149135
Epoch      3 | Loss: 7.733615
Epoch      4 | Loss: 8.794529
Epoch      5 | Loss: 9.012422
Epoch      6 | Loss: 8.760987
Epoch      7 | Loss: 8.126495
Epoch      8 | Loss: 9.138553
Epoch      9 | Loss: 8.062308
Epoch     10 | Loss: 11.005460
Training time: 3.1809864044189453


#### Neg Sampling Training

In [21]:
neg_model   = SkipgramNeg(voc_size, emb_size).to(device)
optimizer  = optim.Adam(neg_model.parameters(), lr=0.001)
num_epochs = 10
k = 5
start_time = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k).to(device)
    loss = neg_model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

print(f"Training time: {time.time()-start_time}")

Epoch      1 | Loss: 1.934021
Epoch      2 | Loss: 2.607116
Epoch      3 | Loss: 1.133654
Epoch      4 | Loss: 2.830342
Epoch      5 | Loss: 2.534127
Epoch      6 | Loss: 1.554465
Epoch      7 | Loss: 1.084432
Epoch      8 | Loss: 1.945701
Epoch      9 | Loss: 0.972715
Epoch     10 | Loss: 1.252883
Training time: 2.999988555908203


## GloVe from Scratch

### 1. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 2.

In [22]:
from collections import Counter

X_i = Counter(flatten(corpus))

In [23]:
skip_grams = []

for doc in corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1], doc[i+1], doc[i+2]]
        for each_out in outside:
            skip_grams.append((center, each_out))

In [24]:
X_ik_skipgrams = Counter(skip_grams)

#### Weighting function

GloVe includes a weighting function to scale down too frequent words.

In [25]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [26]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

### 2. Prepare train data

In [27]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### 3. Model

In [28]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [29]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
glove_model = Glove(voc_size, embedding_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(glove_model.parameters(), lr=0.001)

In [30]:
import time

# Training
num_epochs = 10
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = glove_model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()

    print(f"Epoch: {epoch + 1} | cost: {loss:.6f}")
    
print(f"Training time: {time.time()-start_time}")

Epoch: 1 | cost: 95.865410
Epoch: 2 | cost: 14.577272
Epoch: 3 | cost: 12.333422
Epoch: 4 | cost: 2.934718
Epoch: 5 | cost: 9.889151
Epoch: 6 | cost: 13.056210
Epoch: 7 | cost: 44.128540
Epoch: 8 | cost: 4.856674
Epoch: 9 | cost: 10.214202
Epoch: 10 | cost: 2.058057
Training time: 59.404550552368164


## GloVe (Gensim)

In [31]:
import os
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath(os.path.abspath('glove.6B.100d.txt'))  #search on the google
gensim_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

## Semantic and Syntatic

In [32]:
def compute_embeddings(model, vocabs):
    embeds = {}

    for word in vocabs:
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word_idx = torch.LongTensor([word2index[word]])
        
        embed_c = model.embedding_center(word_idx)
        embed_o = model.embedding_outside(word_idx)
        embed   = (embed_c + embed_o) / 2
        embed = embed[0][0].item(), embed[0][1].item()
        embeds[word] = np.array(embed)
    
    return embeds

In [33]:
def get_embed(embeddings, word):
    try:
        index = word2index[word]
    except:
        word = '<UNK>'
    
    return embeddings[word]

In [34]:
# find the embeddings from each of our model
skipgram_embeds = compute_embeddings(skipgram_model, vocabs)
neg_embeds = compute_embeddings(neg_model, vocabs)
glove_embeds = compute_embeddings(glove_model, vocabs)

In [35]:
embeds_dict = {
    "skipgram_embeds": skipgram_embeds,
    "neg_embeds": neg_embeds,
    "glove_embeds": glove_embeds
}

for embeds in embeds_dict.items():
    with open(f"app/embeddings/{embeds[0]}.pickle", "wb") as f:
        pickle.dump(embeds[1], f)

In [36]:
get_embed(neg_embeds, 'greece')

array([-0.28234041,  0.20191008])

In [37]:
# read the analogy dataset
with open("word-test.v1.txt", "r") as f:
    data = f.read()

data = data.replace("\t", "")
# split the dataset based on their categories
analogy = data.split(': ')

In [38]:
# select the 'capital-common-countries' section of the dataset
capital = analogy[1].split('\n')[1:-1]
capital = [x.split(" ") for x in capital]
capital[:5]

[['Athens', 'Greece', 'Baghdad', 'Iraq'],
 ['Athens', 'Greece', 'Bangkok', 'Thailand'],
 ['Athens', 'Greece', 'Beijing', 'China'],
 ['Athens', 'Greece', 'Berlin', 'Germany'],
 ['Athens', 'Greece', 'Bern', 'Switzerland']]

In [39]:
# select the 'gram7-past-tense' section of the dataset
past_tense = analogy[12].split('\n')[1:-1]
past_tense = [x.split(" ") for x in past_tense]
past_tense[:5]

[['dancing', 'danced', 'decreasing', 'decreased'],
 ['dancing', 'danced', 'describing', 'described'],
 ['dancing', 'danced', 'enhancing', 'enhanced'],
 ['dancing', 'danced', 'falling', 'fell'],
 ['dancing', 'danced', 'feeding', 'fed']]

In [40]:
capital[1]

['Athens', 'Greece', 'Bangkok', 'Thailand']

In [41]:
# Greece - Athens + Bangkok
# ground-truth == y_true == 'Thailand'
i = 1
y_pred = get_embed(neg_embeds, capital[i][1].lower()) - get_embed(neg_embeds, capital[i][0].lower()) + get_embed(neg_embeds, capital[i][2].lower())
y_pred

array([-0.28234041,  0.20191008])

In [42]:
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [43]:
# function to find the most similar word to the input vector
def get_most_similar(vector, embeddings):
    # retrieve all words in our embeddings vocabs
    try:
        words = list(embeddings.keys())
    except:
        words = list(embeddings.key_to_index.keys())
    
    similarities = {}

    # for each word in the vocabs, find the cosine similarities between word vectors in our embeddings and the input vector
    for word in words:
        similarities[word] = cosine_similarity(vector, embeddings[word])

    # return the word with the most similar vector to the input vector
    return max(similarities, key=similarities.get)

In [44]:
# function to find the most similar word to the input vector
def cosine_ranking(vector, embeddings):
    # retrieve all words in our embeddings vocabs
    try:
        words = list(embeddings.keys())
    except:
        words = list(embeddings.key_to_index.keys())
    
    similarities = {}

    # for each word in the vocabs, find the cosine similarities between word vectors in our embeddings and the input vector
    for word in words:
        similarities[word] = cosine_similarity(vector, embeddings[word])

    # return the word with the most similar vector to the input vector
    # return similarities
    return dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))

In [45]:
# function for finding semantic and syntactic accuracies
def find_accuracy(dataset, embeddings):
    matched_count = 0

    for data in dataset:
        row = [word.lower() for word in data]
        
        # find the predicted vector
        try:
            pred_y = get_embed(embeddings, row[1]) - get_embed(embeddings, row[0]) + get_embed(embeddings, row[2])
            pred_word = get_most_similar(pred_y, embeddings)
        except:
            pred_word = embeddings.most_similar(positive=[row[1], row[2]], negative=[row[0]])[0][0]

        # if the ground-truth word matched with the word where its corresponding vector are the closest to the predicted vector, increase matched_count
        if row[3] == pred_word:
            matched_count += 1

    # count of matched / count of all as accuracy
    return matched_count / len(dataset)

In [46]:
skipgram_sem = find_accuracy(capital, skipgram_embeds)
skipgram_syn = find_accuracy(past_tense, skipgram_embeds)

In [47]:
neg_sem = find_accuracy(capital, neg_embeds)
neg_syn = find_accuracy(past_tense, neg_embeds)

In [48]:
glove_sem = find_accuracy(capital, glove_embeds)
glove_syn = find_accuracy(past_tense, glove_embeds)

In [49]:
gensim_sem = find_accuracy(capital, gensim_model)
gensim_syn = find_accuracy(past_tense, gensim_model)

In [50]:
print("=== Word2Vec (Skipgram) ===")
print(f"Semantic accuracy: {skipgram_sem}")
print(f"Syntatic accuracy: {skipgram_syn}\n")

print("=== Word2Vec (Negative Sampling) ===")
print(f"Semantic accuracy: {neg_sem}")
print(f"Syntatic accuracy: {neg_syn}\n")

print("=== GloVe from Scratch ===")
print(f"Semantic accuracy: {glove_sem}")
print(f"Syntatic accuracy: {glove_syn}\n")

print("=== GloVe (Gensim) ===")
print(f"Semantic accuracy: {gensim_sem}")
print(f"Syntatic accuracy: {gensim_syn}")

=== Word2Vec (Skipgram) ===
Semantic accuracy: 0.0
Syntatic accuracy: 0.000641025641025641

=== Word2Vec (Negative Sampling) ===
Semantic accuracy: 0.0
Syntatic accuracy: 0.0

=== GloVe from Scratch ===
Semantic accuracy: 0.001976284584980237
Syntatic accuracy: 0.0

=== GloVe (Gensim) ===
Semantic accuracy: 0.9387351778656127
Syntatic accuracy: 0.5064102564102564


## Similarity Correlation

In [51]:
import pandas as pd

# load word similarity dataset as pandas dataframe
wordsim = pd.read_csv('wordsim_similarity_goldstandard.txt', sep="\t", header=None, names=['word_1', 'word_2', 'similarities'])
wordsim

Unnamed: 0,word_1,word_2,similarities
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [52]:
wordsim['SKIP_dot_product'] = wordsim.apply(lambda row: np.dot(
    get_embed(skipgram_embeds, row['word_1'].lower()), get_embed(skipgram_embeds, row['word_2'].lower())
    ), axis=1)

wordsim['NEG_dot_product'] = wordsim.apply(lambda row: np.dot(
    get_embed(neg_embeds, row['word_1'].lower()), get_embed(neg_embeds, row['word_2'].lower())
    ), axis=1)

wordsim['glove_dot_product'] = wordsim.apply(lambda row: np.dot(
    get_embed(glove_embeds, row['word_1'].lower()), get_embed(glove_embeds, row['word_1'].lower())
    ), axis=1)

wordsim['gensim_dot_product'] = wordsim.apply(lambda row: np.dot(
    gensim_model[row['word_1'].lower()], gensim_model[row['word_2'].lower()]
    ), axis=1)

wordsim

Unnamed: 0,word_1,word_2,similarities,SKIP_dot_product,NEG_dot_product,glove_dot_product,gensim_dot_product
0,tiger,cat,7.35,4.754888,0.352016,0.432315,15.629377
1,tiger,tiger,10.00,4.754888,0.352016,0.432315,32.800144
2,plane,car,5.77,-0.630410,-0.569838,0.432315,24.047297
3,train,car,6.31,-0.630410,-0.569838,0.432315,25.472925
4,television,radio,6.77,-0.130191,-0.196611,0.097703,34.689987
...,...,...,...,...,...,...,...
198,rooster,voyage,0.62,4.754888,0.352016,0.432315,1.683646
199,noon,string,0.54,-3.843663,-0.828806,0.298204,1.070593
200,chord,smile,0.54,4.754888,0.352016,0.432315,6.762520
201,professor,cucumber,0.31,2.144881,0.626350,0.348931,-0.230552


In [54]:
from scipy.stats import spearmanr

# finding spearman correlations between wordsim353 similarities and our embeddings dot products
wordsim_sim = wordsim['similarities'].to_numpy()
skipgram_sim = wordsim['SKIP_dot_product'].to_numpy()
neg_sim = wordsim['NEG_dot_product'].to_numpy()
glove_sim = wordsim['glove_dot_product'].to_numpy()
gensim_sim = wordsim['gensim_dot_product'].to_numpy()

print("=== Spearman correlations ===")
print(f"Word2Vec (Skipgram): {spearmanr(wordsim_sim, skipgram_sim).statistic}")
print(f"Word2Vec (Negative Sampling): {spearmanr(wordsim_sim, neg_sim).statistic}")
print(f"GloVe from Scratch: {spearmanr(wordsim_sim, glove_sim).statistic}")
print(f"GloVe (Gensim): {spearmanr(wordsim_sim, gensim_sim).statistic}")

=== Spearman correlations ===
Word2Vec (Skipgram): 0.0818425236423009
Word2Vec (Negative Sampling): 0.09041614439553024
GloVe from Scratch: 0.03145212607529534
GloVe (Gensim): 0.5430870624672256


<h4>Model Accuracies and Training Time Comparison</h4>

| **Model**          | **Window Size** | **Training Loss** | **Training time** | **Syntactic Accuracy** | **Semantic accuracy** |
|--------------------|:---------------:|:-----------------:|:-----------------:|:----------------------:|:---------------------:|
| **Skipgram**       |        2        |      11.0055      |       3.18 s      |          0.06%         |           0%          |
| **Skipgram (NEG)** |        2        |       1.2529      |       2.99 s      |           0%           |           0%          |
| **GloVe**          |        2        |       2.0581      |      59.40 s      |           0%           |          0.2%         |
| **GloVe (Gensim)** |        -        |         -         |         -         |         50.64%         |         93.87%        |

<h4>Correlation between Model Dot Product and Score by Human Judgement</h4>

| **Model**                | **Skipgram** | **NEG** | **GloVe** | **GloVe (gensim)** |
|--------------------------|--------------|---------|-----------|--------------------|
| **Spearman Correlation** |    0.0818    |  0.0904 |   0.0315  |       0.5431       |