In [1]:
# load necessary libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import json
import math
import time
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

# Task 1

## 1. Load data

In [2]:
# load categories news data from nltk
import nltk
nltk.download('brown')
from nltk.corpus import brown 
corpus = brown.sents(categories=['news'])

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\earth\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
# the corpus is alreary tokenize 
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [4]:
# numeralization
flatten = lambda l: [item for sublist in l for item in sublist]

# create to keep the all unique word in corpus on list
vocabs = list(set(flatten(corpus))) 
# vocabs

['liens',
 "Hollywood's",
 'editing',
 'relief',
 'less',
 'endeared',
 '100,000',
 'Bernardine',
 'boys',
 'leaving',
 'followed',
 'Caucusing',
 'baggage',
 "Authority's",
 'Corporation',
 'Springfield',
 'obtain',
 'fittest',
 'intangibles',
 'encounters',
 'Gift',
 'intercepted',
 'father',
 'peddler',
 'Write',
 'understandable',
 'Journal-Bulletin',
 'Meet',
 'study',
 'non-farm',
 'Rickards',
 'agreeing',
 'potato',
 'healed',
 'pushing',
 'bloodstream',
 'receptive',
 '1,257,700',
 'monks',
 'formerly',
 'Casals',
 'recommendation',
 'Mom',
 'listening',
 'polled',
 'count',
 'stuffed',
 'domination',
 "football's",
 'bonds',
 'burglarproof',
 'correspondents',
 'campaigning',
 'lazy',
 'neatly',
 'solo',
 'racket',
 'McAlester',
 'Sheldon',
 'occupying',
 'danger',
 'speck',
 'Moller',
 'whiz',
 '3-10',
 'peddle',
 'bans',
 'welcomed',
 'central',
 'Centredale',
 "Bucs'",
 'Cable',
 "shop's",
 'crossroads',
 'switching',
 'bleacher-type',
 'Giacometti',
 '1910',
 'ghastly',
 '

In [5]:
# save the vocabs list to a JSON file for using it on deployment part
with open('app/vocabs_list', 'w') as json_file1:
    json.dump(vocabs, json_file1)

In [6]:
# create dictionary for converting word to integer 
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['meet']

2427

In [7]:
# add unknown to the vocabs list and convert to integer
word2index['<UNK>'] = len(vocabs)
vocabs.append('<UNK>')

In [8]:
# save the dictionary to a JSON file
with open('app/word2index_dict', 'w') as json_file2:
    json.dump(word2index, json_file2)

In [9]:
# create dictionary for converting integer to word
index2word = {v:k for k, v in word2index.items()}
index2word[0]

'liens'

## 2. Prepare train data

Create random_batch function with window size = 2 to generate the pairs of center word, and outside word

### Word2Vec

In [10]:
def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        # since we assign window size = 2
        # look from the third word until third last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 4 words (2 words from left and 2 words from right)
            outside = (word2index[doc[i-2]],word2index[doc[i-1]], word2index[doc[i+1]],word2index[doc[i+2]])
            #for each of these 4 outside words, we need to append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], [] #inputs = center word, labels = outside word 
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

# test the random_batch function            
x, y = random_batch(2, corpus)


In [11]:
x #shape = (batch_size,)

array([[9000],
       [9969]])

In [12]:
y  #shape = (batch_size,)

array([[11077],
       [ 9598]])

### GloVe

In [13]:
# count the number of each pair of words

from collections import Counter

skipgrams = []

for doc in corpus:
    # since we assign window size = 2
    # look from the third word until third last word
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1], doc[i+1], doc[i+2]]
        for each_out in outside:
            skipgrams.append((center, each_out))

X_ik_skipgrams = Counter(skipgrams)
# X_ik_skipgrams

In [14]:
# create weighting function to scale down of too frequent words
def weighting(w_i, w_j, X_ik):
    
    #check the pair of (w_i, w_j) is on co-occurences or not
    try:
        x_ij = X_ik[(w_i, w_j)]

    #if not exist, then set to 1 because the pair of words may be occured in the future
    except:
        x_ij = 1

    #set according from GloVe paper    
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [15]:
from itertools import combinations_with_replacement

X_ik = {} #dict for counting the number of pair of words
weighting_dic = {} #scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2): #bigram result = all possible 2 pair of words
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically is, student = student, is
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

In [16]:
def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) 
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] 
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [17]:
#test the random_batch_glove function 
batch_size = 2
x_glove, y_glove, cooc_glove, weighting_glove = random_batch_glove(batch_size, corpus, skipgrams, X_ik, weighting_dic)
print("x",x_glove)
print("y",y_glove)
print("cooc",cooc_glove)
print("weighting",weighting_glove)

x [[13844]
 [ 8735]]
y [[12735]
 [ 9598]]
cooc [[4.99721227]
 [0.69314718]]
weighting [[1.        ]
 [0.05318296]]


## 3. Model

In [18]:
voc_size   = len(vocabs)
emb_size = 2

In [19]:
#prepare all vocabs

batch_size = 2

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 14392, 14393, 14394],
        [    0,     1,     2,  ..., 14392, 14393, 14394]])

### 3.1 Word2Vec (Skipgram)


In [20]:
# create skipgram model
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.center_embedding(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.center_embedding(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.center_embedding(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)) # bmm is dot product (ignore batch size) and reduce dim to 2 
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        #calculate loss
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  
        
        return loss
        

In [21]:
# test skipgram model
model_skipgram = Skipgram(voc_size, 2)
model_skipgram

Skipgram(
  (center_embedding): Embedding(14395, 2)
  (outside_embedding): Embedding(14395, 2)
)

In [22]:
x_skipgram, y_skipgram = random_batch(batch_size, corpus)
x_tensor_skipgram = torch.LongTensor(x_skipgram)
y_tensor_skipgram = torch.LongTensor(y_skipgram)

In [23]:
loss_skipgram = model_skipgram(x_tensor_skipgram, y_tensor_skipgram, all_vocabs)
loss_skipgram

tensor(7.3371, grad_fn=<NegBackward0>)

### 3.2 Word2Vec (Negative sampling)

#### Unigram distribution
$$P(w)=U(w)^{3/4}/Z$$

In [24]:
from collections import Counter

word_count = Counter(flatten(corpus))

#count the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

100554

In [25]:
# word_count

In [26]:
# assign z to 0.001
z = 0.001

In [27]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 114,
         ',': 108,
         '.': 89,
         'of': 69,
         'and': 55,
         'to': 55,
         'a': 52,
         'in': 50,
         'for': 30,
         'The': 26,
         'that': 26,
         'was': 24,
         "''": 24,
         '``': 24,
         'is': 24,
         'on': 22,
         'at': 21,
         'with': 19,
         'be': 19,
         'as': 18,
         'by': 18,
         'he': 17,
         'said': 15,
         'his': 15,
         'will': 15,
         'it': 14,
         'from': 14,
         ';': 13,
         'are': 13,
         'an': 12,
         'has': 12,
         '--': 12,
         'had': 12,
         'not': 11,
         'who': 11,
         'this': 11,
         'have': 11,
         'Mrs.': 11,
         'were': 11,
         'their': 10,
         'would': 10,
         'which': 10,
         'been': 9,
         'He': 9,
         'they': 9,
         ')': 8,
         'its': 8,
         'out': 8,
         '(': 8,
         'one': 8,
         'up': 8,

In [28]:
import random
# sample 5 words on corpus
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [29]:
x_neg, y_neg = random_batch(batch_size, corpus)
x_tensor_neg = torch.LongTensor(x_neg)
y_tensor_neg = torch.LongTensor(y_neg)

In [30]:
k = 5
# test negative_sampling
neg_samples = negative_sampling(y_tensor_neg, unigram_table, k)

In [31]:
y_tensor_neg[1]

tensor([9143])

In [32]:
neg_samples[1]

tensor([ 6946, 14111, 11170,  5759,  4408])

In [33]:
# create skipgram negative sampling
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.center_embedding(center) #(bs, 1, emb_size)
        outside_embed  = self.outside_embedding(outside) #(bs, 1, emb_size)
        negative_embed = self.outside_embedding(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1) #sum on second dim
        
        # calculate loss
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [34]:
#test your model
model_test_neg = SkipgramNeg(voc_size, emb_size)

In [35]:
loss_neg = model_test_neg(x_tensor_neg, y_tensor_neg, neg_samples)
loss_neg

tensor(0.7951, grad_fn=<NegBackward0>)

### 3.3 GloVe

In [36]:
# create glove model
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        # calculate loss
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [37]:
# test glove model
model_glove = Glove(voc_size, emb_size)

In [38]:
# convert to tensor
x_tensor_glove = torch.LongTensor(x_glove)
y_tensor_glove = torch.LongTensor(y_glove)
cooc_tensor_glove = torch.FloatTensor(cooc_glove)
weighting_tensor_glove = torch.FloatTensor(weighting_glove)

In [39]:
# print loss
loss_glove = model_glove(x_tensor_glove, y_tensor_glove, cooc_tensor_glove, weighting_tensor_glove)
loss_glove

tensor(45.3896, grad_fn=<SumBackward0>)

## 4. Training

In [40]:
# function for calculating the training time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

### 4.1 Word2Vec (Skipgram)

In [41]:
model_skipgram      = Skipgram(voc_size, emb_size)
optimizer_skipgram  = optim.Adam(model_skipgram.parameters(), lr=0.001)

In [42]:
# Training
num_epochs = 2000
start = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch_skipgram, label_batch_skipgram = random_batch(batch_size, corpus)
    input_tensor_skipgram = torch.LongTensor(input_batch_skipgram)
    label_tensor_skipgram = torch.LongTensor(label_batch_skipgram)
     
    #predict
    loss_skipgram = model_skipgram(input_tensor_skipgram, label_tensor_skipgram, all_vocabs)
    
    #backprogate
    optimizer_skipgram.zero_grad()
    loss_skipgram.backward()
    
    #update alpha
    optimizer_skipgram.step()
    
    #print the loss
    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss_skipgram:2.6f}") #Epoch 6 front space, 0 back space

end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)

print(f"time: {epoch_mins}m {epoch_secs}s")    

Epoch    200 | Loss: 9.322708
Epoch    400 | Loss: 10.832323
Epoch    600 | Loss: 10.628942
Epoch    800 | Loss: 10.641950
Epoch   1000 | Loss: 9.970019
Epoch   1200 | Loss: 10.455753
Epoch   1400 | Loss: 9.368336
Epoch   1600 | Loss: 10.596554
Epoch   1800 | Loss: 9.585976
Epoch   2000 | Loss: 10.171299
time: 15m 56s


In [43]:
# save the skipgram model
torch.save(model_skipgram, 'skipgram_model.pth')

### 4.2 Word2Vec (Negative sampling)

In [44]:
model_neg     = SkipgramNeg(voc_size, emb_size)
optimizer_neg = optim.Adam(model_neg.parameters(), lr=0.001)

In [45]:
# Training
num_epochs = 2000
start = time.time()
for epoch in range(num_epochs):
    
    #get batch
    input_batch_neg, label_batch_neg = random_batch(batch_size, corpus)
    input_tensor_neg = torch.LongTensor(input_batch_neg)
    label_tensor_neg = torch.LongTensor(label_batch_neg)
    
    #predict
    neg_samples = negative_sampling(label_tensor_neg, unigram_table, k)
    loss_neg = model_neg(input_tensor_neg, label_tensor_neg, neg_samples)
    
    #backprogate
    optimizer_neg.zero_grad()
    loss_neg.backward()
    
    #update alpha
    optimizer_neg.step()
    
    #print the loss
    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss_neg:2.6f}")

end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)

print(f"time: {epoch_mins}m {epoch_secs}s")

Epoch    200 | Loss: 7.423603
Epoch    400 | Loss: 6.577616
Epoch    600 | Loss: 0.897706
Epoch    800 | Loss: 3.228155
Epoch   1000 | Loss: 1.562261
Epoch   1200 | Loss: 1.186448
Epoch   1400 | Loss: 0.796024
Epoch   1600 | Loss: 2.623813
Epoch   1800 | Loss: 1.790620
Epoch   2000 | Loss: 2.019679
time: 15m 52s


In [46]:
# save the skipgram negative sampling model
torch.save(model_neg, 'skipgramNEG_model.pth')

### 4.3 GloVe

In [47]:
model_glove = Glove(voc_size, emb_size)
criterion = nn.CrossEntropyLoss()
optimizer_glove = optim.Adam(model_glove.parameters(), lr=0.001)

In [48]:
# Training
num_epochs = 2000
start = time.time()
for epoch in range(num_epochs):
        
    input_batch_glove, target_batch_glove, cooc_batch_glove, weighting_batch_glove = random_batch_glove(batch_size, corpus, skipgrams, X_ik, weighting_dic)
    input_batch_glove  = torch.LongTensor(input_batch_glove)         #[batch_size, 1]
    target_batch_glove = torch.LongTensor(target_batch_glove)        #[batch_size, 1]
    cooc_batch_glove   = torch.FloatTensor(cooc_batch_glove)         #[batch_size, 1]
    weighting_batch_glove = torch.FloatTensor(weighting_batch_glove) #[batch_size, 1]
    
    optimizer_glove.zero_grad()
    loss_glove = model_glove(input_batch_glove, target_batch_glove, cooc_batch_glove, weighting_batch_glove)
    
    loss_glove.backward()
    optimizer_glove.step()

    if (epoch + 1) % 200 == 0:
        print(f"Epoch: {epoch + 1:6.0f} | Loss: {loss_glove:2.6f}")

end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)

print(f"time: {epoch_mins}m {epoch_secs}s")


Epoch:    200 | Loss: 37.209587
Epoch:    400 | Loss: 10.090467
Epoch:    600 | Loss: 41.918808
Epoch:    800 | Loss: 1.139659
Epoch:   1000 | Loss: 0.094033
Epoch:   1200 | Loss: 1.894140
Epoch:   1400 | Loss: 0.620077
Epoch:   1600 | Loss: 3.511661
Epoch:   1800 | Loss: 0.629111
Epoch:   2000 | Loss: 0.557695
time: 2m 19s


In [49]:
torch.save(model_glove, 'glove_model.pth')

# Task 2

#### Task2.1 Compare training loss and training time of Skip-gram, Skip-gram negative sampling, and GloVe models 

| Model     | training loss @ epoch 2000 | training time| 
|:----------|:---------------------------:|:-------------:|
| Skipgram  |       10.171299             |    15m 56s     |  
| Skipgram (NEG) |  2.019679              |    15m 52s    | 
| Glove     |       0.557695            |    2m 19s     |


#### Task2.2 Compare and calculatesyntactic and semantic accuracy of Skip-gram, Skip-gram negative sampling, GloVe , and GloVe(Gensim) models 

In [50]:
semantic_data = []
with open('data/capital-common-countries.txt') as file1:
    for line in file1:
        semantic_data.append(line.split())
# semantic_data[:5]

In [51]:
syntactic_data = []
with open('data/past-tense.txt') as file2:
    for line in file2:
        syntactic_data.append(line.split())
# syntactic_data[:5]

In [52]:
def analogy(model,x1, x2, y1):
    result = model.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

In [53]:
def count_analogy(model,data):
    count = 0
    for i in range(len(data)):
        try:
            word = analogy(model,data[i][0],data[i][1] , data[i][2])
            if word == data[i][3]:
                count += 1
        except:
            continue
    return count/len(data)
    

### Word2Vec (Skipgram)

In [54]:
#load the saved glove model

loaded_skipgram_model=torch.load('skipgram_model.pth')
loaded_skipgram_model.eval()  

#extract the word vectors from the loaded model
word_vectors_skipgram = loaded_skipgram_model.center_embedding.weight.data.numpy()
word_vectors_skipgram
#save the word vectors 
np.savetxt('skipgram_word_vectors.txt', word_vectors_skipgram, delimiter=' ')

#add word along with word vectors
with open('skipgram_word_vectors.txt', 'w', encoding='utf-8') as f:
    for i, word in enumerate(vocabs):
        vector_skipgram = " ".join(str(value) for value in word_vectors_skipgram[i])
        f.write(f"{word} {vector_skipgram}\n")

In [55]:
skipgram_model = KeyedVectors.load_word2vec_format('skipgram_word_vectors.txt', binary=False, no_header=True)

In [56]:
count_analogy(skipgram_model,semantic_data)

0.0

In [57]:
count_analogy(skipgram_model,syntactic_data)

0.0

### Word2Vec (Negative sampling)

In [58]:
#load the saved glove model
loaded_neg_model=torch.load('skipgram_model.pth')
loaded_neg_model.eval()  


#extract the word vectors from the loaded model
word_vectors_neg = loaded_neg_model.center_embedding.weight.data.numpy()
word_vectors_neg
#save the word vectors 
np.savetxt('skipgramNEG_word_vectors.txt', word_vectors_neg, delimiter=' ')

#add word along with word vectors
with open('skipgramNEG_word_vectors.txt', 'w', encoding='utf-8') as f:
    for i, word in enumerate(vocabs):
        vector_neg = " ".join(str(value) for value in word_vectors_neg[i])
        f.write(f"{word} {vector_neg}\n")

In [59]:
neg_model = KeyedVectors.load_word2vec_format('skipgramNEG_word_vectors.txt', binary=False, no_header=True)

In [60]:
count_analogy(neg_model,semantic_data)

0.0

In [61]:
count_analogy(neg_model,syntactic_data)

0.0

### GloVe

In [62]:
#load the saved glove model
loaded_glove_model = torch.load('glove_model.pth')
loaded_glove_model.eval()

#extract the word vectors from the loaded model
word_vectors_glove = loaded_glove_model.center_embedding.weight.data.numpy()
word_vectors_glove
#save the word vectors 
np.savetxt('glove_word_vectors.txt', word_vectors_glove, delimiter=' ')

#add word along with word vectors
with open('glove_word_vectors.txt', 'w', encoding='utf-8') as f:
    for i, word in enumerate(vocabs):
        vector = " ".join(str(value) for value in word_vectors_glove[i])
        f.write(f"{word} {vector}\n")


In [63]:
glove_model = KeyedVectors.load_word2vec_format('glove_word_vectors.txt', binary=False, no_header=True)


In [64]:
count_analogy(glove_model,semantic_data)

0.0

In [65]:
count_analogy(glove_model,syntactic_data)

0.0

### GloVe (Gensim)

In [66]:

gensim_model = KeyedVectors.load_word2vec_format('data/glove.6B.100d.txt', binary=False, no_header=True)

In [67]:
count_analogy(gensim_model,semantic_data)

0.0

In [68]:
count_analogy(gensim_model,syntactic_data)

0.5544871794871795

| Model     | Window Size | Syntactic Accuracy| Semantic Accuracy|
|:----------|:-----------:|:-----------------:|:----------------:|
| Skipgram  |       2          |    0.0     |  0.0
| Skipgram (NEG) |  2          |    0.0     |  0.0
| Glove     |      2           |    0.0     |  0.0
| Glove (Gensim)|   -          |    0.5545  | 0.0

#### Task2.3 Find the correlation between models dot product and human judgement

In [69]:
from scipy.stats import spearmanr

In [70]:
# load data and add to test_data list
test_data = []
with open('data/wordsim353.txt') as file4:
    for line in file4:
        test_data.append(line.split())

In [71]:
# create list for keeping human judgment score
human_judgment = []

for l in test_data:
    human_judgment.append(l[3])


In [72]:
#let's write a function to get embedding given a word
def get_embed(model,word):
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.center_embedding(id_tensor)
    u_embed = model.outside_embedding(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [73]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

### Word2Vec (Skipgram)

In [74]:
skipgram_model = torch.load('skipgram_model.pth')
skipgram_model.eval()

Skipgram(
  (center_embedding): Embedding(14395, 2)
  (outside_embedding): Embedding(14395, 2)
)

In [76]:
# calculate cosine similarity of 2 words
skipgram_similarity = []
for line in test_data:
    if line[1] in vocabs and line[2] in vocabs:
        v = cos_sim(get_embed(skipgram_model,line[1]), get_embed(skipgram_model,line[2]))
        skipgram_similarity.append(v)
    else:
        skipgram_similarity.append(0)

# skipgram_similarity

In [77]:
#print correlation value
c1 = spearmanr(human_judgment, skipgram_similarity)
c1.statistic

-0.052644236330194674

### Word2Vec (Negative sampling)

In [78]:
skipgramNeg_model = torch.load('skipgramNEG_model.pth')
skipgramNeg_model.eval()

SkipgramNeg(
  (center_embedding): Embedding(14395, 2)
  (outside_embedding): Embedding(14395, 2)
  (logsigmoid): LogSigmoid()
)

In [79]:
# calculate cosine similarity of 2 words
skipgramNEG_similarity = []
for line in test_data:
    if line[1] in vocabs and line[2] in vocabs:
        v = cos_sim(get_embed(skipgramNeg_model,line[1]), get_embed(skipgramNeg_model,line[2]))
        skipgramNEG_similarity.append(v)
    else:
        skipgramNEG_similarity.append(0)

# skipgramNEG_similarity

In [80]:
#print correlation value
c2 = spearmanr(human_judgment, skipgramNEG_similarity)
c2.statistic

0.0996954628699487

### GloVe

In [83]:
glove_model = torch.load('glove_model.pth')
glove_model.eval()

Glove(
  (center_embedding): Embedding(14395, 2)
  (outside_embedding): Embedding(14395, 2)
  (center_bias): Embedding(14395, 1)
  (outside_bias): Embedding(14395, 1)
)

In [84]:
# calculate cosine similarity of 2 words
glove_similarity = []
for line in test_data:
    if line[1] in vocabs and line[2] in vocabs:
        v = cos_sim(get_embed(glove_model,line[1]), get_embed(glove_model,line[2]))
        glove_similarity.append(v)
    else:
        glove_similarity.append(0)

# glove_similarity

In [85]:
#print correlation value
c3 = spearmanr(human_judgment, glove_similarity)
c3.statistic

-0.0016354367549573797

### GloVe (Gensim)

In [86]:
gensim_model = KeyedVectors.load_word2vec_format('data/glove.6B.100d.txt', binary=False, no_header=True)

In [87]:
# create list for keeping all vocabs on gensim
gensim_vocabs = []
with open('data/glove.6B.100d.txt',encoding="utf8") as file4:
    for line in file4:
        gensim_vocabs.append(line.split()[0])

In [88]:
gensim_similarity = []
for line in test_data:
    if line[1] in gensim_vocabs and line[2] in gensim_vocabs:
        v = 1-gensim_model.distance(line[1],line[2])
        gensim_similarity.append(v)
    else:
        gensim_similarity.append(0)

# gensim_similarity

In [89]:
# print correlation value
c4 = spearmanr(human_judgment, gensim_similarity)
c4.statistic

0.4276064647844679

# Task3 
demo for web (create function and print 10 words that relate to input's word)

In [96]:
# load saved glove model
model = torch.load('glove_model.pth')
model.eval()

Glove(
  (center_embedding): Embedding(14395, 2)
  (outside_embedding): Embedding(14395, 2)
  (center_bias): Embedding(14395, 1)
  (outside_bias): Embedding(14395, 1)
)

In [97]:
# create function to print 10 most similarity phr

def similarity10(word_input):    
    try:
        # input word need to be 1 word
        if len(word_input.split())==1:
            word_emb=get_embed(model,word_input)
            # add wikipedia data to list
            data=[]
            with open('app/harry-potter.txt') as file:
                for word in file:
                    data += word.split()
            # calculate cosine similarity of input word and wikipedia data
            # add to dict (key=word, value=cosine similarity)
            similarity_dict = {}
            for a in data:
                if a in vocabs:
                    a_emb = get_embed(model,a)
                    value = cos_sim(word_emb,a_emb)
                    similarity_dict[a] = value
                else:
                    continue
            # sort the dict
            similarity_dict_sorted = sorted(similarity_dict.items(), key=lambda x: x[1], reverse=True)
            # print the 10 most passages
            for i in range(10):
                print(f"{i+1}.{similarity_dict_sorted[i][0]} ({similarity_dict_sorted[i][1]})")
        else:
            print("the system can search with 1 word only")
                    
    except:
         print("the word is not in my corpus. Please enter the new word")
    

In [98]:
similarity10('Harry')

1.Harry (1.0)
2.positive (0.9999996483942903)
3.making (0.9984447307009964)
4.other (0.9976278348341087)
5.body (0.9970620531315026)
6.among (0.9956205087068863)
7.available (0.9888220151102337)
8.of (0.9855963510450311)
9.As (0.978467599111954)
10.26 (0.9782186552403997)
