## Glove Assignment

In [104]:
import numpy as np 
import torch
import torch.nn as nn
import torch.optim as optim 
import pandas as pd 
import matplotlib as plt
import nltk

### 1. Tokenization

In [105]:
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()

In [106]:
from nltk.corpus import brown
nltk.download('brown')
corpus_sent = nltk.corpus.brown.sents(categories=['government'])

[nltk_data] Error loading brown: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [107]:
corpus = [[word.lower() for word in sent] for sent in corpus_sent] #
print(corpus[0:5])

[['the', 'office', 'of', 'business', 'economics', '(', 'obe', ')', 'of', 'the', 'u.s.', 'department', 'of', 'commerce', 'provides', 'basic', 'measures', 'of', 'the', 'national', 'economy', 'and', 'current', 'analysis', 'of', 'short-run', 'changes', 'in', 'the', 'economic', 'situation', 'and', 'business', 'outlook', '.'], ['it', 'develops', 'and', 'analyzes', 'the', 'national', 'income', ',', 'balance', 'of', 'international', 'payments', ',', 'and', 'many', 'other', 'business', 'indicators', '.'], ['such', 'measures', 'are', 'essential', 'to', 'its', 'job', 'of', 'presenting', 'business', 'and', 'government', 'with', 'the', 'facts', 'required', 'to', 'meet', 'the', 'objective', 'of', 'expanding', 'business', 'and', 'improving', 'the', 'operation', 'of', 'the', 'economy', '.'], ['contact'], ['for', 'further', 'information', 'contact', 'director', ',', 'office', 'of', 'business', 'economics', ',', 'u.s.', 'department', 'of', 'commerce', ',', 'washington', '25', ',', 'd.c.', '.']]


In [108]:
len(corpus) #total sentences in corpus

3032

In [109]:
#getting the unique words

flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab[0:5]

['appeal', 'capacitance', 'aeronautics', 'subsequently', 'heat']

## 2. Normalization

In [110]:

# assigning id to the vocabs

word2index = dict()
word2index.update({"<UNK>":  0})

for idx, v in enumerate(vocab):
        word2index.update({v:  idx + 1})

#adding <UNK> 
vocab.append('<UNK>') 

print(word2index['car'])

index2word = {v:k for k, v in word2index.items()}

print(index2word[word2index['car']])


4129
car


## 3. Model

In [111]:
#co-occurence matrix

from collections import Counter
X_i = Counter(flatten(corpus)) # X_i
print(X_i['car'])

3


In [112]:
#defining skipgram function with window size = 1

def skip_grams_generated(window_size=1):

    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(1, len(sent) - 1): # for changing the window size
            target = sent[i]
            
            context = list()
            
            for j in range(window_size):
                
                if i - (j + 1) >= 0: # Check if it outside of range from the left of list
                    context.append(sent[i - (j + 1)])
                
                if i + (j + 1) < len(sent): # Check if it outside of range from the right of list
                    context.append(sent[i + (j + 1)])

            for w in context:
                skip_grams.append((target, w)) 
    
    return skip_grams

In [113]:
X_ik_skipgram = Counter(skip_grams_generated(window_size=2))

X_ik_skipgram

Counter({('office', 'the'): 14,
         ('office', 'of'): 14,
         ('office', 'business'): 2,
         ('of', 'office'): 14,
         ('of', 'business'): 26,
         ('of', 'the'): 1900,
         ('of', 'economics'): 2,
         ('business', 'of'): 26,
         ('business', 'economics'): 2,
         ('business', 'office'): 2,
         ('business', '('): 2,
         ('economics', 'business'): 2,
         ('economics', '('): 1,
         ('economics', 'of'): 2,
         ('economics', 'obe'): 1,
         ('(', 'economics'): 1,
         ('(', 'obe'): 1,
         ('(', 'business'): 2,
         ('(', ')'): 141,
         ('obe', '('): 1,
         ('obe', ')'): 1,
         ('obe', 'economics'): 1,
         ('obe', 'of'): 1,
         (')', 'obe'): 1,
         (')', 'of'): 32,
         (')', '('): 129,
         (')', 'the'): 30,
         ('of', ')'): 33,
         ('of', 'obe'): 1,
         ('of', 'u.s.'): 10,
         ('the', 'of'): 1804,
         ('the', 'u.s.'): 9,
         ('the', ')'): 

In [114]:
def weighting(w_i, w_j, X_ik):
        
    #check whether the co-occurrences exist between these two words
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #if does not exist, set it to 1, basically smoothing technique
                
    x_max = 100 #100 # fixed in paper  #cannot exceed 100 counts
    alpha = 0.75 
    
    #if co-occurrence does not exceed 100, scale it based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha  #scale it
    else:
        result = 1  #if is greater than max, set it to 1 maximum
    
    return result

In [115]:
from itertools import combinations_with_replacement
from tqdm import tqdm

X_ik = {}  #for keeping the co-occurences
weighting_dic = {} #scaling the percentage of sampling
# Use tqdm as amanda recommend!
for bigram in tqdm(combinations_with_replacement(vocab, 2)):
    if X_ik_skipgram.get(bigram) is not None:  #matches 
        co_occer = X_ik_skipgram[bigram]  #get the count from what we already counted
        X_ik[bigram] = co_occer + 1 # + 1 for stability issue
        X_ik[(bigram[1],bigram[0])] = co_occer+1   #count also for the opposite
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

# Do not print if you have large data, otherwise your pc will froze.
#print(f"{X_ik=}")
#print(f"{weighting_dic=}")

27103203it [01:08, 396371.48it/s]


In [116]:
corpus[0:5] #train data

[['the',
  'office',
  'of',
  'business',
  'economics',
  '(',
  'obe',
  ')',
  'of',
  'the',
  'u.s.',
  'department',
  'of',
  'commerce',
  'provides',
  'basic',
  'measures',
  'of',
  'the',
  'national',
  'economy',
  'and',
  'current',
  'analysis',
  'of',
  'short-run',
  'changes',
  'in',
  'the',
  'economic',
  'situation',
  'and',
  'business',
  'outlook',
  '.'],
 ['it',
  'develops',
  'and',
  'analyzes',
  'the',
  'national',
  'income',
  ',',
  'balance',
  'of',
  'international',
  'payments',
  ',',
  'and',
  'many',
  'other',
  'business',
  'indicators',
  '.'],
 ['such',
  'measures',
  'are',
  'essential',
  'to',
  'its',
  'job',
  'of',
  'presenting',
  'business',
  'and',
  'government',
  'with',
  'the',
  'facts',
  'required',
  'to',
  'meet',
  'the',
  'objective',
  'of',
  'expanding',
  'business',
  'and',
  'improving',
  'the',
  'operation',
  'of',
  'the',
  'economy',
  '.'],
 ['contact'],
 ['for',
  'further',
  'informat

#### GloVe

In [117]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [118]:
#testing the method
batch_size = 2 # mini-batch size
skip_grams = skip_grams_generated(window_size=2)
input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

print("Input: ", input_batch)
print("Target: ", target_batch)
print("Cooc: ", cooc_batch)
print("Weighting: ", weighting_batch)


Input:  [[7146]
 [4946]]
Target:  [[ 343]
 [6757]]
Cooc:  [[0.69314718]
 [0.69314718]]
Weighting:  [[0.05318296]
 [0.05318296]]


In [119]:

class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [120]:
# training the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [121]:
# Declare parameters
voc_size = len(vocab)
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = GloVe(voc_size, embedding_size)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [122]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [123]:
import time
start_training = time.time()
# Training
num_epochs = 500
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

end_training = time.time()
start_min, end_min = epoch_time(start_training, end_training)
print(f'Total time: {start_min}m {end_min}s")')

Epoch: 100 | cost: 93.132736 | time: 0m 0s
Epoch: 200 | cost: 121.045647 | time: 0m 0s
Epoch: 300 | cost: 77.716263 | time: 0m 0s
Epoch: 400 | cost: 197.487549 | time: 0m 0s
Epoch: 500 | cost: 465.276184 | time: 0m 0s
Total time: 0m 42s")


In [124]:
import os

path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/GloVe.pth'
torch.save(model.state_dict(), path)

In [125]:
# import os
 
# # get current directory
# path = os.getcwd()
# print("Current Directory", path)

# print(os.path.abspath(os.path.join(path, os.pardir)))

Current Directory /Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe
/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP


#### Skipgram


In [126]:
def random_batch(batch_size, word_sequence, window_size=1):

# I fix a little from Chaky so we can modify the window_size
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(1, len(sent) - 1): # So we can modify the window size
            target = word2index[sent[i]]
            
            context = list()
            # ['a', 'b', 'c', 'd', 'e'] if window size = 2 and target is c
            # this is basically append 'b', 'd', 'a', 'e' into context
            
            for j in range(window_size):
                
                if i - (j + 1) >= 0: # Check if it outside of range from the left of list
                    context.append(word2index[sent[i - (j + 1)]])
                
                if i + (j + 1) < len(sent): # Check if it outside of range from the right of list
                    context.append(word2index[sent[i + (j + 1)]])
            
            #context = [word2index[sent[i - 1]], word2index[sent[i + 1]]]
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

In [127]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus, 2)

print("Input: ", input_batch)
print("Target: ", target_batch)
#we will convert them to tensor during training, so don't worry...

Input:  [[ 885]
 [1090]]
Target:  [[6630]
 [1090]]


In [128]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

In [129]:
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = Skipgram(voc_size, embedding_size)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [130]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
print(all_vocabs.shape)
all_vocabs = all_vocabs.to(device)

torch.Size([10, 7362])


In [131]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [132]:
import time

# Training
start_train_time = time.time()
num_epochs = 500 # At first I intend to use 5,000 but it's too much for my PC
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch(batch_size, corpus, window_size=2)
    input_batch  = torch.LongTensor(input_batch).to(device)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")
        start = time.time()
end_train_time = time.time()
train_time_mins, train_time_secs = epoch_time(start_train_time, end_train_time)
print(f'Total time use in skipgram with window size of 2 {train_time_mins} miniute(s) {train_time_secs} second')

Epoch: 100 | cost: 36.343662 | time: 0m 27s
Epoch: 200 | cost: 33.256165 | time: 0m 26s
Epoch: 300 | cost: 23.621540 | time: 0m 27s
Epoch: 400 | cost: 35.052223 | time: 0m 27s
Epoch: 500 | cost: 39.768555 | time: 0m 28s
Total time use in skipgram with window size of 2 2 miniute(s) 17 second


In [133]:
# Save model
path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/Skipgram.pth'
torch.save(model.state_dict(), path)

#### CBOW

In [134]:
# Random batch for cbow

def random_batch_cbow(batch_size, word_sequence, window_size=1):

    cbow = []

    for sent in corpus:
        for i in range(1, len(sent) - 1): # So we can modify the window size
            target = word2index[sent[i]]
            context = list()
            
            for j in range(window_size):
                
                if i - (j + 1) >= 0: # Check if it outside of range from the left of list
                    context.append(word2index[sent[i - (j + 1)]])
                
                if i + (j + 1) < len(sent): # Check if it outside of range from the right of list
                    context.append(word2index[sent[i + (j + 1)]])
            
            # This part is different from skipgram
            # Now we use all context as input and target as label
            for w in context:
                cbow.append([context, target])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(cbow)), batch_size, replace=False) #randomly pick without replacement
    
    for i in random_index:
        random_inputs.append(cbow[i][0])  # Context word that we want as input
        random_labels.append([cbow[i][1]])  # Target word that we want as label
    
    return np.array(random_inputs), np.array(random_labels)

In [135]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch_cbow(batch_size, corpus, 2)

print("Input: ", input_batch)
print("Target: ", target_batch)
#we will convert them to tensor during training, so don't worry...

Input:  [[1090 2300 4845 5052]
 [5341 5530 6484 3484]]
Target:  [[518]
 [732]]


In [136]:
class Cbow(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Cbow,self).__init__() # Not sure why we super(Cbow) or super(Skipgram)?
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

In [137]:
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = Cbow(voc_size, embedding_size)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [138]:
# Training
import time
num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch_cbow(batch_size, corpus, 1)
    input_batch  = torch.LongTensor(input_batch).to(device)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")
        start = time.time()

Epoch: 100 | cost: 34.118095 | time: 0m 20s
Epoch: 200 | cost: 38.658112 | time: 0m 19s
Epoch: 300 | cost: 29.663013 | time: 0m 19s
Epoch: 400 | cost: 32.212715 | time: 0m 20s
Epoch: 500 | cost: 29.172155 | time: 0m 19s


In [139]:
# Save model
path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/CBOW.pth'
torch.save(model.state_dict(), path)

#### Skipgram with negative sampling

In [140]:
from collections import Counter

word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items()])

# Check if the counting work
word_count['car'], num_total_words

(3, 70117)

In [141]:
# Create unigram table
Z = 0.001
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [142]:
Counter(unigram_table)

Counter({'appeal': 1,
         'required': 3,
         'agreed': 1,
         'includes': 1,
         'rates': 1,
         'international': 2,
         'city': 1,
         'sources': 1,
         'did': 2,
         'few': 2,
         'place': 2,
         'trails': 1,
         'first': 5,
         '1': 7,
         'nighttime': 1,
         'corporation': 1,
         'described': 1,
         'detail': 1,
         '8': 1,
         'christiana': 1,
         'continue': 1,
         'pool': 2,
         'day': 4,
         'planned': 1,
         'week': 2,
         'payments': 2,
         'period': 4,
         'private': 1,
         'supply': 1,
         'outstanding': 1,
         'question': 1,
         'current': 2,
         '1960': 5,
         'who': 6,
         'available': 4,
         'products': 2,
         'aid': 3,
         'manager': 1,
         'run': 1,
         'working': 1,
         'secretary': 4,
         'home': 2,
         'either': 1,
         'allocation': 1,
         'cooperat

In [143]:
import random

# Does the same thing as above.
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

# Pick values from the table that we create before.
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

In [144]:
# Testing
num_neg = 3
negative_sampling(target_batch, unigram_table, num_neg)

#{'grapes': 0, 'apple': 1, 'animal': 2, 'cat': 3, 'ice': 4, 'orange': 5, 'dog': 6, 'monkey': 7, 'conda': 8, 'fruit': 9, 'banana': 10}

tensor([[2360, 2632, 5264],
        [1588, 1038, 3646],
        [6830, 3346, 4873],
        [1961, 1145, 5341],
        [ 302, 2632,  656],
        [7146, 4770, 1684],
        [7235, 3688,    1],
        [5372, 4298,  213],
        [ 960, 6166, 1213],
        [ 190, 6106,  518]])

In [145]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_u(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(neg_embeds.size(0), -1) # BxK -> Bx1
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1] = [batch_size, k] ==sum==> [batch_size, 1]
        
        # This is what had been changed from the normal one.
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

In [146]:
# Initialize parameter
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = SkipgramNegSampling(voc_size, embedding_size)
num_neg        = 10 # num of negative sampling

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [147]:
import time

start_train_time = time.time()

# Training
num_epochs = 500
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus, 2)

    # Neat trick to avoid nd.array object (This is bad practice!)
    input_batch = list(input_batch)

    # Padding since we do not cut the sentence so, It will not be in the same shape sometimes.
    lenght_batch0 = len(input_batch[0])
    lenght_batch1 = len(input_batch[1])
    pad_num = np.abs(lenght_batch0 - lenght_batch1)

    # pad the zero dimension
    if lenght_batch0 < lenght_batch1:
        input_batch[0].extend(list(np.full((pad_num, ), 0))) # Padding with zero
    # pad the first dimension
    elif lenght_batch0 > lenght_batch1:
        input_batch[1].extend(list(np.full((pad_num, ), 0)))

    
    #input_batch: [batch_size, 1]
    input_batch = torch.LongTensor(input_batch)
    
    #target_batch: [batch_size, 1]
    target_batch = torch.LongTensor(target_batch)
    
    #negs_batch:   [batch_size, num_neg]
    negs_batch = negative_sampling(target_batch, unigram_table, num_neg)
    
    optimizer.zero_grad()
        
    loss = model(input_batch, target_batch, negs_batch)
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

end_train_time = time.time()
neg_train_time_mins, neg_train_time_secs = epoch_time(start_train_time, end_train_time)
print(f'Total time use in negative sampling {neg_train_time_mins} miniute(s) {neg_train_time_secs} second')

Epoch: 100 | cost: 12.248115 | time: 0m 0s
Epoch: 200 | cost: 16.505119 | time: 0m 0s
Epoch: 300 | cost: 11.288136 | time: 0m 0s
Epoch: 400 | cost: 21.915285 | time: 0m 0s
Epoch: 500 | cost: 14.339628 | time: 0m 0s
Total time use in negative sampling 2 miniute(s) 8 second


In [148]:
# Save model
path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/Neg_Skipgram.pth'
torch.save(model.state_dict(), path)

In [149]:
# Load the data
def read_data(path):
    file = open(path, 'r') # Dataset from amamda
    contents = file.read()
    contents = contents.split('\n') # Seperate chunk of text into substring
    file.close()
    return contents

path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/questions-words.txt'
text = read_data(path)
print(text[0:3])

[': capital-common-countries', 'Athens Greece Baghdad Iraq', 'Athens Greece Bangkok Thailand']


In [150]:
# Find the seperator name and index
seperator = [(idx, sent) for idx, sent in enumerate(text) if sent[0] == ':']
seperator 

[(0, ': capital-common-countries'),
 (507, ': capital-world'),
 (5032, ': currency'),
 (5899, ': city-in-state'),
 (8367, ': family'),
 (8874, ': gram1-adjective-to-adverb'),
 (9867, ': gram2-opposite'),
 (10680, ': gram3-comparative'),
 (12013, ': gram4-superlative'),
 (13136, ': gram5-present-participle'),
 (14193, ': gram6-nationality-adjective'),
 (15793, ': gram7-past-tense'),
 (17354, ': gram8-plural'),
 (18687, ': gram9-plural-verbs')]

In [151]:
# Let's use opposite and plural
opposite = text[9868:10680]
plural = text[17355:18687]

# Concatenate
test_text = opposite + plural

# Checking
print(test_text[0])
print(test_text[-1])

acceptable unacceptable aware unaware
woman women snake snakes


In [152]:
test_opposite = [sent.split(" ") for sent in opposite]
test_plural = [sent.split(" ") for sent in plural]
test_corpus = [sent.split(" ") for sent in test_text]
print(test_corpus[0:5])

[['acceptable', 'unacceptable', 'aware', 'unaware'], ['acceptable', 'unacceptable', 'certain', 'uncertain'], ['acceptable', 'unacceptable', 'clear', 'unclear'], ['acceptable', 'unacceptable', 'comfortable', 'uncomfortable'], ['acceptable', 'unacceptable', 'competitive', 'uncompetitive']]


In [153]:
# Flatten and get Unique words
flatten = lambda l: [item for sublist in l for item in sublist]
test_vocab = list(set(flatten(test_corpus)))
test_vocab[0:5]

['impossibly', 'computer', 'elephants', 'pears', 'irrational']

In [154]:
# Word2index and Index2word for test set
# Word2index and Index2word

# assign id to those vocabs
test_word2index = dict()
test_word2index.update({"<UNK>":  0})
for idx, v in enumerate(test_vocab):
        test_word2index.update({v:  idx + 1})

#add <UNK>, which is a very normal token exists in the world
test_vocab.append('<UNK>') #chaky, can it be ##UNK, or UNKKKKKK, or anything

# Testing
print(test_word2index['car'])

# index2word
test_index2word = {v:k for k, v in test_word2index.items()}

print(test_index2word[test_word2index['car']])

63
car


In [155]:
# Function to get embedding
def get_embed(word, current_model):
    try:
        index = word2index[word]
    except :
        index = word2index['<UNK>'] #unknown
    word = torch.LongTensor([index])
    
    embed =  (current_model.embedding_v(word)+current_model.embedding_u(word))/2
    return np.array(embed[0].detach().numpy())

In [156]:
# We will put it in a loop soon!
models_weight_list = ['GloVe', 'CBOW', 'Skipgram', 'Neg_Skipgram']
model_list = [GloVe(voc_size, embedding_size), 
              GloVe(voc_size, embedding_size), 
              Cbow(voc_size, embedding_size), 
              Skipgram(voc_size, embedding_size), 
              SkipgramNegSampling(voc_size, embedding_size)]

In [157]:
# Test embeded
testing_word = 'Queen'
current_model = model_list[0]

weight_path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/GloVe.pth'

current_weight = models_weight_list[0]
current_model.load_state_dict(torch.load(weight_path))
current_model.eval()

test_embed = get_embed(testing_word, current_model)
test_embed

array([ 0.7981735 ,  0.3838179 , -0.36671826, -0.48267865,  0.26179492,
       -0.3489858 , -0.08714995, -0.38581833,  1.6609585 , -0.5792571 ,
       -0.52467054,  0.2776766 ,  0.47701734, -0.3608383 , -0.3687769 ,
        0.32260102, -0.47021452,  1.5255069 , -0.43845403, -0.6355107 ,
        0.99704975, -0.5858865 , -0.8325716 ,  0.14871874, -0.58258206,
        0.9743253 ,  0.38821566,  0.7867026 , -0.538668  , -0.81857187,
        0.16740286, -0.6317246 , -0.21795206, -0.07072954, -1.1923411 ,
        1.0391904 , -0.65810436, -0.8414976 , -1.2894249 ,  0.59089243,
        0.40816256, -0.02337929, -0.8058307 ,  0.5699825 , -0.25628445,
        0.26372465,  0.7058599 ,  1.1191238 , -0.1993269 , -0.7629969 ,
       -0.3672024 , -0.76915467, -0.30962247, -0.02698094,  0.15500481,
        0.7073642 , -0.7914547 ,  0.38551423, -0.971647  ,  0.29007387,
        0.06968477, -0.81958306,  1.0259353 , -0.7165971 ,  0.33586895,
       -0.52778935, -0.36265248, -0.7546539 , -0.694041  , -0.24

In [158]:
#numpy version
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [159]:
def find_analogy(a,b,c,vocabs=vocab):
    emb_a, emb_b, emb_c = get_embed(a, current_model), get_embed(b, current_model), get_embed(c, current_model)
    vector = emb_b - emb_a + emb_c
    # vector_norm = (vector ** 2).sum() ** (1 / 2)
    # vector = vector / vector_norm
    # print(vector.shape)
    similarity = -1 
    
    for vocab in vocabs:
        if vocab not in [a,b,c]: #ignore input words itself
            current_sim = cos_sim(vector,get_embed(vocab, current_model))
            if current_sim > similarity:
                similarity = current_sim #update better one
                d = (vocab, similarity)
    return d

In [160]:
# Testing find_analogy functions
find_analogy('man', 'woman', 'adult')

('specifications', 0.37618497)

In [161]:
find_analogy('man', 'woman', 'adult')[0]

'specifications'

In [162]:
#### Semantic testing

In [163]:
# We will put it in a loop soon!
models_weight_list = ['GloVe', 'CBOW', 'Skipgram', 'Neg_Skipgram']
models_name = ['Glove', 'Cbow', 'Skipgram', 'Neg_Skipgram']
voc_size = len(vocab)
embedding_size = 100
model_list = [GloVe(voc_size, embedding_size), 
              GloVe(voc_size, embedding_size), 
              Cbow(voc_size, embedding_size), 
              Skipgram(voc_size, embedding_size), 
              SkipgramNegSampling(voc_size, embedding_size)]

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
test_list = [test_opposite, test_plural]
test_list_name = ['test_opposite', 'test_plural']

In [164]:
def check_accruacy(y, yhat):
    if y == yhat:
        return True
    else:
        return False

def test_accruacy_batch(data, current_model):
    counter = 0
    for sent in data:
        label = sent[-1]
        a, b, c = sent[:-1]
        yhat = find_analogy(a, b, c)[0] # It's return in tuple form, so we need to slice to get word
        if check_accruacy(label, yhat) == True:
            counter = counter + 1
    
    return counter
        

In [169]:

main_results = list()
main_accruacy = list()
main_results_name = list()
results = list()
accruacy = list()
results_name = list()

for models_idx in range(len(models_weight_list)):
    weight_path = '/Users/sapnathapa/Documents/AIT/Spring Sem 2023/NLP/GloVe/' + models_weight_list[models_idx] + '.pth'
    current_model = model_list[models_idx]
    current_model.load_state_dict(torch.load(weight_path))
    current_model.eval()
    print(f'Current model = {models_name[models_idx]}')
    print(f'Current weight = {models_weight_list[models_idx]}')
    
    for idx, current_test in enumerate(test_list):
        sample_list = random.choices(current_test, k=100)
        print(f'Current_test = {test_list_name[idx]}')
        accruacy = test_accruacy_batch(sample_list, current_model)
        print(accruacy)

Current model = Glove
Current weight = GloVe
Current_test = test_opposite
0
Current_test = test_plural
0


RuntimeError: Error(s) in loading state_dict for GloVe:
	Missing key(s) in state_dict: "v_bias.weight", "u_bias.weight". 