__Name:__ Tonson Praphabkul  
__Student_Id:__ st123010

In [239]:
# Import necessary libraries.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd
import nltk

### ETL/Tokenize

In [240]:
# Use corpus from nltk
# Amamda recommend this!
from nltk.corpus import brown
nltk.download('brown')
corpus_sentence = nltk.corpus.brown.sents(categories=['government'])

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [241]:
corpus = [[word.lower() for word in sent] for sent in corpus_sentence] # Cool list comprehension trick!
print(corpus[0:5])

[['the', 'office', 'of', 'business', 'economics', '(', 'obe', ')', 'of', 'the', 'u.s.', 'department', 'of', 'commerce', 'provides', 'basic', 'measures', 'of', 'the', 'national', 'economy', 'and', 'current', 'analysis', 'of', 'short-run', 'changes', 'in', 'the', 'economic', 'situation', 'and', 'business', 'outlook', '.'], ['it', 'develops', 'and', 'analyzes', 'the', 'national', 'income', ',', 'balance', 'of', 'international', 'payments', ',', 'and', 'many', 'other', 'business', 'indicators', '.'], ['such', 'measures', 'are', 'essential', 'to', 'its', 'job', 'of', 'presenting', 'business', 'and', 'government', 'with', 'the', 'facts', 'required', 'to', 'meet', 'the', 'objective', 'of', 'expanding', 'business', 'and', 'improving', 'the', 'operation', 'of', 'the', 'economy', '.'], ['contact'], ['for', 'further', 'information', 'contact', 'director', ',', 'office', 'of', 'business', 'economics', ',', 'u.s.', 'department', 'of', 'commerce', ',', 'washington', '25', ',', 'd.c.', '.']]


In [242]:
# Check the sentences in corpus
len(corpus)

3032

In [243]:
# Flatten and get Unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab[0:5]

['localities', 'ward', 'honesty', 'accomplishing', 'charles']

### Numericalize

In [244]:
# Word2index and Index2word

# assign id to those vocabs
word2index = dict()
word2index.update({"<UNK>":  0})
for idx, v in enumerate(vocab):
        word2index.update({v:  idx + 1})

#add <UNK>, which is a very normal token exists in the world
vocab.append('<UNK>') #chaky, can it be ##UNK, or UNKKKKKK, or anything

# Testing
print(word2index['car'])

# index2word
index2word = {v:k for k, v in word2index.items()}

print(index2word[word2index['car']])


739
car


# Glove

In [245]:
# Build co-occurance matrix
from collections import Counter
X_i = Counter(flatten(corpus)) # X_i
print(X_i['car'])

3


In [246]:
# Use this function to create skipgram, basically fix a bit from
# generated batch function
def skip_grams_generated(window_size=1):

# I fix a little from Chaky so we can modify the window_size
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(1, len(sent) - 1): # So we can modify the window size
            target = sent[i]
            
            context = list()
            # ['a', 'b', 'c', 'd', 'e'] if window size = 2 and target is c
            # this is basically append 'b', 'd', 'a', 'e' into context
            
            for j in range(window_size):
                
                if i - (j + 1) >= 0: # Check if it outside of range from the left of list
                    context.append(sent[i - (j + 1)])
                    #context.append(word2index[sent[i - (j + 1)]])
                
                if i + (j + 1) < len(sent): # Check if it outside of range from the right of list
                    context.append(sent[i + (j + 1)])

            for w in context:
                skip_grams.append((target, w)) # Return tuple instead of list because we want to use count
                                                # function later
    
    return skip_grams

In [247]:
# Find co-occurance in skip_grams with window of 2
X_ik_skipgram = Counter(skip_grams_generated(window_size=2))

X_ik_skipgram

Counter({('office', 'the'): 14,
         ('office', 'of'): 14,
         ('office', 'business'): 2,
         ('of', 'office'): 14,
         ('of', 'business'): 26,
         ('of', 'the'): 1900,
         ('of', 'economics'): 2,
         ('business', 'of'): 26,
         ('business', 'economics'): 2,
         ('business', 'office'): 2,
         ('business', '('): 2,
         ('economics', 'business'): 2,
         ('economics', '('): 1,
         ('economics', 'of'): 2,
         ('economics', 'obe'): 1,
         ('(', 'economics'): 1,
         ('(', 'obe'): 1,
         ('(', 'business'): 2,
         ('(', ')'): 141,
         ('obe', '('): 1,
         ('obe', ')'): 1,
         ('obe', 'economics'): 1,
         ('obe', 'of'): 1,
         (')', 'obe'): 1,
         (')', 'of'): 32,
         (')', '('): 129,
         (')', 'the'): 30,
         ('of', ')'): 33,
         ('of', 'obe'): 1,
         ('of', 'u.s.'): 10,
         ('the', 'of'): 1804,
         ('the', 'u.s.'): 9,
         ('the', ')'): 

In [248]:
# Weight function

#simply a normalized function...don't worry too much
def weighting(w_i, w_j, X_ik):
        
    #check whether the co-occurrences exist between these two words
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #if does not exist, set it to 1, basically smoothing technique
                
    x_max = 100 #100 # fixed in paper  #cannot exceed 100 counts
    alpha = 0.75 # Followed Chaky way!
    
    #if co-occurrence does not exceed 100, scale it based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha  #scale it
    else:
        result = 1  #if is greater than max, set it to 1 maximum
    
    return result

In [249]:
from itertools import combinations_with_replacement
from tqdm import tqdm

X_ik = {}  #for keeping the co-occurences
weighting_dic = {} #scaling the percentage of sampling
# Use tqdm as amanda recommend!
for bigram in tqdm(combinations_with_replacement(vocab, 2)):
    if X_ik_skipgram.get(bigram) is not None:  #matches 
        co_occer = X_ik_skipgram[bigram]  #get the count from what we already counted
        X_ik[bigram] = co_occer + 1 # + 1 for stability issue
        X_ik[(bigram[1],bigram[0])] = co_occer+1   #count also for the opposite
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

# Do not print if you have large data, otherwise your pc will froze.
#print(f"{X_ik=}")
#print(f"{weighting_dic=}")

KeyboardInterrupt: 

### Prepare train data

In [None]:
corpus[0:5]

[['the',
  'office',
  'of',
  'business',
  'economics',
  '(',
  'obe',
  ')',
  'of',
  'the',
  'u.s.',
  'department',
  'of',
  'commerce',
  'provides',
  'basic',
  'measures',
  'of',
  'the',
  'national',
  'economy',
  'and',
  'current',
  'analysis',
  'of',
  'short-run',
  'changes',
  'in',
  'the',
  'economic',
  'situation',
  'and',
  'business',
  'outlook',
  '.'],
 ['it',
  'develops',
  'and',
  'analyzes',
  'the',
  'national',
  'income',
  ',',
  'balance',
  'of',
  'international',
  'payments',
  ',',
  'and',
  'many',
  'other',
  'business',
  'indicators',
  '.'],
 ['such',
  'measures',
  'are',
  'essential',
  'to',
  'its',
  'job',
  'of',
  'presenting',
  'business',
  'and',
  'government',
  'with',
  'the',
  'facts',
  'required',
  'to',
  'meet',
  'the',
  'objective',
  'of',
  'expanding',
  'business',
  'and',
  'improving',
  'the',
  'operation',
  'of',
  'the',
  'economy',
  '.'],
 ['contact'],
 ['for',
  'further',
  'informat

In [None]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [None]:
#testing the method
batch_size = 2 # mini-batch size
skip_grams = skip_grams_generated(window_size=2)
input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

print("Input: ", input_batch)
print("Target: ", target_batch)
print("Cooc: ", cooc_batch)
print("Weighting: ", weighting_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[7220]
 [6415]]
Target:  [[3379]
 [2377]]
Cooc:  [[0.69314718]
 [1.60943791]]
Weighting:  [[0.05318296]
 [0.10573713]]


In [None]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

### Glove Training

In [None]:
# We are nvidia fanboys, so CUDA!!!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Declare parameters
voc_size = len(vocab)
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = GloVe(voc_size, embedding_size)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
import time
start_training = time.time()
# Training
num_epochs = 500
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

end_training = time.time()
start_min, end_min = epoch_time(start_training, end_training)
print(f'Total time: {start_min}m {end_min}s")')


Epoch: 100 | cost: 245.649582 | time: 0m 0s
Epoch: 200 | cost: 332.947571 | time: 0m 0s
Epoch: 300 | cost: 85.693680 | time: 0m 0s
Epoch: 400 | cost: 132.173416 | time: 0m 0s
Epoch: 500 | cost: 77.435997 | time: 0m 0s
Total time: 3m 57s")


In [None]:
# Save model
path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/Glove_500.pth'
torch.save(model.state_dict(), path)

### Skipgrams

In [None]:
def random_batch(batch_size, word_sequence, window_size=1):

# I fix a little from Chaky so we can modify the window_size
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(1, len(sent) - 1): # So we can modify the window size
            target = word2index[sent[i]]
            
            context = list()
            # ['a', 'b', 'c', 'd', 'e'] if window size = 2 and target is c
            # this is basically append 'b', 'd', 'a', 'e' into context
            
            for j in range(window_size):
                
                if i - (j + 1) >= 0: # Check if it outside of range from the left of list
                    context.append(word2index[sent[i - (j + 1)]])
                
                if i + (j + 1) < len(sent): # Check if it outside of range from the right of list
                    context.append(word2index[sent[i + (j + 1)]])
            
            #context = [word2index[sent[i - 1]], word2index[sent[i + 1]]]
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

In [None]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus, 2)

print("Input: ", input_batch)
print("Target: ", target_batch)
#we will convert them to tensor during training, so don't worry...

Input:  [[4243]
 [6386]]
Target:  [[4143]
 [6137]]


In [None]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

In [None]:
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = Skipgram(voc_size, embedding_size)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
print(all_vocabs.shape)
all_vocabs = all_vocabs.to(device)

torch.Size([10, 7362])


In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
import time

# Training
start_train_time = time.time()
num_epochs = 500 # At first I intend to use 5,000 but it's too much for my PC
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch(batch_size, corpus, window_size=2)
    input_batch  = torch.LongTensor(input_batch).to(device)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")
        start = time.time()
end_train_time = time.time()
train_time_mins, train_time_secs = epoch_time(start_train_time, end_train_time)
print(f'Total time use in skipgram with window size of 2 {train_time_mins} miniute(s) {train_time_secs} second')

Epoch: 100 | cost: 28.747135 | time: 0m 52s
Epoch: 200 | cost: 36.921993 | time: 0m 43s
Epoch: 300 | cost: 39.399998 | time: 0m 44s
Epoch: 400 | cost: 37.276257 | time: 0m 43s
Epoch: 500 | cost: 33.657913 | time: 0m 45s
Total time use in skipgram with window size of 2 3 miniute(s) 49 second


In [None]:
# Save model
path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/Skipgrams_500.pth'
torch.save(model.state_dict(), path)

### CBOW

In [None]:
# Random batch for cbow

def random_batch_cbow(batch_size, word_sequence, window_size=1):

    cbow = []

    for sent in corpus:
        for i in range(1, len(sent) - 1): # So we can modify the window size
            target = word2index[sent[i]]
            context = list()
            
            for j in range(window_size):
                
                if i - (j + 1) >= 0: # Check if it outside of range from the left of list
                    context.append(word2index[sent[i - (j + 1)]])
                
                if i + (j + 1) < len(sent): # Check if it outside of range from the right of list
                    context.append(word2index[sent[i + (j + 1)]])
            
            # This part is different from skipgram
            # Now we use all context as input and target as label
            for w in context:
                cbow.append([context, target])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(cbow)), batch_size, replace=False) #randomly pick without replacement
    
    for i in random_index:
        random_inputs.append(cbow[i][0])  # Context word that we want as input
        random_labels.append([cbow[i][1]])  # Target word that we want as label
    
    return np.array(random_inputs), np.array(random_labels)

In [None]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch_cbow(batch_size, corpus, 2)

print("Input: ", input_batch)
print("Target: ", target_batch)
#we will convert them to tensor during training, so don't worry...

Input:  [[3228 5876 2003 2377]
 [6398 6415 7069 5321]]
Target:  [[4700]
 [7106]]


In [None]:
class Cbow(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Cbow,self).__init__() # Not sure why we super(Cbow) or super(Skipgram)?
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

In [None]:
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = Cbow(voc_size, embedding_size)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training
import time
num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch_cbow(batch_size, corpus, 1)
    input_batch  = torch.LongTensor(input_batch).to(device)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")
        start = time.time()


Epoch: 100 | cost: 38.277550 | time: 0m 35s
Epoch: 200 | cost: 34.378628 | time: 0m 35s
Epoch: 300 | cost: 30.358282 | time: 0m 39s
Epoch: 400 | cost: 28.354191 | time: 0m 37s
Epoch: 500 | cost: 30.310083 | time: 0m 41s


In [None]:
# Save model
path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/CBow_500.pth'
torch.save(model.state_dict(), path)

### Skipgram with negative sampling

In [None]:
from collections import Counter

word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items()])

# Check if the counting work
word_count['car'], num_total_words

(3, 70117)

In [None]:
# Create unigram table
Z = 0.001
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [None]:
Counter(unigram_table)

Counter({'total': 3,
         'records': 1,
         'served': 1,
         'example': 1,
         'necessary': 4,
         'considerable': 1,
         'naval': 1,
         'forth': 1,
         'week': 2,
         'shipments': 1,
         'continuing': 1,
         'proposed': 2,
         'yarn': 1,
         'final': 1,
         'today': 2,
         'approved': 1,
         'federal': 4,
         'approach': 1,
         'come': 1,
         'd.c.': 1,
         'serious': 1,
         'york': 3,
         'sales': 3,
         'officer': 2,
         'persons': 1,
         'keep': 1,
         'often': 1,
         'may': 11,
         'feet': 1,
         'january': 1,
         'receive': 1,
         'think': 1,
         'movable': 1,
         'values': 1,
         'funds': 3,
         'community': 2,
         'export-import': 1,
         'recognized': 1,
         'concern': 1,
         'john': 2,
         'peace': 4,
         'contributed': 1,
         'small': 5,
         'concerns': 2,
        

In [None]:
import random

# Does the same thing as above.
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

# Pick values from the table that we create before.
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

In [None]:
# Testing
num_neg = 3
negative_sampling(target_batch, unigram_table, num_neg)

#{'grapes': 0, 'apple': 1, 'animal': 2, 'cat': 3, 'ice': 4, 'orange': 5, 'dog': 6, 'monkey': 7, 'conda': 8, 'fruit': 9, 'banana': 10}

tensor([[ 384, 3991, 3490],
        [5876, 4921, 4700],
        [6244, 1422, 7069],
        [5271, 4046, 3943],
        [3877, 6788, 1518],
        [4700, 4281, 2161],
        [2821, 1849, 7335],
        [5811, 4700, 6398],
        [2104,  330,  699],
        [ 177, 5317, 1399]])

In [None]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_u(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(neg_embeds.size(0), -1) # BxK -> Bx1
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1] = [batch_size, k] ==sum==> [batch_size, 1]
        
        # This is what had been changed from the normal one.
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

In [None]:
# Initialize parameter
batch_size     = 10 # mini-batch size
embedding_size = 100 #so we can later plot
model          = SkipgramNegSampling(voc_size, embedding_size)
num_neg        = 10 # num of negative sampling

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
import time

start_train_time = time.time()

# Training
num_epochs = 500
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus, 2)

    # Neat trick to avoid nd.array object (This is bad practice!)
    input_batch = list(input_batch)

    # Padding since we do not cut the sentence so, It will not be in the same shape sometimes.
    lenght_batch0 = len(input_batch[0])
    lenght_batch1 = len(input_batch[1])
    pad_num = np.abs(lenght_batch0 - lenght_batch1)

    # pad the zero dimension
    if lenght_batch0 < lenght_batch1:
        input_batch[0].extend(list(np.full((pad_num, ), 0))) # Padding with zero
    # pad the first dimension
    elif lenght_batch0 > lenght_batch1:
        input_batch[1].extend(list(np.full((pad_num, ), 0)))

    
    #input_batch: [batch_size, 1]
    input_batch = torch.LongTensor(input_batch)
    
    #target_batch: [batch_size, 1]
    target_batch = torch.LongTensor(target_batch)
    
    #negs_batch:   [batch_size, num_neg]
    negs_batch = negative_sampling(target_batch, unigram_table, num_neg)
    
    optimizer.zero_grad()
        
    loss = model(input_batch, target_batch, negs_batch)
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

end_train_time = time.time()
neg_train_time_mins, neg_train_time_secs = epoch_time(start_train_time, end_train_time)
print(f'Total time use in negative sampling {neg_train_time_mins} miniute(s) {neg_train_time_secs} second')


Epoch: 100 | cost: 9.755800 | time: 0m 0s
Epoch: 200 | cost: 16.796337 | time: 0m 0s
Epoch: 300 | cost: 14.862808 | time: 0m 0s
Epoch: 400 | cost: 24.771328 | time: 0m 0s
Epoch: 500 | cost: 16.948902 | time: 0m 0s
Total time use in negative sampling 4 miniute(s) 3 second


In [None]:
# Save model
path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/Neg_Skipgrams_500.pth'
torch.save(model.state_dict(), path)

# Testing

In [None]:
# Load the data
def read_data(path):
    file = open(path, 'r') # Dataset from amamda
    contents = file.read()
    contents = contents.split('\n') # Seperate chunk of text into substring
    file.close()
    return contents

path = '/root/projects/NLP/Assignment/19_Jan_Glove/questions-words.txt'
text = read_data(path)
print(text[0:3])

[': capital-common-countries', 'Athens Greece Baghdad Iraq', 'Athens Greece Bangkok Thailand']


In [None]:
# Find the seperator name and index
seperator = [(idx, sent) for idx, sent in enumerate(text) if sent[0] == ':']
seperator 

[(0, ': capital-common-countries'),
 (507, ': capital-world'),
 (5032, ': currency'),
 (5899, ': city-in-state'),
 (8367, ': family'),
 (8874, ': gram1-adjective-to-adverb'),
 (9867, ': gram2-opposite'),
 (10680, ': gram3-comparative'),
 (12013, ': gram4-superlative'),
 (13136, ': gram5-present-participle'),
 (14193, ': gram6-nationality-adjective'),
 (15793, ': gram7-past-tense'),
 (17354, ': gram8-plural'),
 (18687, ': gram9-plural-verbs')]

In [None]:
# Let's use opposite and plural
opposite = text[9868:10680]
plural = text[17355:18687]

# Concatenate
test_text = opposite + plural

# Checking
print(test_text[0])
print(test_text[-1])

acceptable unacceptable aware unaware
woman women snake snakes


In [None]:
test_opposite = [sent.split(" ") for sent in opposite]
test_plural = [sent.split(" ") for sent in plural]
test_corpus = [sent.split(" ") for sent in test_text]
print(test_corpus[0:5])

[['acceptable', 'unacceptable', 'aware', 'unaware'], ['acceptable', 'unacceptable', 'certain', 'uncertain'], ['acceptable', 'unacceptable', 'clear', 'unclear'], ['acceptable', 'unacceptable', 'comfortable', 'uncomfortable'], ['acceptable', 'unacceptable', 'competitive', 'uncompetitive']]


In [None]:
# Flatten and get Unique words
flatten = lambda l: [item for sublist in l for item in sublist]
test_vocab = list(set(flatten(test_corpus)))
test_vocab[0:5]

['melon', 'pears', 'elephants', 'cat', 'dogs']

In [None]:
# Word2index and Index2word for test set
# Word2index and Index2word

# assign id to those vocabs
test_word2index = dict()
test_word2index.update({"<UNK>":  0})
for idx, v in enumerate(test_vocab):
        test_word2index.update({v:  idx + 1})

#add <UNK>, which is a very normal token exists in the world
test_vocab.append('<UNK>') #chaky, can it be ##UNK, or UNKKKKKK, or anything

# Testing
print(test_word2index['car'])

# index2word
test_index2word = {v:k for k, v in test_word2index.items()}

print(test_index2word[test_word2index['car']])



62
car


'''  
batch_size     = 10 # mini-batch size  
embedding_size = 100 #so we can later plot  
model          = GloVe(voc_size, embedding_size)  
model          = Skipgram(voc_size, embedding_size)  
model          = Cbow(voc_size, embedding_size)  
model          = SkipgramNegSampling(voc_size, embedding_size)  
num_neg        = 10 # num of negative sampling  
  
criterion = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=0.001)  
'''

In [None]:
# Function to get embedding
def get_embed(word, current_model):
    try:
        index = word2index[word]
    except :
        index = word2index['<UNK>'] #unknown
    word = torch.LongTensor([index])
    
    embed =  (current_model.embedding_v(word)+current_model.embedding_u(word))/2
    return np.array(embed[0].detach().numpy())

In [None]:
# We will put it in a loop soon!
models_weight_list = ['Glove_500', 'Glove_1000', 'Cbow_500', 'Skipgrams_500', 'Neg_Skipgrams_500']
model_list = [GloVe(voc_size, embedding_size), 
              GloVe(voc_size, embedding_size), 
              Cbow(voc_size, embedding_size), 
              Skipgram(voc_size, embedding_size), 
              SkipgramNegSampling(voc_size, embedding_size)]

In [None]:
# Test embeded
testing_word = 'Queen'
current_model = model_list[0]

weight_path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/Glove_500.pth'

current_weight = models_weight_list[0]
current_model.load_state_dict(torch.load(weight_path))
current_model.eval()

test_embed = get_embed(testing_word, current_model)
test_embed

array([-5.31000853e-01,  5.93215227e-02,  6.83412910e-01,  7.73887038e-01,
        7.61745512e-01, -1.96667090e-01, -6.44544661e-01, -7.25385070e-01,
       -1.08609486e+00,  3.33659559e-01, -1.16339207e-01, -3.97296727e-01,
        8.78399163e-02, -5.61352253e-01, -1.10439849e+00,  1.13723803e+00,
       -3.92948955e-01,  8.52313399e-01,  1.18349522e-01,  5.32606244e-02,
        1.05900615e-01, -9.84223962e-01,  3.51580799e-01,  8.27544630e-01,
       -8.03473473e-01, -2.28011340e-01, -4.28224951e-02, -9.74792480e-01,
       -2.17144549e-01,  5.62470794e-01,  6.65428877e-01,  7.59494007e-01,
        3.60655874e-01,  1.39651954e-01,  4.55759913e-01, -3.43697608e-01,
        6.35424137e-01,  6.83626294e-01, -9.65903521e-01, -3.18383455e-01,
        4.19828475e-01, -4.20420974e-01, -1.35199845e+00,  1.29635715e+00,
       -1.89303786e-01, -1.25500464e+00, -1.54869437e-01, -1.63623929e-01,
       -2.98520237e-01, -1.66727638e+00, -8.15379620e-01, -1.83288842e-01,
       -1.97563738e-01, -

In [None]:
#numpy version
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [None]:
def find_analogy(a,b,c,vocabs=vocab):
    emb_a, emb_b, emb_c = get_embed(a, current_model), get_embed(b, current_model), get_embed(c, current_model)
    vector = emb_b - emb_a + emb_c
    # vector_norm = (vector ** 2).sum() ** (1 / 2)
    # vector = vector / vector_norm
    # print(vector.shape)
    similarity = -1 
    
    for vocab in vocabs:
        if vocab not in [a,b,c]: #ignore input words itself
            current_sim = cos_sim(vector,get_embed(vocab, current_model))
            if current_sim > similarity:
                similarity = current_sim #update better one
                d = (vocab, similarity)
    return d

In [None]:
# Testing find_analogy functions
find_analogy('man', 'woman', 'adult')

('signals', 0.34757987)

In [None]:
find_analogy('man', 'woman', 'adult')[0]

'signals'

## Semantic testing

In [None]:
# We will put it in a loop soon!
models_weight_list = ['Glove_500', 'Glove_1000', 'Cbow_500', 'Skipgrams_500', 'Neg_Skipgrams_500']
models_name = ['Glove', 'Glove', 'Cbow', 'Skipgrams', 'Neg_Skipgrams']
voc_size = len(vocab)
embedding_size = 100
model_list = [GloVe(voc_size, embedding_size), 
              GloVe(voc_size, embedding_size), 
              Cbow(voc_size, embedding_size), 
              Skipgram(voc_size, embedding_size), 
              SkipgramNegSampling(voc_size, embedding_size)]

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
test_list = [test_opposite, test_plural]
test_list_name = ['test_opposite', 'test_plural']


In [None]:
def check_accruacy(y, yhat):
    if y == yhat:
        return True
    else:
        return False

def test_accruacy_batch(data, current_model):
    counter = 0
    for sent in data:
        label = sent[-1]
        a, b, c = sent[:-1]
        yhat = find_analogy(a, b, c)[0] # It's return in tuple form, so we need to slice to get word
        if check_accruacy(label, yhat) == True:
            counter = counter + 1
    
    return counter
        

In [None]:
# Have the model to predict the last word.
# Actually, It is better to random the label but mine model is very weak. so I will just predict last word

#models_weight_list = ['Glove_500', 'Glove_1000', 'Cbow_500', 'Skipgrams_500', 'Neg_Skipgrams_500']

main_results = list()
main_accruacy = list()
main_results_name = list()
results = list()
accruacy = list()
results_name = list()

for models_idx in range(len(models_weight_list)):
    weight_path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/' + models_weight_list[models_idx] + '.pth'
    current_model = model_list[models_idx]
    current_model.load_state_dict(torch.load(weight_path))
    current_model.eval()
    print(f'Current model = {models_name[models_idx]}')
    print(f'Current weight = {models_weight_list[models_idx]}')
    
    for idx, current_test in enumerate(test_list):
        sample_list = random.choices(current_test, k=100)
        print(f'Current_test = {test_list_name[idx]}')
        accruacy = test_accruacy_batch(sample_list, current_model)
        print(accruacy)
    

Current model = Glove
Current weight = Glove_500
Current_test = test_opposite
0
Current_test = test_plural
0
Current model = Glove
Current weight = Glove_1000
Current_test = test_opposite
0
Current_test = test_plural
0
Current model = Cbow
Current weight = Cbow_500
Current_test = test_opposite
0
Current_test = test_plural
0
Current model = Skipgrams
Current weight = Skipgrams_500
Current_test = test_opposite
0
Current_test = test_plural
0
Current model = Neg_Skipgrams
Current weight = Neg_Skipgrams_500
Current_test = test_opposite
0
Current_test = test_plural
0


### Gensim

In [253]:
import gensim
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
# Download from https://github.com/stanfordnlp/GloVe
glove_file = datapath('/root/projects/NLP/Assignment/19_Jan_Glove/glove.6B.100d.txt')
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)


In [255]:
#result = model.most_similar(positive=['woman', 'king'], negative=['man'])

for idx, current_test in enumerate(test_list):
    sample_list = random.choices(current_test, k=100)
    print(f'Current_test = {test_list_name[idx]}')

    counter = 0
    for sent in current_test:
        label = sent[-1]
        a, b, c = sent[:-1]
        yhat = model.most_similar(positive=[a, b], negative=[c])
        yhat = yhat[0][0]

        if check_accruacy(label, yhat) == True:
            counter = counter + 1

    print(accruacy)

Current_test = test_opposite
0
Current_test = test_plural
0


In [261]:
# Checking the results
sent, yhat, label

(['woman', 'women', 'snake', 'snakes'], 'men', 'snakes')

### Synthetic testing

In [278]:
# Import datasets
import pandas as pd
path = '/root/projects/NLP/Assignment/19_Jan_Glove/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'
df = pd.read_table(path, header=None)
df.head()

Unnamed: 0,0,1,2
0,tiger,cat,7.35
1,tiger,tiger,10.0
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77


In [281]:
synthetic_test_set = df.iloc[:10]
synthetic_test_set

Unnamed: 0,0,1,2
0,tiger,cat,7.35
1,tiger,tiger,10.0
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
5,media,radio,7.42
6,bread,butter,6.19
7,cucumber,potato,5.92
8,doctor,nurse,7.0
9,professor,doctor,6.62


In [288]:
input_x1 = synthetic_test_set[0]
input_x2 = synthetic_test_set[1]
label = synthetic_test_set[2]

In [273]:
# Function to get embedding
def get_embed(word, current_model):
    try:
        index = word2index[word]
    except :
        index = word2index['<UNK>'] #unknown
    word = torch.LongTensor([index])
    
    embed =  (current_model.embedding_v(word)+current_model.embedding_u(word))/2
    return np.array(embed[0].detach().numpy())

In [None]:
# We will put it in a loop soon!
models_weight_list = ['Glove_500', 'Glove_1000', 'Cbow_500', 'Skipgrams_500', 'Neg_Skipgrams_500']
models_name = ['Glove', 'Glove', 'Cbow', 'Skipgrams', 'Neg_Skipgrams']
voc_size = len(vocab)
embedding_size = 100
model_list = [GloVe(voc_size, embedding_size), 
              GloVe(voc_size, embedding_size), 
              Cbow(voc_size, embedding_size), 
              Skipgram(voc_size, embedding_size), 
              SkipgramNegSampling(voc_size, embedding_size)]

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
test_list = [test_opposite, test_plural]
test_list_name = ['test_opposite', 'test_plural']


In [291]:
# Test
from scipy import stats
current_model = model_list[0]
current_model.eval()

cat = get_embed('cat', current_model)
dog = get_embed('dog', current_model)

stats.spearmanr(cat, dog)[0]

#res = stats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])

0.034287428742874285

In [292]:
# Have the model to find word similarity.
#models_weight_list = ['Glove_500', 'Glove_1000', 'Cbow_500', 'Skipgrams_500', 'Neg_Skipgrams_500']

main_results = list()
main_results_name = list()
results = list()
results_name = list()

for models_idx in range(len(models_weight_list)):
    weight_path = '/root/projects/NLP/Assignment/19_Jan_Glove/models/' + models_weight_list[models_idx] + '.pth'
    current_model = model_list[models_idx]
    current_model.load_state_dict(torch.load(weight_path))
    current_model.eval()
    print(f'Current model = {models_name[models_idx]}')
    print(f'Current weight = {models_weight_list[models_idx]}')
    
    for idx in range(10): # We test with only fix 10 samples
        emb_x1 = get_embed(input_x1[idx], current_model)
        emb_x2 = get_embed(input_x2[idx], current_model)
        yhat = stats.spearmanr(emb_x1, emb_x2)
        yhat = yhat[0]

        results.append(yhat)

    main_results.append(results)
    main_results_name.append(models_name[models_idx])
    

Current model = Glove
Current weight = Glove_500
Current model = Glove
Current weight = Glove_1000
Current model = Cbow
Current weight = Cbow_500
Current model = Skipgrams
Current weight = Skipgrams_500
Current model = Neg_Skipgrams
Current weight = Neg_Skipgrams_500


In [295]:
len(main_results)

50

In [None]:
# # Remove header
# text = [sent for sent in text if sent[0] != ':']
# print(text[0:3])
# # Lower case
# text_formatted = [sent.lower() for sent in text]
# # Check Corpus formatted
# print(text_formatted[0:3])
