## Importing the libraries

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import time
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
np.__version__, torch.__version__

('1.24.4', '2.1.0+cu121')

In [4]:
import matplotlib
matplotlib.__version__

'3.7.2'

## 1. Load Data

In [5]:
from nltk.corpus import brown

# Create a corpus containing only documents from the 'earn' category
corpus = brown.sents()

# Limit the corpus to the first 1000 sentences for demonstration purposes
corpus = [[word.lower() for word in sentence] for sentence in corpus]
corpus = corpus[:1000]


In [6]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['k.',
 'screw',
 'clash',
 'berlin',
 'fund-raising',
 'cemal',
 'park',
 'sites',
 'timetable',
 'made',
 'together',
 'canvassers',
 'election',
 'intern',
 'agreeing',
 'begin',
 'deeper',
 'avoid',
 'dependency',
 'ill.',
 'assigned',
 'weaver',
 'exert',
 'accounts',
 'jury',
 'donations',
 'second',
 'st.',
 'merger',
 'possibility',
 'merchandise',
 'heart',
 '$60',
 'y.',
 'languages',
 'cabinet',
 'lumber',
 'troubles',
 'suffrage',
 'approach',
 'labor',
 'assign',
 'nugent',
 "france's",
 'sounded',
 '$5',
 'w.',
 'bites',
 'gainesville',
 'whipped',
 'ballot',
 'rising',
 'various',
 'reama',
 'informed',
 'industrial',
 'parsons',
 'application',
 "he's",
 'visited',
 'publicized',
 'trimble',
 'flows',
 'were',
 'fears',
 "byrd's",
 'teamsters',
 'castro',
 'byrd',
 'j.',
 '402',
 'desertion',
 'three',
 'remarkably',
 'restrained',
 'advisory',
 'looked',
 '10,000,000',
 'or',
 'marching',
 'territory',
 "nation's",
 'controversy',
 '$43,000',
 'especially',
 'these',
 

In [7]:
len(vocab)

4272

In [8]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [9]:
#vocab size
voc_size = len(vocab)
print(voc_size)

4272


In [10]:
#append UNK
vocab.append('<UNK>')

In [11]:
vocab

['k.',
 'screw',
 'clash',
 'berlin',
 'fund-raising',
 'cemal',
 'park',
 'sites',
 'timetable',
 'made',
 'together',
 'canvassers',
 'election',
 'intern',
 'agreeing',
 'begin',
 'deeper',
 'avoid',
 'dependency',
 'ill.',
 'assigned',
 'weaver',
 'exert',
 'accounts',
 'jury',
 'donations',
 'second',
 'st.',
 'merger',
 'possibility',
 'merchandise',
 'heart',
 '$60',
 'y.',
 'languages',
 'cabinet',
 'lumber',
 'troubles',
 'suffrage',
 'approach',
 'labor',
 'assign',
 'nugent',
 "france's",
 'sounded',
 '$5',
 'w.',
 'bites',
 'gainesville',
 'whipped',
 'ballot',
 'rising',
 'various',
 'reama',
 'informed',
 'industrial',
 'parsons',
 'application',
 "he's",
 'visited',
 'publicized',
 'trimble',
 'flows',
 'were',
 'fears',
 "byrd's",
 'teamsters',
 'castro',
 'byrd',
 'j.',
 '402',
 'desertion',
 'three',
 'remarkably',
 'restrained',
 'advisory',
 'looked',
 '10,000,000',
 'or',
 'marching',
 'territory',
 "nation's",
 'controversy',
 '$43,000',
 'especially',
 'these',
 

In [12]:
word2index['<UNK>'] = 0

In [13]:
word2index

{'k.': 0,
 'screw': 1,
 'clash': 2,
 'berlin': 3,
 'fund-raising': 4,
 'cemal': 5,
 'park': 6,
 'sites': 7,
 'timetable': 8,
 'made': 9,
 'together': 10,
 'canvassers': 11,
 'election': 12,
 'intern': 13,
 'agreeing': 14,
 'begin': 15,
 'deeper': 16,
 'avoid': 17,
 'dependency': 18,
 'ill.': 19,
 'assigned': 20,
 'weaver': 21,
 'exert': 22,
 'accounts': 23,
 'jury': 24,
 'donations': 25,
 'second': 26,
 'st.': 27,
 'merger': 28,
 'possibility': 29,
 'merchandise': 30,
 'heart': 31,
 '$60': 32,
 'y.': 33,
 'languages': 34,
 'cabinet': 35,
 'lumber': 36,
 'troubles': 37,
 'suffrage': 38,
 'approach': 39,
 'labor': 40,
 'assign': 41,
 'nugent': 42,
 "france's": 43,
 'sounded': 44,
 '$5': 45,
 'w.': 46,
 'bites': 47,
 'gainesville': 48,
 'whipped': 49,
 'ballot': 50,
 'rising': 51,
 'various': 52,
 'reama': 53,
 'informed': 54,
 'industrial': 55,
 'parsons': 56,
 'application': 57,
 "he's": 58,
 'visited': 59,
 'publicized': 60,
 'trimble': 61,
 'flows': 62,
 'were': 63,
 'fears': 64,
 "by

In [14]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

In [15]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'the': 1569,
         ',': 878,
         '.': 857,
         'of': 676,
         'to': 549,
         'a': 440,
         'in': 438,
         'and': 377,
         'for': 266,
         'that': 211,
         '``': 202,
         "''": 191,
         'he': 176,
         'is': 171,
         'on': 165,
         'said': 164,
         'be': 153,
         'by': 143,
         'was': 138,
         'would': 131,
         'it': 119,
         'as': 114,
         'with': 102,
         'has': 100,
         'will': 97,
         'his': 89,
         'at': 84,
         'state': 82,
         'an': 81,
         'not': 78,
         'this': 75,
         'been': 66,
         'which': 65,
         'from': 65,
         'who': 61,
         'are': 58,
         'have': 58,
         '--': 58,
         'but': 57,
         'city': 54,
         'more': 52,
         'mr.': 52,
         'administration': 50,
         'one': 48,
         'new': 48,
         'president': 48,
         'they': 46,
         'had': 45,
  

In [16]:
skip_grams = []

for doc in corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-1], doc[i+1],doc[i+2],doc[i-2]]
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('county', 'fulton'),
 ('county', 'grand'),
 ('county', 'jury'),
 ('county', 'the'),
 ('grand', 'county'),
 ('grand', 'jury'),
 ('grand', 'said'),
 ('grand', 'fulton'),
 ('jury', 'grand'),
 ('jury', 'said'),
 ('jury', 'friday'),
 ('jury', 'county'),
 ('said', 'jury'),
 ('said', 'friday'),
 ('said', 'an'),
 ('said', 'grand'),
 ('friday', 'said'),
 ('friday', 'an'),
 ('friday', 'investigation'),
 ('friday', 'jury'),
 ('an', 'friday'),
 ('an', 'investigation'),
 ('an', 'of'),
 ('an', 'said'),
 ('investigation', 'an'),
 ('investigation', 'of'),
 ('investigation', "atlanta's"),
 ('investigation', 'friday'),
 ('of', 'investigation'),
 ('of', "atlanta's"),
 ('of', 'recent'),
 ('of', 'an'),
 ("atlanta's", 'of'),
 ("atlanta's", 'recent'),
 ("atlanta's", 'primary'),
 ("atlanta's", 'investigation'),
 ('recent', "atlanta's"),
 ('recent', 'primary'),
 ('recent', 'election'),
 ('recent', 'of'),
 ('primary', 'recent'),
 ('primary', 'election'),
 ('primary', 'produced'),
 ('primary', "atlanta's"),
 (

In [17]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('of', 'the'): 376,
         ('the', 'of'): 368,
         (',', 'the'): 189,
         ('the', ','): 181,
         ('the', 'in'): 173,
         ('in', 'the'): 173,
         ('to', 'the'): 162,
         ('the', 'to'): 162,
         (',', ','): 101,
         (',', 'said'): 92,
         ('for', 'the'): 91,
         ('the', 'for'): 91,
         ('of', 'a'): 86,
         (',', 'and'): 84,
         ('and', ','): 82,
         ('a', 'of'): 80,
         ('the', '.'): 79,
         ('on', 'the'): 76,
         ('the', 'on'): 75,
         ('the', 'and'): 74,
         ('and', 'the'): 74,
         ('said', ','): 70,
         ('to', 'a'): 68,
         ('a', 'to'): 66,
         ('that', 'the'): 65,
         ('the', 'that'): 65,
         (',', 'of'): 64,
         ('of', ','): 63,
         ('he', ','): 63,
         (',', 'he'): 63,
         ("''", ','): 54,
         (',', "''"): 54,
         ('a', ','): 54,
         (',', 'a'): 53,
         (',', 'in'): 46,
         ('he', 'said'): 46,
         (

In [18]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [19]:
from itertools import combinations_with_replacement

X_ik = {}  # for keeping the co-occurrences
weighting_dic = {}  # scaling the percentage of sampling

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram) is not None:  # matches
        co_occer = X_ik_skipgrams[bigram]  # get the count from what we already counted
        X_ik[bigram] = co_occer + 1  # + 1 for stability issue
        X_ik[(bigram[1], bigram[0])] = co_occer + 1  # count also for the opposite
        # print(X_ik[(bigram[1], bigram[0])])  # count also for the opposite
    else:
        pass

    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [20]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]], word2index[doc[i+2]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)


In [21]:
import math

def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [22]:
x.shape  #batch_size, 1

(2, 1)

In [23]:
x

array([[3599],
       [1772]])

In [24]:
y.shape

(2, 1)

## 4. Model

In [25]:
len(vocab)

4273

In [26]:
embedding = nn.Embedding(63314, 2)

In [27]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape  #(batch_size, 1, emb_size)

torch.Size([2, 1, 2])

### 4.1 Skipgram with positive sampling
$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [28]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
        

### 4.2 Skipgram with negative sampling

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [29]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_center = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_outside = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_center(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_outside(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_outside(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)
                
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### 4.3 Glove

In [30]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

### 4.4 Gensim Model

In [88]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('D:/AIT/Sem2/NLP/NLP_Assignments/glove.6B.100d.txt')
model_gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [32]:
model_gensim['katoucha']

array([-0.258    , -0.068239 , -0.1293   ,  0.40934  ,  0.24704  ,
       -0.30138  ,  0.64817  , -0.39733  ,  0.098767 ,  0.27254  ,
       -0.22292  ,  0.29851  ,  0.38641  , -0.20509  ,  0.13445  ,
        0.1114   , -0.065329 ,  0.60735  ,  0.42454  , -0.16439  ,
       -0.42364  , -0.014459 , -0.49806  , -0.084324 , -0.53053  ,
       -0.14708  , -0.14706  , -0.19774  , -0.065882 ,  0.32439  ,
        0.55762  ,  0.3363   ,  0.35643  , -0.16911  ,  0.29504  ,
       -0.41179  , -0.033898 , -0.34218  ,  0.0972   , -0.14092  ,
       -0.063052 , -0.080234 ,  0.059456 , -0.050595 , -0.42402  ,
        0.41918  , -0.0025027,  0.35303  , -0.070322 ,  0.43291  ,
       -0.26104  ,  0.04959  , -0.30767  ,  0.19803  ,  0.41325  ,
        1.0292   , -0.3959   , -0.014833 , -0.3658   , -0.47339  ,
        0.12888  , -0.45944  ,  0.27612  ,  0.11627  , -0.40329  ,
        0.21118  , -0.38505  , -0.1359   , -0.36774  ,  0.013439 ,
        0.81402  ,  0.23368  ,  0.080804 ,  0.54235  ,  0.3942

## 5.Training

In [33]:
#prepare all vocab

batch_size = 2
voc_size   = len(vocab)
emb_size = 2

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 4270, 4271,    0],
        [   0,    1,    2,  ..., 4270, 4271,    0]])

In [34]:
model_skipgram_positive = Skipgram(voc_size, emb_size)
model_skipgram_positive

Skipgram(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
)

In [35]:
model_glove = Glove(voc_size, emb_size)
model_glove

Glove(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
  (center_bias): Embedding(4273, 1)
  (outside_bias): Embedding(4273, 1)
)

In [36]:
model_skipgram_negative = Skipgram(voc_size, emb_size)
model_skipgram_negative

Skipgram(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
)

In [37]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [38]:
loss_skipgram_positive = model_skipgram_positive(input_tensor, label_tensor, all_vocabs)
loss_skipgram_negative = model_skipgram_negative(input_tensor, label_tensor, all_vocabs)
# x, y, cooc, weighting = random_batch_glove(batch_size, corpus, skip_grams, X_ik, weighting_dic)

# loss_glove = model_glove(torch.LongTensor(x), torch.LongTensor(y), torch.LongTensor(cooc), torch.LongTensor(weighting))

In [39]:
batch_size = 2
emb_size   = 2
model_skipgram_positive      = Skipgram(voc_size, emb_size)
optimizer_skipgram_positive  = optim.Adam(model_skipgram_positive.parameters(), lr=0.001)
optimizer_skipgram_negative  = optim.Adam(model_skipgram_negative.parameters(), lr=0.001)

criterion = nn.CrossEntropyLoss()
optimizer_glove = optim.Adam(model_glove.parameters(), lr=0.001)

In [40]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [41]:
num_epochs = 10
total_start = time.time()

for epoch in range(num_epochs):
    start = time.time()
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss_skipgram_positive = model_skipgram_positive(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer_skipgram_positive.zero_grad()
    loss_skipgram_positive.backward()

    #update alpha
    optimizer_skipgram_positive.step()
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    #print the loss_skipgram_positive
    # if (epoch + 1) % 1000 == 0:
    print("Positive Skigram")
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss_skipgram_positive:2.6f}| time: {epoch_mins}m {epoch_secs}s")
# Record the ending time
total_end = time.time()

# Calculate and print the total runtime
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")

Positive Skigram
Epoch      1 | Loss: 8.736550| time: 0m 1s
Positive Skigram
Epoch      2 | Loss: 9.588938| time: 0m 0s
Positive Skigram
Epoch      3 | Loss: 8.279503| time: 0m 0s
Positive Skigram
Epoch      4 | Loss: 8.162359| time: 0m 0s
Positive Skigram
Epoch      5 | Loss: 9.460874| time: 0m 0s
Positive Skigram
Epoch      6 | Loss: 7.657609| time: 0m 0s
Positive Skigram
Epoch      7 | Loss: 8.082825| time: 0m 0s
Positive Skigram
Epoch      8 | Loss: 8.297731| time: 0m 0s
Positive Skigram
Epoch      9 | Loss: 8.738638| time: 0m 0s
Positive Skigram
Epoch     10 | Loss: 9.642736| time: 0m 0s
Total runtime: 3.17 seconds


In [42]:
num_epochs = 10


total_start = time.time()
for epoch in range(num_epochs):
    start = time.time()
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss_skipgram_negative = model_skipgram_negative(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer_skipgram_negative.zero_grad()
    loss_skipgram_negative.backward()
    
    #update alpha
    optimizer_skipgram_negative.step()

    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    #print the loss_skipgram_positive
    # if (epoch + 1) % 1000 == 0:
    print("Negative Skigram")
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss_skipgram_negative:2.6f} | time: {epoch_mins}m {epoch_secs}s")
# Record the ending time
total_end = time.time()

# Calculate and print the total runtime
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")


Negative Skigram
Epoch      1 | Loss: 11.773346 | time: 0m 0s
Negative Skigram
Epoch      2 | Loss: 9.498462 | time: 0m 0s
Negative Skigram
Epoch      3 | Loss: 7.865197 | time: 0m 0s
Negative Skigram
Epoch      4 | Loss: 8.451919 | time: 0m 0s
Negative Skigram
Epoch      5 | Loss: 10.598991 | time: 0m 0s
Negative Skigram
Epoch      6 | Loss: 12.297829 | time: 0m 0s
Negative Skigram
Epoch      7 | Loss: 9.034821 | time: 0m 0s
Negative Skigram
Epoch      8 | Loss: 9.289711 | time: 0m 0s
Negative Skigram
Epoch      9 | Loss: 8.388338 | time: 0m 0s
Negative Skigram
Epoch     10 | Loss: 7.087474 | time: 0m 0s
Total runtime: 1.63 seconds


In [43]:
num_epochs = 10

for epoch in range(num_epochs):
    start = time.time()
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch)
    
    #predict   
    loss_glove = model_glove(input_batch, target_batch, cooc_batch, weighting_batch)
    
    #backprogate
    optimizer_glove.zero_grad()
    loss_glove.backward()
    
    #update alpha
    optimizer_glove.step()

    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    #print the loss_skipgram_positive
    # if (epoch + 1) % 1000 == 0:
    print("Glove")
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss_glove:2.6f} | time: {epoch_mins}m {epoch_secs}s")
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")

Glove
Epoch      1 | Loss: 8.722761 | time: 0m 0s
Glove
Epoch      2 | Loss: 1.987830 | time: 0m 0s
Glove
Epoch      3 | Loss: 22.940914 | time: 0m 0s
Glove
Epoch      4 | Loss: 0.134398 | time: 0m 0s
Glove
Epoch      5 | Loss: 0.300638 | time: 0m 0s
Glove
Epoch      6 | Loss: 7.932749 | time: 0m 0s
Glove
Epoch      7 | Loss: 0.814005 | time: 0m 0s
Glove
Epoch      8 | Loss: 2.180379 | time: 0m 0s
Glove
Epoch      9 | Loss: 0.327231 | time: 0m 0s
Glove
Epoch     10 | Loss: 0.749558 | time: 0m 0s
Total runtime: 1.63 seconds


| Model           | Window Size  | Training Loss | Training Time (sec) |
|------------------|-------------|---------------|---------------------|
| Skipgram         |      2      |    8.68       |     2.6             |
| Skipgram (NEG)   |      2      |    8.56       |     1.23            |
| Glove            |      2      |    0.8        |     1.23            |
| Glove (Gensim)   |      -      |    -          |      -              |


## 6. Embeddings

In [44]:
def get_embed(model, word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    
    return embed[0][0].item(), embed[0][1].item()

In [45]:
import torch
import numpy as np

def get_embed_for_corpus(model, words):
    embeddings = {}

    for word in words:
        try:
            index = word2index[word]
        except KeyError:
            index = word2index['<UNK>']

        word_tensor = torch.LongTensor([index])

        embed_c = model.embedding_center(word_tensor)
        embed_o = model.embedding_outside(word_tensor)
        embed = (embed_c + embed_o) / 2

        embeddings[word] = np.array([embed[0][0].item(), embed[0][1].item()])

    return embeddings


## 7. Cosine similarity

In [46]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [47]:
def cosine_similarity_for_corpus(embeddings, target_word):
    similarities = []

    target_index = word2index.get(target_word, word2index['<UNK>'])
    target_vector = embeddings[target_index]

    for word, vector in embeddings.items():
        similarity = cosine_similarity(target_vector, vector)
        similarities.append((word, similarity))

    return similarities

## Finding the predicted y using the different models
### Using Word analogies dataset  
Dataset taken from [website](https://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt).


In [48]:
# Specify the path to your .txt file
file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/word-test.v1.txt'

# Read the content of the file
with open(file_path, 'r') as file:
    # Skip the first line
    file.readline()

    # Read the remaining content of the file
    file_content = file.readlines()

# Initialize variables to store relevant lines
total_corpus = []

# Variable to keep track of the current heading
current_heading = None

# Iterate through each line in the file content
for line in file_content:
    # Check if the line is a heading
    if line.startswith(':'):
        current_heading = line.strip()
    else:
        # Split the line into individual words and convert to lowercase
        words = [word.lower() for word in line.strip().split()]
        total_corpus.append(words)


In [49]:
total_corpus

[['athens', 'greece', 'baghdad', 'iraq'],
 ['athens', 'greece', 'bangkok', 'thailand'],
 ['athens', 'greece', 'beijing', 'china'],
 ['athens', 'greece', 'berlin', 'germany'],
 ['athens', 'greece', 'bern', 'switzerland'],
 ['athens', 'greece', 'cairo', 'egypt'],
 ['athens', 'greece', 'canberra', 'australia'],
 ['athens', 'greece', 'hanoi', 'vietnam'],
 ['athens', 'greece', 'havana', 'cuba'],
 ['athens', 'greece', 'helsinki', 'finland'],
 ['athens', 'greece', 'islamabad', 'pakistan'],
 ['athens', 'greece', 'kabul', 'afghanistan'],
 ['athens', 'greece', 'london', 'england'],
 ['athens', 'greece', 'madrid', 'spain'],
 ['athens', 'greece', 'moscow', 'russia'],
 ['athens', 'greece', 'oslo', 'norway'],
 ['athens', 'greece', 'ottawa', 'canada'],
 ['athens', 'greece', 'paris', 'france'],
 ['athens', 'greece', 'rome', 'italy'],
 ['athens', 'greece', 'stockholm', 'sweden'],
 ['athens', 'greece', 'tehran', 'iran'],
 ['athens', 'greece', 'tokyo', 'japan'],
 ['baghdad', 'iraq', 'bangkok', 'thailand'

In [50]:
file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/word-test.v1.txt'

# Read the content of the file
with open(file_path, 'r') as file:
    file_content = file.readlines()

# Initialize variables to store relevant lines
capital_common_countries = []
past_tense = []

# Variable to keep track of the current heading
current_heading = None

# Iterate through each line in the file content
for line in file_content:
    # Check if the line is a heading
    if line.startswith(':'):
        current_heading = line.strip()
    elif current_heading == ': capital-common-countries':
        # Split the line into individual words and convert to lowercase
        words = [word.lower() for word in line.strip().split()]
        capital_common_countries.append(words)
    elif current_heading == ': gram7-past-tense':
        # Split the line into individual words and convert to lowercase
        words = [word.lower() for word in line.strip().split()]
        past_tense.append(words)


In [51]:
total_corpus

[['athens', 'greece', 'baghdad', 'iraq'],
 ['athens', 'greece', 'bangkok', 'thailand'],
 ['athens', 'greece', 'beijing', 'china'],
 ['athens', 'greece', 'berlin', 'germany'],
 ['athens', 'greece', 'bern', 'switzerland'],
 ['athens', 'greece', 'cairo', 'egypt'],
 ['athens', 'greece', 'canberra', 'australia'],
 ['athens', 'greece', 'hanoi', 'vietnam'],
 ['athens', 'greece', 'havana', 'cuba'],
 ['athens', 'greece', 'helsinki', 'finland'],
 ['athens', 'greece', 'islamabad', 'pakistan'],
 ['athens', 'greece', 'kabul', 'afghanistan'],
 ['athens', 'greece', 'london', 'england'],
 ['athens', 'greece', 'madrid', 'spain'],
 ['athens', 'greece', 'moscow', 'russia'],
 ['athens', 'greece', 'oslo', 'norway'],
 ['athens', 'greece', 'ottawa', 'canada'],
 ['athens', 'greece', 'paris', 'france'],
 ['athens', 'greece', 'rome', 'italy'],
 ['athens', 'greece', 'stockholm', 'sweden'],
 ['athens', 'greece', 'tehran', 'iran'],
 ['athens', 'greece', 'tokyo', 'japan'],
 ['baghdad', 'iraq', 'bangkok', 'thailand'

In [52]:
capital_common_countries

[['athens', 'greece', 'baghdad', 'iraq'],
 ['athens', 'greece', 'bangkok', 'thailand'],
 ['athens', 'greece', 'beijing', 'china'],
 ['athens', 'greece', 'berlin', 'germany'],
 ['athens', 'greece', 'bern', 'switzerland'],
 ['athens', 'greece', 'cairo', 'egypt'],
 ['athens', 'greece', 'canberra', 'australia'],
 ['athens', 'greece', 'hanoi', 'vietnam'],
 ['athens', 'greece', 'havana', 'cuba'],
 ['athens', 'greece', 'helsinki', 'finland'],
 ['athens', 'greece', 'islamabad', 'pakistan'],
 ['athens', 'greece', 'kabul', 'afghanistan'],
 ['athens', 'greece', 'london', 'england'],
 ['athens', 'greece', 'madrid', 'spain'],
 ['athens', 'greece', 'moscow', 'russia'],
 ['athens', 'greece', 'oslo', 'norway'],
 ['athens', 'greece', 'ottawa', 'canada'],
 ['athens', 'greece', 'paris', 'france'],
 ['athens', 'greece', 'rome', 'italy'],
 ['athens', 'greece', 'stockholm', 'sweden'],
 ['athens', 'greece', 'tehran', 'iran'],
 ['athens', 'greece', 'tokyo', 'japan'],
 ['baghdad', 'iraq', 'bangkok', 'thailand'

In [53]:
# Flatten the 2D list into a list of lists
flattened_list_of_country = [word for pair in capital_common_countries for word in pair]

# Wrap the flattened list in another list
resulting_capital_list = [flattened_list_of_country]

# Flatten the 2D list into a list of lists
flattened_list_of_past_tense = [word for pair in past_tense for word in pair]

# Wrap the flattened list in another list
resulting_capital_list = [flattened_list_of_country]
resulting_past_tense_list = [flattened_list_of_past_tense]

# Flatten the 2D list into a list of lists
flattened_list_total_words = [word for pair in total_corpus for word in pair]
# Wrap the flattened list in another list
resulting_total_corpus = [flattened_list_total_words]


flatten = lambda l: [item for sublist in l for item in sublist]
capital_list = list(set(flatten(resulting_capital_list)))
past_tense_list = list(set(flatten(resulting_past_tense_list)))
whole_corpus = list(set(flatten(resulting_total_corpus)))

In [54]:
capital_list

['japan',
 'islamabad',
 'berlin',
 'cuba',
 'italy',
 'london',
 'finland',
 'helsinki',
 'beijing',
 'egypt',
 'oslo',
 'hanoi',
 'athens',
 'switzerland',
 'rome',
 'afghanistan',
 'tokyo',
 'norway',
 'havana',
 'spain',
 'germany',
 'vietnam',
 'iran',
 'australia',
 'england',
 'canada',
 'ottawa',
 'bern',
 'kabul',
 'china',
 'bangkok',
 'cairo',
 'paris',
 'baghdad',
 'russia',
 'france',
 'thailand',
 'canberra',
 'moscow',
 'pakistan',
 'sweden',
 'stockholm',
 'tehran',
 'madrid',
 'greece',
 'iraq']

In [55]:
# Now you can use the flat_capital_common_countries list
embed_capital_glove = get_embed_for_corpus(model_glove, capital_list)
embed_capital_skipgram_positive = get_embed_for_corpus(model_skipgram_positive, capital_list)
embed_capital_skipgram_negative = get_embed_for_corpus(model_skipgram_negative, capital_list)

# Flatten the capital_common_countries list
embed_past_tense_glove = get_embed_for_corpus(model_glove, past_tense_list)
embed_past_tense_skipgram_positive = get_embed_for_corpus(model_skipgram_positive, past_tense_list)
embed_past_tense_skipgram_negative = get_embed_for_corpus(model_skipgram_negative, past_tense_list)

# Flatten the capital_common_countries list
embed_total_glove = get_embed_for_corpus(model_glove, whole_corpus)
embed_whole_skipgram_positive = get_embed_for_corpus(model_skipgram_positive, whole_corpus)
embed_whole_skipgram_negative = get_embed_for_corpus(model_skipgram_negative, whole_corpus)


In [75]:
# y_pred for glove for the capital list
y_pred_glove_country = []

for i in capital_common_countries:  # Assuming the embeddings are 2-dimensional
    y = embed_capital_glove[i[1]] - embed_capital_glove[i[0]] + embed_capital_glove[i[2]]
    y_pred_glove_country.append(y)


In [76]:
# y_pred for glove for the past tense list
y_pred_glove_past = []

for i in past_tense:  # Assuming the embeddings are 2-dimensional
    y = embed_past_tense_glove[i[1]] - embed_past_tense_glove[i[0]] + embed_past_tense_glove[i[2]]
    y_pred_glove_past.append(y)


In [77]:
# y_pred for skipgram negative sampling for the capital list
y_pred_neg_samp_country = []

# Replace 'athens', 'greece', 'bangkok' with your specific words
for i in capital_common_countries:  # Assuming the embeddings are 2-dimensional
    y = embed_capital_skipgram_negative[i[1]] - embed_capital_skipgram_negative[i[0]] + embed_capital_skipgram_negative[i[2]]
    y_pred_neg_samp_country.append(y)


In [78]:
# y_pred for skipgram negative sampling for the past tense list
y_pred_neg_samp_past = []

# Replace 'athens', 'greece', 'bangkok' with your specific words
for i in past_tense:  # Assuming the embeddings are 2-dimensional
    y = embed_past_tense_skipgram_negative[i[0]] - embed_past_tense_skipgram_negative[i[0]] + embed_past_tense_skipgram_negative[i[2]]
    y_pred_neg_samp_past.append(y)


In [79]:
# y_pred for skipgram positive sampling for the country list
y_pred_positive_samp_country = []

# Replace 'athens', 'greece', 'bangkok' with your specific words
for i in capital_common_countries:  # Assuming the embeddings are 2-dimensional
    y = embed_capital_skipgram_positive[i[1]] - embed_capital_skipgram_positive[i[0]] + embed_capital_skipgram_positive[i[2]]
    y_pred_positive_samp_country.append(y)


In [80]:
# y_pred for skipgram positive sampling for the past tense list
y_pred_positive_past_tense = []

# Replace 'athens', 'greece', 'bangkok' with your specific words
for i in past_tense:  # Assuming the embeddings are 2-dimensional
    y = embed_past_tense_skipgram_positive[i[1]] - embed_past_tense_skipgram_positive[i[0]] + embed_past_tense_skipgram_positive[i[2]]
    y_pred_positive_past_tense.append(y)


In [81]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [82]:
def find_max_cosine_words(y_pred, embeddings):
    """
    Find the word with the maximum cosine similarity for each vector in y_pred.

    Parameters:
    - y_pred: List of vectors for which to find the max cosine similarity words.
    - embeddings: Dictionary of word embeddings.

    Returns:
    - List of words with the maximum cosine similarity for each vector in y_pred.
    """
    max_cosine_words = []

    for j in range(len(y_pred)):
        max_cosine = -1
        max_cosine_word = ""

        for i in embeddings.keys():
            cosine_temp = cosine_similarity(y_pred[j], embeddings[i])

            if cosine_temp > max_cosine:
                max_cosine_word = i
                max_cosine = cosine_temp

        max_cosine_words.append(max_cosine_word)

    return max_cosine_words

# Example usage:
cosine_neg_samp_syntatical = find_max_cosine_words(y_pred_neg_samp_country, embed_capital_skipgram_negative)
cosine_positive_samp_syntatical = find_max_cosine_words(y_pred_positive_samp_country, embed_capital_skipgram_positive)
cosine_glove_syntatical = find_max_cosine_words(y_pred_glove_country, embed_capital_glove)


In [143]:
from heapq import nlargest

def find_next_10_cosine_words_for_word(target_word, embeddings, top_n=10):
    """
    Find the next 10 words with the maximum cosine similarity for a user-provided specific word.

    Parameters:
    - target_word: The word for which to find the next 10 cosine similarity words.
    - embeddings: Dictionary of word embeddings.
    - top_n: Number of top words to retrieve for the target word (default is 10).

    Returns:
    - List of the next 10 words with the maximum cosine similarity for the target word or ["Word not in Corpus"].
    """
    if target_word not in embeddings:
        return ["Word not in Corpus"]

    target_vector = embeddings[target_word]
    cosine_similarities = [(word, cosine_similarity(target_vector, embeddings[word])) for word in embeddings.keys()]
    top_n_words = nlargest(top_n + 1, cosine_similarities, key=lambda x: x[1])

    # Exclude the target word itself
    top_n_words = [word for word, _ in top_n_words if word != target_word]

    return top_n_words[:10]

# Example usage:
user_target_word = 'greece'
next_10_cosine_for_user_word = find_next_10_cosine_words_for_word(user_target_word, embed_whole_skipgram_negative, top_n=10)

# Print the results
if next_10_cosine_for_user_word == ["Word not in Corpus"]:
    print("Word not in Corpus")
else:
    print(f"Next 10 similar words for user-provided word '{user_target_word}': {next_10_cosine_for_user_word}")


Next 10 similar words for user-provided word 'greece': ['massachusetts', 'good', 'unpleasant', 'look', 'most', 'play', 'stronger', 'speak', 'wife', 'saw']


## 9. Accuracy

### Syntatical Accuracy

In [84]:
def calculate_accuracy(predictions, true_words):
    """
    Calculate accuracy based on predictions and true words.

    Parameters:
    - predictions: List of predicted words.
    - true_words: List of true words.

    Returns:
    - Accuracy as a percentage.
    """
    total_trials = len(predictions)
    total_correct = sum(1 for pred_word in predictions if pred_word in true_words)

    accuracy = (total_correct / total_trials) * 100

    return accuracy

# Example usage:
syntatical_accuracy_neg_samp = calculate_accuracy(find_max_cosine_words(y_pred_neg_samp_country, embed_whole_skipgram_negative), [true_word[3] for true_word in capital_common_countries])
syntatical_accuracy_pos_samp = calculate_accuracy(find_max_cosine_words(y_pred_positive_samp_country, embed_whole_skipgram_positive), [true_word[3] for true_word in capital_common_countries])
syntatical_accuracy_glove = calculate_accuracy(find_max_cosine_words(y_pred_glove_country, embed_total_glove), [true_word[3] for true_word in capital_common_countries])
print("Syntatical Accuracy of Skipgram Negative: {:.10f}%".format(syntatical_accuracy_neg_samp))
print("Syntatical Accuracy of Skipgram Positive: {:.10f}%".format(syntatical_accuracy_pos_samp))
print("Syntatical Accuracy of Glove: {:.10f}%".format(syntatical_accuracy_glove))


Syntatical Accuracy of Skipgram Negative: 14.2292490119%
Syntatical Accuracy of Skipgram Positive: 14.4268774704%
Syntatical Accuracy of Glove: 14.6245059289%


In [85]:
input_file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/word-test.v1.txt'
output_file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/word-test-without-first-line.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Write all lines except the first line to the output file
    output_file.writelines(lines[1:])

print(f"First line removed and content saved to: {output_file_path}")


First line removed and content saved to: D:/AIT/Sem2/NLP/NLP_Assignment/word-test-without-first-line.txt


In [86]:
input_file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/word-test.v1.txt'
output_file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/capital.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Flag to indicate whether to start writing lines
    start_writing = False

    # Iterate through lines
    for line in lines:
        # Check if the line starts with ': gram7-past-tense'
        if line.startswith(': capital-common-countries'):
            # Set the flag to start writing
            start_writing = True
        elif line.startswith(':'):
            # If a new section header is encountered, stop writing
            start_writing = False

        # Write lines to the output file if the flag is True
        if start_writing:
            output_file.write(line)

print(f"Lines starting with ': capital-countries' saved to: {output_file_path}")


Lines starting with ': capital-countries' saved to: D:/AIT/Sem2/NLP/NLP_Assignment/capital.txt


In [89]:
analogy_score_syn = model_gensim.evaluate_word_analogies(datapath('D:/AIT/Sem2/NLP/NLP_Assignment/capital.txt'))
print("Syntatical Accuracy of Model Gensim:", analogy_score_syn[0])

Syntatical Accuracy of Model Gensim: 0.9387351778656127


In [90]:
def calculate_accuracy(predictions, true_words):
    """
    Calculate accuracy based on predictions and true words.

    Parameters:
    - predictions: List of predicted words.
    - true_words: List of true words.

    Returns:
    - Accuracy as a percentage.
    """
    total_trials = len(predictions)
    total_correct = sum(1 for pred_word in predictions if pred_word in true_words)

    accuracy = (total_correct / total_trials) * 100

    return accuracy

# Example usage:
sematic_accuracy_neg_samp = calculate_accuracy(find_max_cosine_words(y_pred_neg_samp_past, embed_whole_skipgram_negative), [true_word[3] for true_word in past_tense])
sematic_accuracy_pos_samp = calculate_accuracy(find_max_cosine_words(y_pred_positive_past_tense, embed_whole_skipgram_positive), [true_word[3] for true_word in past_tense])
sematic_accuracy_glove = calculate_accuracy(find_max_cosine_words(y_pred_glove_past, embed_total_glove), [true_word[3] for true_word in past_tense])
print("Sematic Accuracy of Skipgram Negative: {:.2f}%".format(sematic_accuracy_neg_samp))
print("Sematic Accuracy of Skipgram Positive: {:.2f}%".format(sematic_accuracy_pos_samp))
print("Sematic Accuracy of Glove: {:.2f}%".format(sematic_accuracy_glove))


Sematic Accuracy of Skipgram Negative: 0.00%
Sematic Accuracy of Skipgram Positive: 14.49%
Sematic Accuracy of Glove: 12.44%


In [91]:
input_file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/word-test.v1.txt'
output_file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/past_tense_lines.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Flag to indicate whether to start writing lines
    start_writing = False

    # Iterate through lines
    for line in lines:
        # Check if the line starts with ': gram7-past-tense'
        if line.startswith(': gram7-past-tense'):
            # Set the flag to start writing
            start_writing = True
        elif line.startswith(':'):
            # If a new section header is encountered, stop writing
            start_writing = False

        # Write lines to the output file if the flag is True
        if start_writing:
            output_file.write(line)

print(f"Lines starting with ': gram7-past-tense' saved to: {output_file_path}")


Lines starting with ': gram7-past-tense' saved to: D:/AIT/Sem2/NLP/NLP_Assignment/past_tense_lines.txt


In [94]:
analogy_score_sem = model_gensim.evaluate_word_analogies(datapath('D:/AIT/Sem2/NLP/NLP_Assignment/past_tense_lines.txt'))
print("Semantic Accuracy of Model Gensim:", analogy_score_sem[0])

Semantic Accuracy of Model Gensim: 0.5544871794871795


### Comparision between models


| Model            | Window Size | Training Loss | Training Time(sec) | Syntactic Accuracy | Semantic Accuracy |
|------------------|-------------|---------------|--------------------|--------------------|-------------------|
| Skipgram         |      2      |      8.68     |      2.6           |        14.43%      |     14.49%        |
| Skipgram (NEG)   |      2      |      8.56     |      1.23          |        14.23%      |     0.00%         |
| Glove            |      2      |      0.8      |      1.23          |        14.62%      |     12.44%        |
| Glove (Gensim)   |      -      |       -       |       -            |         93.8%      |     0.55%         |


## Using similarity dataset
Dataset taken from  [website](http://alfonseca.org/eng/research/wordsim353.html).

In [None]:
import pandas as pd

# Specify the path to your text file
file_path = 'D:/AIT/Sem2/NLP/NLP_Assignment/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'

# Define the column names
columns = ['word_1', 'word_2', 'similarity_index']

# Read the text file into a pandas DataFrame with specified column names
df = pd.read_csv(file_path, sep='\t', header=None, names=columns)

df


Unnamed: 0,word_1,word_2,similarity_index
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [None]:
get_embed(model_skipgram_negative,'<UNK>')

(-0.014540418982505798, 0.23793870210647583)

In [None]:
# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    word_1 = row['word_1']
    word_2 = row['word_2']

    try:
        # Attempt to get embeddings and compute the dot product
        embed_1_neg_samp = get_embed(model_skipgram_negative, word_1)
        embed_2_neg_samp = get_embed(model_skipgram_negative, word_2)
        embed_1_pos_samp = get_embed(model_skipgram_positive, word_1)
        embed_2_pos_samp = get_embed(model_skipgram_positive, word_2)
        embed_1_glove = get_embed(model_glove, word_1)
        embed_2_glove = get_embed(model_glove, word_2)

    except KeyError:
        # Handle the case where one or both words are not present in the model
        # Replace missing embeddings with the embedding of '<UNK>' or any other suitable value
        embed_1_neg_samp = get_embed(model_skipgram_negative, '<UNK>')
        embed_2_neg_samp = get_embed(model_skipgram_negative, '<UNK>')
        embed_1_pos_samp = get_embed(model_skipgram_positive, '<UNK>')
        embed_2_pos_samp = get_embed(model_skipgram_positive, '<UNK>')
        embed_1_glove = get_embed(model_glove, '<UNK>')
        embed_2_glove = get_embed(model_glove, '<UNK>')

    # Compute the dot product and update the DataFrame
    df.at[index, 'dot_product_neg_samp'] = np.dot(embed_1_neg_samp, embed_2_neg_samp)
    df.at[index, 'dot_product_pos_samp'] = np.dot(embed_1_pos_samp, embed_2_pos_samp)
    df.at[index, 'dot_product_glove'] = np.dot(embed_1_glove, embed_2_glove)

df

Unnamed: 0,word_1,word_2,similarity_index,dot_product_neg_samp,dot_product_pos_samp,dot_product_glove
0,tiger,cat,7.35,0.056826,3.360918,1.155388
1,tiger,tiger,10.00,0.056826,3.360918,1.155388
2,plane,car,5.77,0.056826,3.360918,1.155388
3,train,car,6.31,0.056826,3.360918,1.155388
4,television,radio,6.77,0.086075,-0.084377,0.101945
...,...,...,...,...,...,...
198,rooster,voyage,0.62,0.056826,3.360918,1.155388
199,noon,string,0.54,0.056826,3.360918,1.155388
200,chord,smile,0.54,0.056826,3.360918,1.155388
201,professor,cucumber,0.31,0.056826,3.360918,1.155388


In [None]:
from scipy.stats import spearmanr

# Compute the Spearman correlation between the provided similarity scores and your models' dot products
correlation_neg, _ = spearmanr(df['similarity_index'], df['dot_product_neg_samp'])
correlation_pos, _ = spearmanr(df['similarity_index'], df['dot_product_pos_samp'])
correlation_glove, _ = spearmanr(df['similarity_index'], df['dot_product_glove'])


# Display the correlation coefficient
print(f"Spearman Correlation Coefficient of Skipgram Negative: {correlation_neg:.4f}")
print(f"Spearman Correlation Coefficient of Skipgram Positive: {correlation_pos:.4f}")
print(f"Spearman Correlation Coefficient of Glove: {correlation_glove:.4f}")

Spearman Correlation Coefficient of Skipgram Negative: 0.0568
Spearman Correlation Coefficient of Skipgram Positive: 0.0320
Spearman Correlation Coefficient of Glove: 0.0204


In [None]:
# Assuming df is your DataFrame
y_true = df['similarity_index'].mean()

print(f"y_true: {y_true:.2f}")


y_true: 5.13


In [None]:
correlation_coefficient = model_gensim.evaluate_word_pairs(datapath('D:/AIT/Sem2/NLP/NLP_Assignments/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'))
print(f"Correlation coefficient: {correlation_coefficient[1][0]:.2f}")
# correlation_coefficient

Correlation coefficient: 0.60


| Model           | Skipgram (POS) | Skipgram (NEG)   | GloVe   | GloVe (gensim) | Y true     |
|-----------------|----------------|------------------|---------|----------------|------------|
| MSE             |     0.0568     |     0.0320       | 0.0204  | 0.60           | 5.13       |



## Model Dump

In [None]:
import pickle

# Assuming you have a Gensim Word2Vec model named 'model'
# You can replace 'Word2Vec' with the specific Gensim model you are using

# Save the Gensim model to a file using pickle
gensim_model_path = 'model/model_gensim.pkl'

with open(gensim_model_path, 'wb') as model_file:
    pickle.dump(model_gensim, model_file)

print(f"Gensim model saved to: {gensim_model_path}")


Gensim model saved to: model/model_gensim.pkl


In [None]:
# Specify the path to your pickled Gensim model file
gensim_model_path = 'model/model_gensim.pkl'

# Load the Gensim model from the pickle file
with open(gensim_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
for i in range (1,10):
    print(loaded_model.most_similar('language')[i][0])


word
spoken
arabic
english
dialect
vocabulary
text
translation
words


In [110]:
import pickle

# Assuming embed_capital_skipgram_negative is your embedding dictionary
embedding_dict = embed_whole_skipgram_positive

# Specify the file path to save the pickle file
pickle_file_path = 'model/embed_skipgram_positive.pkl'

# Open the file in binary write mode and dump the dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary saved to: {pickle_file_path}")


Embedding dictionary saved to: model/embed_skipgram_positive.pkl


In [117]:
import pickle

# Assuming embed_capital_skipgram_negative is your embedding dictionary
embedding_dict = embed_whole_skipgram_negative

# Specify the file path to save the pickle file
pickle_file_path = 'model/embed_skipgram_negative.pkl'

# Open the file in binary write mode and dump the dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary saved to: {pickle_file_path}")


Embedding dictionary saved to: model/embed_skipgram_negative.pkl


In [144]:
import pickle

# Assuming embed_capital_skipgram_negative is your embedding dictionary
embedding_dict = embed_total_glove

# Specify the file path to save the pickle file
pickle_file_path = 'model/embed_glove.pkl'

# Open the file in binary write mode and dump the dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary saved to: {pickle_file_path}")


Embedding dictionary saved to: model/embed_glove.pkl


In [126]:
import pickle

# Specify the path to the pickled file on the server
pickle_file_path = 'model/embed_skipgram_positive.pkl'

# Load the embedding dictionary from the pickled file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_neg = pickle.load(pickle_file)


In [118]:
import pickle

# Specify the path to the pickled file on the server
pickle_file_path = 'model/embed_skipgram_negative.pkl'

# Load the embedding dictionary from the pickled file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_pos = pickle.load(pickle_file)


In [147]:
import pickle

# Specify the path to the pickled file on the server
pickle_file_path = 'model/embed_glove.pkl'

# Load the embedding dictionary from the pickled file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_glove = pickle.load(pickle_file)


In [149]:
user_target_word = "thailand"
next_10_cosine_for_user_word = find_next_10_cosine_words_for_word(user_target_word, embedding_dict_glove, top_n=10)

# Print the results
if next_10_cosine_for_user_word == ["Word not in Corpus"]:
    print("Word not in Corpus")
else:
    print(f"Next 10 similar words for user-provided word '{user_target_word}': {next_10_cosine_for_user_word}")

Next 10 similar words for user-provided word 'thailand': ['irish', 'faster', 'walk', 'illinois', 'cuba', 'play', 'largest', 'say', 'quickly', 'greatest']


## Observations

The window size, length of corpus plays a very important hand in determining the accuracy of a model.
The models from scratch (Skipgram (positive sampling, negative sampling), Glove) performed quite badly when compared to the Gensim model as they are trained with a low corpus size (1000) in a low epoch time of 10 and in a reduced window size of 2. 
Similarly, increasing the corpus size lead to issues in the glove model while finding the co-occurence size.
The training loss of Glove is significantly lower in comparision to Skipgram and Skipgram (NEG) sugeesting that the Glove has learned efficiently during the training which might be due to the fact it is a simple model.
While Skipgram (NEG) has the shortest training time, which could be attributed to the negative sampling technique used in training.
Whereas Skipgram Positive presented itself with better syntatical accuracy which might present itself as a task of positive sampling. Furthermore, the spearmans's correlation (which measures the monotonic relationship between two variables. A higher absolute value indicates a stronger monotonic relationship) shows GloVe (Gensim) has the highest absolute Spearman's correlation coefficient (0.60), indicating a relatively stronger monotonic relationship between its predictions and the true values.
