# A1 - That's What I LIKE

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import nltk
nltk.download('brown')


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/santhosh/Documents/DSAI/Semester 2/NLP/A1 Assignment/venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/santhosh/Documents/DSAI/Semester 2/NLP/A1 Assignment/venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/santhosh/Documents/DSAI/Semester 2/NLP/A1 Assignm

True

In [2]:
np.__version__, torch.__version__

('2.4.1', '2.2.2')

In [3]:
import matplotlib
matplotlib.__version__

'3.10.8'

## 1. Loading data

In [4]:
from nltk.corpus import brown

# Create a corpus containing only documents from the 'earn' category
corpus = brown.sents()

# Limit the corpus to the first 1000 sentences for demonstration purposes
corpus = [[word.lower() for word in sentence] for sentence in corpus]
corpus = corpus[:1000]

### Dataset Source and Attribution

The dataset used in this assignment is the **Brown Corpus**, accessed through the Natural Language Toolkit (NLTK) library.  
The Brown Corpus is a publicly available, balanced corpus of American English texts compiled at Brown University in the 1960s. It contains approximately one million words collected from a wide range of text categories, including news, editorial writing, fiction, and academic prose.

The corpus was accessed using the NLTK Python library and is distributed for research and educational purposes.

**Source:**
- Francis, W. N., & Kuƒçera, H. (1964). *Brown Corpus*. Brown University.
- NLTK Project: https://www.nltk.org/

The dataset was used solely for academic purposes in accordance with the assignment requirements.

In [5]:
corpus

[['the',
  'fulton',
  'county',
  'grand',
  'jury',
  'said',
  'friday',
  'an',
  'investigation',
  'of',
  "atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.'],
 ['the',
  'jury',
  'further',
  'said',
  'in',
  'term-end',
  'presentments',
  'that',
  'the',
  'city',
  'executive',
  'committee',
  ',',
  'which',
  'had',
  'over-all',
  'charge',
  'of',
  'the',
  'election',
  ',',
  '``',
  'deserves',
  'the',
  'praise',
  'and',
  'thanks',
  'of',
  'the',
  'city',
  'of',
  'atlanta',
  "''",
  'for',
  'the',
  'manner',
  'in',
  'which',
  'the',
  'election',
  'was',
  'conducted',
  '.'],
 ['the',
  'september-october',
  'term',
  'jury',
  'had',
  'been',
  'charged',
  'by',
  'fulton',
  'superior',
  'court',
  'judge',
  'durwood',
  'pye',
  'to',
  'investigate',
  'reports',
  'of',
  'possible',
  '``',
  'irregularities',
  "''",
 

## 2. Numeralization

In [None]:
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [7]:
len(vocabs)

4272

In [8]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
print(word2index)



In [None]:
vocabs.append('<UNK>')
word2index['<UNK>'] = 0

In [10]:
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'required'

## 3. Build Co-occurence Matrix X

In [12]:
from collections import Counter
# index the corpus
X_i = Counter(flatten(corpus))

In [13]:
skip_grams = []
# Prepare the skipgram

for doc in corpus:
    # The skipgram has a window size of 2
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-1], doc[i+1],doc[i+2],doc[i-2]]
        for each_out in outside:
            skip_grams.append((center, each_out))

In [14]:
X_ik_skipgrams = Counter(skip_grams)

In [15]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [16]:
from itertools import combinations_with_replacement

X_ik = {}  # for keeping the co-occurrences
weighting_dic = {}  # scaling the percentage of sampling

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram) is not None:  # matches
        co_occer = X_ik_skipgrams[bigram]  # get the count from what we already counted
        X_ik[bigram] = co_occer + 1  # + 1 for stability issue
        X_ik[(bigram[1], bigram[0])] = co_occer + 1  # count also for the opposite
        # print(X_ik[(bigram[1], bigram[0])])  # count also for the opposite
    else:
        pass

    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 4. Preparing the train data

In [17]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 4 words
            outside = (word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]], word2index[doc[i-2]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)


In [18]:
import math

def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [19]:
x.shape  #batch_size, 1

(2, 1)

In [20]:
x , y.shape

(array([[4195],
        [2313]]),
 (2, 1))

## 5. Model

In [21]:
len(vocabs)

4273

In [22]:
embedding = nn.Embedding(63314, 2)

In [23]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape  #(batch_size, 1, emb_size)

torch.Size([2, 1, 2])

### 5.1 Skipgram 
$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [24]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
        

### 5.2 Skipgram with negative sampling

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [25]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_center = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_outside = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_center(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_outside(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_outside(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)
                
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### 5.3 GloVe

In [26]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

### 5.4 Gensim Model

In [27]:
import os
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# setting the dataset
glove_file = datapath(os.path.abspath('glove.6B.100d.txt'))
model_gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

## 6. Training

In [28]:
#prepare all vocab of batch - 2 , vocab - 2 and embed - 2

batch_size = 2
voc_size   = len(vocabs)
emb_size = 2

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 4270, 4271,    0],
        [   0,    1,    2,  ..., 4270, 4271,    0]])

### Preparing the Models

In [29]:
model_skipgram_positive = Skipgram(voc_size, emb_size)
model_skipgram_positive

Skipgram(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
)

In [30]:
model_skipgram_negative = SkipgramNegSampling(voc_size, emb_size)
model_skipgram_negative

SkipgramNegSampling(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
  (logsigmoid): LogSigmoid()
)

In [31]:
model_glove = Glove(voc_size, emb_size)
model_glove

Glove(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
  (center_bias): Embedding(4273, 1)
  (outside_bias): Embedding(4273, 1)
)

In [32]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [None]:
loss_skipgram_positive = model_skipgram_positive(input_tensor, label_tensor, all_vocabs)
loss_skipgram_negative = model_skipgram_negative(input_tensor, label_tensor, all_vocabs)

In [34]:
batch_size = 2
emb_size   = 2
model_skipgram_positive      = Skipgram(voc_size, emb_size)
optimizer_skipgram_positive  = optim.Adam(model_skipgram_positive.parameters(), lr=0.001)
optimizer_skipgram_negative  = optim.Adam(model_skipgram_negative.parameters(), lr=0.001)

criterion = nn.CrossEntropyLoss()
optimizer_glove = optim.Adam(model_glove.parameters(), lr=0.001)

### Training the Models

In [35]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [108]:
import time

# Define the number of training epochs for the Skip-gram model
num_epochs = 10

# Record the start time of the full training process
total_start = time.time()

for epoch in range(num_epochs):
    # Track the start time of the current epoch
    start = time.time()
    
    # Sample a random batch of (center, context) word pairs from the corpus
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    # Convert batch data into tensors required by the PyTorch model
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    # Forward pass: compute the Skip-gram loss for positive word-context pairs
    loss_skipgram_positive = model_skipgram_positive(
        input_tensor, label_tensor, all_vocabs
    )
    
    # Clear previously accumulated gradients before backpropagation
    optimizer_skipgram_positive.zero_grad()
    
    # Perform backpropagation to compute gradients of the loss
    loss_skipgram_positive.backward()

    # Update model parameters using the optimizer
    optimizer_skipgram_positive.step()
    
    # Track the end time of the current epoch
    end = time.time()
    
    # Convert elapsed time into minutes and seconds for readability
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    # Display training progress and loss value for the current epoch
    print("Skipgram")
    print(
        f"Epoch {epoch+1:6.0f} | "
        f"Loss: {loss_skipgram_positive:2.6f} | "
        f"time: {epoch_mins}m {epoch_secs}s"
    )

# Record the end time of the full training process
total_end = time.time()

# Compute and report total training duration across all epochs
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")


Skipgram
Epoch      1 | Loss: 8.007629 | time: 0m 0s
Skipgram
Epoch      2 | Loss: 10.524201 | time: 0m 0s
Skipgram
Epoch      3 | Loss: 8.741486 | time: 0m 0s
Skipgram
Epoch      4 | Loss: 9.961153 | time: 0m 0s
Skipgram
Epoch      5 | Loss: 11.585381 | time: 0m 0s
Skipgram
Epoch      6 | Loss: 12.328962 | time: 0m 0s
Skipgram
Epoch      7 | Loss: 9.563849 | time: 0m 0s
Skipgram
Epoch      8 | Loss: 9.461616 | time: 0m 0s
Skipgram
Epoch      9 | Loss: 10.455551 | time: 0m 0s
Skipgram
Epoch     10 | Loss: 8.846746 | time: 0m 0s
Total runtime: 1.90 seconds


In [None]:
# Number of training epochs for Skip-gram with negative sampling
num_epochs = 10

# Record the start time of the full training process
total_start = time.time()

for epoch in range(num_epochs):
    # Track the start time of the current training epoch
    start = time.time()
    
    # Randomly sample a batch of center words and corresponding context labels
    # from the training corpus
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    # Convert batch data into PyTorch tensors for model input
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    # Forward pass: compute loss using Skip-gram with negative sampling,
    # which contrasts true context words against randomly sampled negatives
    loss_skipgram_negative = model_skipgram_negative(
        input_tensor, label_tensor, all_vocabs
    )
    
    # Reset accumulated gradients before performing backpropagation
    optimizer_skipgram_negative.zero_grad()
    
    # Backpropagate the negative sampling loss to compute parameter gradients
    loss_skipgram_negative.backward()
    
    # Update model parameters using the optimizer step
    optimizer_skipgram_negative.step()

    # Track the end time of the current epoch
    end = time.time()
    
    # Convert elapsed epoch time into minutes and seconds
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    # Display training progress and loss for the current epoch
    print("Negative Skipgram")
    print(
        f"Epoch {epoch+1:6.0f} | "
        f"Loss: {loss_skipgram_negative:2.6f} | "
        f"time: {epoch_mins}m {epoch_secs}s"
    )

# Record the end time of the full training loop
total_end = time.time()

# Compute and display the total training runtime
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")


Negative Skigram
Epoch      1 | Loss: 3498.921875 | time: 0m 0s
Negative Skigram
Epoch      2 | Loss: 3445.106934 | time: 0m 0s
Negative Skigram
Epoch      3 | Loss: 3359.545410 | time: 0m 0s
Negative Skigram
Epoch      4 | Loss: 3343.957520 | time: 0m 0s
Negative Skigram
Epoch      5 | Loss: 3275.885742 | time: 0m 0s
Negative Skigram
Epoch      6 | Loss: 3123.142334 | time: 0m 0s
Negative Skigram
Epoch      7 | Loss: 3869.977051 | time: 0m 0s
Negative Skigram
Epoch      8 | Loss: 3641.400391 | time: 0m 0s
Negative Skigram
Epoch      9 | Loss: 3751.013184 | time: 0m 0s
Negative Skigram
Epoch     10 | Loss: 4184.029297 | time: 0m 0s
Total runtime: 2.07 seconds


In [None]:
# Number of training epochs for the GloVe model
num_epochs = 10

for epoch in range(num_epochs):
    # Track the start time of the current training epoch
    start = time.time()
    
    # Sample a random batch of word pairs along with their
    # co-occurrence counts and corresponding weighting factors
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(
        batch_size, corpus, skip_grams, X_ik, weighting_dic
    )
    
    # Convert batch data into tensors required for GloVe optimization
    input_batch      = torch.LongTensor(input_batch)      # center word indices
    target_batch     = torch.LongTensor(target_batch)     # context word indices
    cooc_batch       = torch.FloatTensor(cooc_batch)      # co-occurrence values
    weighting_batch  = torch.FloatTensor(weighting_batch) # weighting function outputs
    
    # Forward pass: compute GloVe loss based on weighted
    # differences between predicted and observed co-occurrence statistics
    loss_glove = model_glove(
        input_batch, target_batch, cooc_batch, weighting_batch
    )
    
    # Clear previously accumulated gradients before backpropagation
    optimizer_glove.zero_grad()
    
    # Backpropagate the GloVe loss to compute parameter gradients
    loss_glove.backward()
    
    # Upda


Glove
Epoch      1 | Loss: 5.812465 | time: 0m 0s
Glove
Epoch      2 | Loss: 5.048025 | time: 0m 0s
Glove
Epoch      3 | Loss: 0.075407 | time: 0m 0s
Glove
Epoch      4 | Loss: 0.176882 | time: 0m 0s
Glove
Epoch      5 | Loss: 0.172586 | time: 0m 0s
Glove
Epoch      6 | Loss: 3.974971 | time: 0m 0s
Glove
Epoch      7 | Loss: 0.127939 | time: 0m 0s
Glove
Epoch      8 | Loss: 0.187980 | time: 0m 0s
Glove
Epoch      9 | Loss: 0.584591 | time: 0m 0s
Glove
Epoch     10 | Loss: 0.031533 | time: 0m 0s
Total runtime: 6.59 seconds


## 7. Embeddings

In [39]:
def get_embed(model, word):
    try:
        # Find the index
        index = word2index[word]
    except:
        # if not found give the index of unknown token
        index = word2index['<UNK>']
    
    # get the word in terms of tensor
    word = torch.LongTensor([word2index[word]])
     # embed the center and the outside word and then find the final embed
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    
    return embed[0][0].item(), embed[0][1].item()

In [40]:
import torch
import numpy as np

def get_embed_for_corpus(model, words):
    embeddings = {}

    for word in words:
        try:
            index = word2index[word]
        except KeyError:
            index = word2index['<UNK>']

        word_tensor = torch.LongTensor([index])

        embed_c = model.embedding_center(word_tensor)
        embed_o = model.embedding_outside(word_tensor)
        embed = (embed_c + embed_o) / 2

        # return as dictionary with key as the word and value as the array of its embedding
        embeddings[word] = np.array([embed[0][0].item(), embed[0][1].item()])

    return embeddings


## 8. Cosine Similarity

In [41]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [None]:
def cosine_similarity_for_corpus(embeddings, target_word):
    # Store similarity scores between the target word and all words in the vocabulary
    similarities = []

    # Retrieve the index of the target word; fall back to the '<UNK>' token
    # if the word is not present in the vocabulary
    target_index = word2index.get(target_word, word2index['<UNK>'])
    
    # Extract the embedding vector corresponding to the target word
    target_vector = embeddings[target_index]

    # Compute cosine similarity between the target word vector
    # and every other word embedding in the corpus
    for word, vector in embeddings.items():
        similarity = cosine_similarity(target_vector, vector)
        similarities.append((word, similarity))

    # Return a list of words paired with their similarity scores
    return similarities


## 9. Predicting using different models from Word anologies dataset

In [None]:
# Path to the Word2Vec analogy evaluation dataset
file_path = 'word-test.v1.txt'

# Open the analogy dataset file and read its contents
with open(file_path, 'r') as file:
    # Skip the header line containing dataset metadata
    file.readline()

    # Read all remaining lines containing analogy groups and word pairs
    file_content = file.readlines()

# List to store parsed word analogy entries
total_corpus = []

# Variable used to track the current analogy category (e.g., semantic or syntactic)
current_heading = None

# Process each line in the dataset
for line in file_content:
    # Lines starting with ':' indicate a new analogy category
    if line.startswith(':'):
        current_heading = line.strip()
    else:
        # Split each analogy into individual words and normalize to lowercase
        # to ensure consistency with the embedding vocabulary
        words = [word.lower() for word in line.strip().split()]
        total_corpus.append(words)


In [None]:
# Containers for storing specific analogy categories required for evaluation
capital_common_countries = []
past_tense = []

# Variable used to track the currently active analogy category
current_heading = None

# Iterate through each line of the analogy dataset
for line in file_content:
    # Lines beginning with ':' indicate a change in analogy category
    if line.startswith(':'):
        current_heading = line.strip()

    # Extract semantic analogies related to capital‚Äìcountry relationships
    elif current_heading == ': capital-common-countries':
        words = [word.lower() for word in line.strip().split()]
        capital_common_countries.append(words)

    # Extract syntactic analogies related to verb tense transformations
    elif current_heading == ': gram7-past-tense':
        words = [word.lower() for word in line.strip().split()]
        past_tense.append(words)


In [None]:
# Flatten semantic analogy pairs (capital‚Äìcountry) into a single list of words
flattened_list_of_country = [
    word for pair in capital_common_countries for word in pair
]

# Wrap the flattened list to maintain consistent list-of-lists structure
resulting_capital_list = [flattened_list_of_country]


# Flatten syntactic analogy pairs (past tense) into a single list of words
flattened_list_of_past_tense = [
    word for pair in past_tense for word in pair
]

# Wrap the flattened list to preserve structural consistency
resulting_past_tense_list = [flattened_list_of_past_tense]


# Flatten all words appearing in the analogy dataset for vocabulary coverage analysis
flattened_list_total_words = [
    word for pair in total_corpus for word in pair
]

# Wrap the flattened corpus words into a list-of-lists format
resulting_total_corpus = [flattened_list_total_words]


# Helper function to flatten nested lists
flatten = lambda l: [item for sublist in l for item in sublist]

# Extract unique words used in semantic analogy evaluation
capital_list = list(set(flatten(resulting_capital_list)))

# Extract unique words used in syntactic analogy evaluation
past_tense_list = list(set(flatten(resulting_past_tense_list)))

# Extract the full set of unique words appearing across the analogy dataset
whole_corpus = list(set(flatten(resulting_total_corpus)))


In [None]:
# Extract embeddings for words used in semantic analogy evaluation (capital‚Äìcountry)
# from each trained model for fair comparison
embed_capital_glove = get_embed_for_corpus(model_glove, capital_list)
embed_capital_skipgram_positive = get_embed_for_corpus(
    model_skipgram_positive, capital_list
)
embed_capital_skipgram_negative = get_embed_for_corpus(
    model_skipgram_negative, capital_list
)

# Extract embeddings for words used in syntactic analogy evaluation (past tense)
# to assess how well each model captures grammatical relationships
embed_past_tense_glove = get_embed_for_corpus(model_glove, past_tense_list)
embed_past_tense_skipgram_positive = get_embed_for_corpus(
    model_skipgram_positive, past_tense_list
)
embed_past_tense_skipgram_negative = get_embed_for_corpus(
    model_skipgram_negative, past_tense_list
)

# Extract embeddings for all words appearing in the analogy dataset
# to support full similarity and correlation-based evaluation
embed_total_glove = get_embed_for_corpus(model_glove, whole_corpus)
embed_whole_skipgram_positive = get_embed_for_corpus(
    model_skipgram_positive, whole_corpus
)
embed_whole_skipgram_negative = get_embed_for_corpus(
    model_skipgram_negative, whole_corpus
)


In [None]:
# Predicted embedding vectors for semantic analogies (capital‚Äìcountry)
# using GloVe embeddings
y_pred_glove_country = []

for i in capital_common_countries:
    # Apply vector arithmetic for analogy reasoning:
    # capital2 ‚àí capital1 + country1 ‚âà country2
    y = (
        embed_capital_glove[i[1]]
        - embed_capital_glove[i[0]]
        + embed_capital_glove[i[2]]
    )
    y_pred_glove_country.append(y)


# Predicted embedding vectors for syntactic analogies (past tense)
# using GloVe embeddings
y_pred_glove_past = []

for i in past_tense:
    # Apply vector arithmetic for verb tense transformation:
    # past_form ‚àí present_form + new_verb ‚âà past_form_of_new_verb
    y = (
        embed_past_tense_glove[i[1]]
        - embed_past_tense_glove[i[0]]
        + embed_past_tense_glove[i[2]]
    )
    y_pred_glove_past.append(y)


In [None]:
# Predicted embedding vectors for semantic analogies (capital‚Äìcountry)
# using Skip-gram with negative sampling embeddings
y_pred_neg_samp_country = []

for i in capital_common_countries:
    # Perform analogy vector arithmetic using negative sampling embeddings:
    # capital2 ‚àí capital1 + country1 ‚âà country2
    y = (
        embed_capital_skipgram_negative[i[1]]
        - embed_capital_skipgram_negative[i[0]]
        + embed_capital_skipgram_negative[i[2]]
    )
    y_pred_neg_samp_country.append(y)


# Predicted embedding vectors for syntactic analogies (past tense)
# using Skip-gram with negative sampling embeddings
y_pred_neg_samp_past = []

for i in past_tense:
    # Apply vector arithmetic to model grammatical tense transformation
    # using embeddings learned via negative sampling
    y = (
        embed_past_tense_skipgram_negative[i[0]]
        - embed_past_tense_skipgram_negative[i[0]]
        + embed_past_tense_skipgram_negative[i[2]]
    )
    y_pred_neg_samp_past.append(y)


In [None]:
# Predicted embedding vectors for semantic analogies (capital‚Äìcountry)
# using the standard Skip-gram model trained with full softmax
y_pred_positive_samp_country = []

for i in capital_common_countries:
    # Apply analogy vector arithmetic:
    # capital2 ‚àí capital1 + country1 ‚âà country2
    y = (
        embed_capital_skipgram_positive[i[1]]
        - embed_capital_skipgram_positive[i[0]]
        + embed_capital_skipgram_positive[i[2]]
    )
    y_pred_positive_samp_country.append(y)


# Predicted embedding vectors for syntactic analogies (past tense)
# using the standard Skip-gram model
y_pred_positive_past_tense = []

for i in past_tense:
    # Apply vector arithmetic to capture verb tense transformations
    # learned by the Skip-gram model
    y = (
        embed_past_tense_skipgram_positive[i[1]]
        - embed_past_tense_skipgram_positive[i[0]]
        + embed_past_tense_skipgram_positive[i[2]]
    )
    y_pred_positive_past_tense.append(y)


In [50]:
# find the cosine similarity
# more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [None]:
def find_max_cosine_words(y_pred, embeddings):
    """
    For each predicted analogy vector, identify the word whose embedding
    has the highest cosine similarity to that vector.
    """
    # List to store the most similar word for each predicted vector
    max_cosine_words = []

    # Iterate over each predicted embedding vector
    for j in range(len(y_pred)):
        # Initialize maximum similarity score for the current prediction
        max_cosine = -1
        max_cosine_word = ""

        # Compare the predicted vector against all word embeddings
        for i in embeddings.keys():
            cosine_temp = cosine_similarity(y_pred[j], embeddings[i])

            # Update the best match if a higher similarity is found
            if cosine_temp > max_cosine:
                max_cosine_word = i
                max_cosine = cosine_temp

        # Store the word with the highest cosine similarity
        max_cosine_words.append(max_cosine_word)

    return max_cosine_words


In [None]:
# Retrieve predicted words for semantic analogies (capital‚Äìcountry)
# using Skip-gram with negative sampling
cosine_neg_samp_syntatical = find_max_cosine_words(
    y_pred_neg_samp_country, embed_capital_skipgram_negative
)

# Retrieve predicted words for semantic analogies
# using standard Skip-gram embeddings
cosine_positive_samp_syntatical = find_max_cosine_words(
    y_pred_positive_samp_country, embed_capital_skipgram_positive
)

# Retrieve predicted words for semantic analogies
# using GloVe embeddings
cosine_glove_syntatical = find_max_cosine_words(
    y_pred_glove_country, embed_capital_glove
)


In [None]:
from heapq import nlargest

def find_next_10_cosine_words_for_word(target_word, embeddings, top_n=10):
    """
    Given a user-specified query word, retrieve the top-N most similar words
    based on cosine similarity in the embedding space.
    """
    # Handle the case where the query word does not exist in the vocabulary
    if target_word not in embeddings:
        return ["Word not in Corpus"]

    # Retrieve the embedding vector corresponding to the query word
    target_vector = embeddings[target_word]

    # Compute cosine similarity between the query vector and all vocabulary embeddings
    cosine_similarities = [
        (word, cosine_similarity(target_vector, embeddings[word]))
        for word in embeddings.keys()
    ]

    # Select the top (N + 1) most similar words to account for the query word itself
    top_n_words = nlargest(top_n + 1, cosine_similarities, key=lambda x: x[1])

    # Remove the query word from the results so it is not returned as its own neighbor
    top_n_words = [word for word, _ in top_n_words if word != target_word]

    # Return only the top-N nearest neighbors
    return top_n_words[:10]


Next 10 similar words for user-provided word 'greece': ['ran', 'decrease', 'complete', 'michigan', 'possible', 'wife', 'quiet', 'saying', 'impossible', 'irish']


In [None]:
# Example query word provided by the user
user_target_word = 'greece'

# Retrieve the top 10 most similar words using Skip-gram (negative sampling) embeddings
next_10_cosine_for_user_word = find_next_10_cosine_words_for_word(
    user_target_word, embed_whole_skipgram_negative, top_n=10
)

# Display results or notify the user if the word is not in the vocabulary
if next_10_cosine_for_user_word == ["Word not in Corpus"]:
    print("Word not in Corpus")
else:
    print(
        f"Next 10 similar words for user-provided word "
        f"'{user_target_word}': {next_10_cosine_for_user_word}"
    )

## 10. Accuracy

### 10.1 Semantic Accuracy

In [None]:
def calculate_accuracy(predictions, true_words):
    """
    Compute accuracy by measuring how many predicted words
    match the expected target words.
    """
    # Total number of analogy predictions evaluated
    total_trials = len(predictions)

    # Count how many predicted words appear in the set of true target words
    total_correct = sum(
        1 for pred_word in predictions if pred_word in true_words
    )

    # Calculate accuracy as a percentage
    accuracy = (total_correct / total_trials) * 100

    return accuracy


In [None]:
# Compute semantic analogy accuracy for Skip-gram with negative sampling
semantic_accuracy_neg_samp = calculate_accuracy(
    find_max_cosine_words(
        y_pred_neg_samp_country, embed_whole_skipgram_negative
    ),
    [true_word[3] for true_word in capital_common_countries]
)

# Compute semantic analogy accuracy for standard Skip-gram
semantic_accuracy_pos_samp = calculate_accuracy(
    find_max_cosine_words(
        y_pred_positive_samp_country, embed_whole_skipgram_positive
    ),
    [true_word[3] for true_word in capital_common_countries]
)

# Compute semantic analogy accuracy for GloVe
semantic_accuracy_glove = calculate_accuracy(
    find_max_cosine_words(
        y_pred_glove_country, embed_total_glove
    ),
    [true_word[3] for true_word in capital_common_countries]
)

# Display semantic analogy accuracy for each model
print(
    "Semantic Accuracy of Skipgram Negative: {:.10f}%"
    .format(semantic_accuracy_neg_samp)
)
print(
    "Semantic Accuracy of Skipgram Positive: {:.10f}%"
    .format(semantic_accuracy_pos_samp)
)
print(
    "Semantic Accuracy of Glove: {:.10f}%"
    .format(semantic_accuracy_glove)
)


Semantic Accuracy of Skipgram Negative: 14.4268774704%
Semantic Accuracy of Skipgram Positive: 14.4268774704%
Semantic Accuracy of Glove: 14.2292490119%


In [54]:
input_file_path = 'word-test.v1.txt'
output_file_path = 'word-test-without-first-line.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Write all lines except the first line to the output file
    output_file.writelines(lines[1:])

print(f"First line removed and content saved to: {output_file_path}")


First line removed and content saved to: word-test-without-first-line.txt


In [55]:
input_file_path = 'word-test.v1.txt'
output_file_path = 'capital.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Flag to indicate whether to start writing lines
    start_writing = False

    # Iterate through lines
    for line in lines:
        # Check if the line starts with ': gram7-past-tense'
        if line.startswith(': capital-common-countries'):
            # Set the flag to start writing
            start_writing = True
        elif line.startswith(':'):
            # If a new section header is encountered, stop writing
            start_writing = False

        # Write lines to the output file if the flag is True
        if start_writing:
            output_file.write(line)

print(f"Lines starting with ': capital-countries' saved to: {output_file_path}")


Lines starting with ': capital-countries' saved to: capital.txt


In [60]:
analogy_score_sem = model_gensim.evaluate_word_analogies(datapath(os.path.abspath('capital.txt')))
print("Semtatical Accuracy of Model Gensim:", analogy_score_sem[0])

Semtatical Accuracy of Model Gensim: 0.9387351778656127


### 10. Syntatical Accuracy

In [None]:
def calculate_accuracy(predictions, true_words):
    """
    Measure syntactic analogy accuracy by comparing predicted words
    against the expected grammatical targets.
    """
    # Total number of syntactic analogy predictions evaluated
    total_trials = len(predictions)

    # Count predictions that correctly match the true past-tense forms
    total_correct = sum(
        1 for pred_word in predictions if pred_word in true_words
    )

    # Compute accuracy as a percentage of correct predictions
    accuracy = (total_correct / total_trials) * 100

    return accuracy


In [None]:
# Compute syntactic analogy accuracy for Skip-gram with negative sampling
syntatical_accuracy_neg_samp = calculate_accuracy(
    find_max_cosine_words(
        y_pred_neg_samp_past, embed_whole_skipgram_negative
    ),
    [true_word[3] for true_word in past_tense]
)

# Compute syntactic analogy accuracy for standard Skip-gram
syntatical_accuracy_pos_samp = calculate_accuracy(
    find_max_cosine_words(
        y_pred_positive_past_tense, embed_whole_skipgram_positive
    ),
    [true_word[3] for true_word in past_tense]
)

# Compute syntactic analogy accuracy for GloVe
syntatical_accuracy_glove = calculate_accuracy(
    find_max_cosine_words(
        y_pred_glove_past, embed_total_glove
    ),
    [true_word[3] for true_word in past_tense]
)

# Display syntactic analogy accuracy results for each model
print(
    "Syntactic Accuracy of Skipgram Negative: {:.2f}%"
    .format(syntatical_accuracy_neg_samp)
)
print(
    "Syntactic Accuracy of Skipgram Positive: {:.2f}%"
    .format(syntatical_accuracy_pos_samp)
)
print(
    "Syntactic Accuracy of Glove: {:.2f}%"
    .format(syntatical_accuracy_glove)
)


Syntatical Accuracy of Skipgram Negative: 0.00%
Syntatical Accuracy of Skipgram Positive: 12.95%
Syntatical Accuracy of Glove: 10.90%


In [62]:
input_file_path = 'word-test.v1.txt'
output_file_path = 'past_tense_lines.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Flag to indicate whether to start writing lines
    start_writing = False

    # Iterate through lines
    for line in lines:
        # Check if the line starts with ': gram7-past-tense'
        if line.startswith(': gram7-past-tense'):
            # Set the flag to start writing
            start_writing = True
        elif line.startswith(':'):
            # If a new section header is encountered, stop writing
            start_writing = False

        # Write lines to the output file if the flag is True
        if start_writing:
            output_file.write(line)

print(f"Lines starting with ': gram7-past-tense' saved to: {output_file_path}")


Lines starting with ': gram7-past-tense' saved to: past_tense_lines.txt


In [64]:
analogy_score_syn = model_gensim.evaluate_word_analogies(datapath(os.path.abspath('past_tense_lines.txt')))
print("Syntatical Accuracy of Model Gensim:", analogy_score_syn[0])

Syntatical Accuracy of Model Gensim: 0.5544871794871795


| Model               | Window Size | Training Loss (final) | Training Time (s) | Syntactic Accuracy | Semantic Accuracy |
|---------------------|------------:|----------------------:|------------------:|-------------------:|------------------:|
| Skipgram            | 2           | 8.68                  | 1.46              | 12.95%             | 14.43%            |
| Skipgram (NEG)      | 2           | 3634.09               | 1.34              | 0.00%              | 14.43%            |
| GloVe               | 2           | 0.03                  | 6.59              | 10.90%             | 14.23%            |
| GloVe (Gensim)      | -           | -                     | -                 | 55.45%             | 93.87%            |


## 11. Spearman Metric

In [80]:
import pandas as pd

file_path = 'wordsim_similarity_goldstandard.txt'

# Define the column names
columns = ['word_1', 'word_2', 'similarity_index']

# Read the text file into a pandas DataFrame with specified column names
df = pd.read_csv(file_path, sep=',', header=None, names=columns)

df


Unnamed: 0,word_1,word_2,similarity_index
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


In [66]:
get_embed(model_skipgram_negative,'<UNK>')

(-0.1291169822216034, -0.04528377950191498)

In [None]:
# Iterate over each word pair in the similarity dataset
for index, row in df.iterrows():
    word_1 = row['word_1']
    word_2 = row['word_2']

    try:
        # Retrieve embeddings for both words from each trained model
        # to compute similarity scores in the corresponding embedding spaces
        embed_1_neg_samp = get_embed(model_skipgram_negative, word_1)
        embed_2_neg_samp = get_embed(model_skipgram_negative, word_2)

        embed_1_pos_samp = get_embed(model_skipgram_positive, word_1)
        embed_2_pos_samp = get_embed(model_skipgram_positive, word_2)

        embed_1_glove = get_embed(model_glove, word_1)
        embed_2_glove = get_embed(model_glove, word_2)

    except KeyError:
        # Handle out-of-vocabulary words by substituting
        # the embedding of a generic '<UNK>' token
        embed_1_neg_samp = get_embed(model_skipgram_negative, '<UNK>')
        embed_2_neg_samp = get_embed(model_skipgram_negative, '<UNK>')

        embed_1_pos_samp = get_embed(model_skipgram_positive, '<UNK>')
        embed_2_pos_samp = get_embed(model_skipgram_positive, '<UNK>')

        embed_1_glove = get_embed(model_glove, '<UNK>')
        embed_2_glove = get_embed(model_glove, '<UNK>')

    # Compute dot product similarity scores for each model
    # and store them in the DataFrame for later correlation analysis
    df.at[index, 'dot_product_neg_samp'] = np.dot(
        embed_1_neg_samp, embed_2_neg_samp
    )
    df.at[index, 'dot_product_pos_samp'] = np.dot(
        embed_1_pos_samp, embed_2_pos_samp
    )
    df.at[index, 'dot_product_glove'] = np.dot(
        embed_1_glove, embed_2_glove
    )

# Preview the first few rows of the updated DataFrame
df[:10]


Unnamed: 0,word_1,word_2,similarity_index,gensim_dot_product,dot_product_neg_samp,dot_product_pos_samp,dot_product_glove
0,love,sex,6.77,19.414139,0.018722,0.215767,0.068412
1,tiger,cat,7.35,15.629377,0.018722,0.215767,0.068412
2,tiger,tiger,10.0,32.800144,0.018722,0.215767,0.068412
3,book,paper,7.46,20.155933,0.018722,0.215767,0.068412
4,computer,keyboard,7.62,20.919716,0.018722,0.215767,0.068412
5,computer,internet,7.58,30.182514,0.018722,0.215767,0.068412
6,plane,car,5.77,24.047298,0.018722,0.215767,0.068412
7,train,car,6.31,25.472925,0.018722,0.215767,0.068412
8,telephone,communication,7.5,20.172132,0.018722,0.215767,0.068412
9,television,radio,6.77,34.689987,-0.184058,-0.277703,-0.276548


In [None]:
from scipy.stats import spearmanr

# Extract human similarity scores from the WordSim dataset
wordsim_sim = df['similarity_index'].to_numpy()

# Compute dot product similarity scores using a pre-trained GloVe model (Gensim)
# for comparison against models trained from scratch
df['gensim_dot_product'] = df.apply(
    lambda row: np.dot(
        model_gensim[row['word_1'].lower()],
        model_gensim[row['word_2'].lower()]
    ),
    axis=1
)

# Convert Gensim similarity scores to a NumPy array
gensim_sim = df['gensim_dot_product'].to_numpy()

# Compute Spearman rank correlation between human similarity judgments
# and similarity scores produced by each embedding model
correlation_neg, _ = spearmanr(
    df['similarity_index'], df['dot_product_neg_samp']
)
correlation_pos, _ = spearmanr(
    df['similarity_index'], df['dot_product_pos_samp']
)
correlation_glove, _ = spearmanr(
    df['similarity_index'], df['dot_product_glove']
)
correlation_ginsim, _ = spearmanr(
    wordsim_sim, gensim_sim
)

# Display Spearman correlation coefficients for each model
print(
    f"Spearman Correlation Coefficient of Skipgram Negative: "
    f"{correlation_neg:.4f}"
)
print(
    f"Spearman Correlation Coefficient of Skipgram Positive: "
    f"{correlation_pos:.4f}"
)
print(
    f"Spearman Correlation Coefficient of Glove: "
    f"{correlation_glove:.4f}"
)
print(
    f"Spearman Correlation Coefficient of Glove (Gensim): "
    f"{correlation_ginsim:.4f}"
)


Spearman Correlation Coefficient of Skipgram Negative: -0.0028
Spearman Correlation Coefficient of Skipgram Positive: 0.0062
Spearman Correlation Coefficient of Glove: -0.0045
Spearman Correlation Coefficient of Glove(Gensim): 0.4924


In [93]:
# Finding y_true based on the mean of similarity index in the df
y_true = df['similarity_index'].mean()

print(f"y_true: {y_true:.2f}")


y_true: 5.86


In [None]:
# Extract embeddings for the complete vocabulary using the GloVe model
# to support similarity and correlation-based evaluation
embed_whole_glove = get_embed_for_corpus(model_glove, vocabs)

# Extract embeddings for the complete vocabulary using Skip-gram
# trained with negative sampling
embed_whole_neg_skg = get_embed_for_corpus(
    model_skipgram_negative, vocabs
)

# Extract embeddings for the complete vocabulary using the standard Skip-gram model
embed_whole_pos_skg = get_embed_for_corpus(
    model_skipgram_positive, vocabs
)


| Model | Skipgram | NEG | GloVe | GloVe (gensim) | Y_true |
|------|----------:|----:|------:|--------------:|-------:|
| MSE  | 0.0062     | -0.0028 | -0.0045 | 0.4924 | 5.86 |


## 12. Inference

In [106]:
import pickle

# Save the corpus to a pickle file
with open('model/corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)

print("Corpus saved to model/corpus.pkl")

Corpus saved to model/corpus.pkl


In [96]:
import pickle

# Assuming you have a Gensim Word2Vec model named 'model'
# You can replace 'Word2Vec' with the specific Gensim model you are using

# Save the Gensim model to a file using pickle
gensim_model_path = 'model/model_gensim.pkl'

with open(gensim_model_path, 'wb') as model_file:
    pickle.dump(model_gensim, model_file)

print(f"Gensim model saved to: {gensim_model_path}")


Gensim model saved to: model/model_gensim.pkl


In [97]:
# Specify the path to your pickled Gensim model file
gensim_model_path = 'model/model_gensim.pkl'

# Load the Gensim model from the pickle file
with open(gensim_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [98]:
for i in range (1,10):
    print(loaded_model.most_similar('language')[i][0])


word
spoken
arabic
english
dialect
vocabulary
text
translation
words


In [99]:
import pickle

# Assuming embed_capital_skipgram is your embedding dictionary
embedding_dict = embed_whole_pos_skg

# Specify the file path to save the pickle file
pickle_file_path = 'model/embed_skipgram.pkl'

# Open the file in binary write mode and dump the dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary saved to: {pickle_file_path}")


Embedding dictionary saved to: model/embed_skipgram.pkl


In [100]:
import pickle

# Assuming embed_capital_skipgram_negative is your embedding dictionary
embedding_dict = embed_whole_neg_skg

# Specify the file path to save the pickle file
pickle_file_path = 'model/embed_skipgram_negative.pkl'

# Open the file in binary write mode and dump the dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary saved to: {pickle_file_path}")


Embedding dictionary saved to: model/embed_skipgram_negative.pkl


In [101]:
import pickle

# Assuming embed_capital_skipgram_negative is your embedding dictionary
embedding_dict = embed_whole_glove

# Specify the file path to save the pickle file
pickle_file_path = 'model/embed_glove.pkl'

# Open the file in binary write mode and dump the dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary saved to: {pickle_file_path}")


Embedding dictionary saved to: model/embed_glove.pkl


In [102]:
import pickle

# Specify the path to the pickled file on the server
pickle_file_path = 'model/embed_skipgram.pkl'

# Load the embedding dictionary from the pickled file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_pos = pickle.load(pickle_file)

In [103]:
import pickle

# Specify the path to the pickled file on the server
pickle_file_path = 'model/embed_skipgram_negative.pkl'

# Load the embedding dictionary from the pickled file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_neg = pickle.load(pickle_file)

In [104]:
import pickle

# Specify the path to the pickled file on the server
pickle_file_path = 'model/embed_glove.pkl'

# Load the embedding dictionary from the pickled file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_glove = pickle.load(pickle_file)


In [105]:
user_target_word = "run"
next_10_cosine_for_user_word = find_next_10_cosine_words_for_word(user_target_word, embedding_dict_glove, top_n=10)

# Print the results
if next_10_cosine_for_user_word == ["Word not in Corpus"]:
    print("Word not in Corpus")
else:
    print(f"Next 10 similar words for user-provided word '{user_target_word}': {next_10_cosine_for_user_word}")

Next 10 similar words for user-provided word 'run': ['permitting', 'decried', 'preferably', 'practices', 'lao', '$37', 'every', 'mack', 'votes', 'rather']
