# Word2Vec (Skipgram )

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
np.__version__, torch.__version__

('1.26.4', '2.5.1')

In [3]:
import matplotlib
matplotlib.__version__

'3.9.2'

## 1. Load data

In [4]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/tadasuttaket/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [5]:
from nltk.corpus import brown

brown.categories()

#returns a list of sentences, where each sentence is a list of words from the "news" category of the Brown Corpus
corpus = brown.sents(categories="news")
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [6]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [7]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['dog']

11352

In [8]:
last_word = len(vocabs)

In [9]:
vocabs.append('<UNK>')
word2index['<UNK>'] = last_word

In [10]:
index2word = {v:k for k, v in word2index.items()}
index2word[769]

'electric'

## 2. Prepare train data

In [11]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []

    window_size = 4

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc)-window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 4 words
            # Collect outside words from the window (i-2 to i+2), excluding the center word
            outside = [word2index[doc[j]] for j in range(i - window_size, i + window_size + 1) if j != i]
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2;   center, outside3;   center, outside4
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)

In [12]:
x.shape  #batch_size, 1

(2, 1)

In [13]:
x

array([[ 8797],
       [10999]])

In [14]:
y.shape  #batch_size 1

(2, 1)

In [15]:
y

array([[3288],
       [6362]])

## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [16]:
vocabs_size = len(vocabs)
vocabs_size

14395

In [17]:
embedding = nn.Embedding(vocabs_size, 2)

In [18]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape  #(batch_size, 1, emb_size)

torch.Size([2, 1, 2])

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

In [19]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

        self.word2index = word2index
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
    
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.embedding_center(word)
        embed_o = self.embedding_outside(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()
        

In [20]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 14392, 14393, 14394],
        [    0,     1,     2,  ..., 14392, 14393, 14394]])

In [21]:
model = Skipgram(voc_size, 2, word2index)
model

Skipgram(
  (embedding_center): Embedding(14395, 2)
  (embedding_outside): Embedding(14395, 2)
)

In [22]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [23]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [24]:
loss

tensor(9.8228, grad_fn=<NegBackward0>)

## 4. Training

In [25]:
batch_size = 2
emb_size   = 2
model      = Skipgram(voc_size, emb_size, word2index)
optimizer  = optim.Adam(model.parameters(), lr=0.001)
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)

In [26]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [27]:
import time

num_epochs = 1000

start = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()

    end = time.time()

    epoch_mins, epoch_secs = epoch_time(start, end)
    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch    100 | Loss: 9.996923 | time: 0m 38s
Epoch    200 | Loss: 10.967303 | time: 1m 17s
Epoch    300 | Loss: 10.310737 | time: 1m 56s
Epoch    400 | Loss: 10.675185 | time: 2m 34s
Epoch    500 | Loss: 11.706662 | time: 3m 12s
Epoch    600 | Loss: 9.008938 | time: 3m 50s
Epoch    700 | Loss: 10.603891 | time: 4m 28s
Epoch    800 | Loss: 9.754954 | time: 5m 6s
Epoch    900 | Loss: 8.667796 | time: 5m 43s
Epoch   1000 | Loss: 10.018503 | time: 6m 21s


In [28]:
def open_file(path_to_file):
    # Open the file in read mode
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

## 5. Testing

In [29]:
file_path_semantic = "../Test/word-test.v1-capital.txt"
file_path_syntactic = "../Test/word-test.v1-past-tense.txt"

content_semantic = open_file(file_path_semantic)
content_syntactic = open_file(file_path_syntactic)

semantic = []
syntactic = []

for sent in content_semantic:
    semantic.append(sent.strip())

for sent in content_syntactic:
    syntactic.append(sent.strip())

In [30]:
vector_space = []

for word in vocabs:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [31]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [32]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

## Semantic accuracy

In [41]:
len(semantic)

506

In [33]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [42]:
print(sem_correct)
print(sem_total)

0
506


In [34]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


## Syntactixc

In [35]:
syn_total = len(syntactic)
syn_correct = 0
for sent in syntactic:
    if similarity(model, sent):
        syn_correct += 1

In [36]:
syn_accuracy = syn_correct / syn_total
print(f"Syntactic accuracy: {syn_accuracy:2.2f}")

Syntactic accuracy: 0.00


## Similarity Accuracy

In [37]:
file_path = "../Test/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [38]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [39]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [45]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between models’ dot product and the provided similarity metrics is {corr:2.2f}.")

Correlation between models’ dot product and the provided similarity metrics is 0.10.
