# Word2Vec (Negative Sampling)

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [3]:
np.__version__, torch.__version__

('1.26.4', '2.4.1')

In [4]:
import matplotlib
matplotlib.__version__

'3.9.2'

## 1. Load data

In [5]:
import ssl
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('brown')


[nltk_data] Downloading package brown to
[nltk_data]     /Users/soehtetnaing/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [6]:
from nltk.corpus import brown

corpus = brown.sents(categories="news")
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [7]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [8]:
len(vocabs)

14394

In [9]:
vocabs.append('<UNK>')


In [10]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['Country']

9361

In [11]:
len(vocabs)

14395

In [12]:
index2word = {v:k for k, v in word2index.items()}
index2word[14394]

'<UNK>'

## 2. Prepare train data

In [13]:
window_size = 2
batch_size = 2
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, wdsize = 2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(wdsize, len(doc)-wdsize):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-wdsize]], word2index[doc[i+wdsize]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(batch_size, corpus, window_size)

In [14]:
x, y

(array([[5006],
        [5745]]),
 array([[ 729],
        [9334]]))

In [15]:
x.shape  #batch_size, 1

(2, 1)

In [16]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [17]:
z = 0.001

In [18]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

100554

In [19]:
vocabs

['340-blast',
 '$77,389,000',
 'Builders',
 'Hinsdale',
 'telephoned',
 'beaming',
 'To',
 'Picasso',
 'Hicks',
 'chandeliers',
 'professional',
 'bomb',
 'interfering',
 'fits',
 'cautioned',
 '180',
 'Legers',
 'obligation',
 'completion',
 'startlingly',
 'suburban',
 'learning',
 'Crawford',
 'reply',
 'seasons',
 'Kel',
 'five-cent',
 'Hogan',
 'makeshift',
 '$1.5',
 'hotdogs',
 'canvassed',
 'receptive',
 'arrange',
 'Bourguiba',
 'chemistry',
 'recalls',
 'sweat-suits',
 'awaited',
 'building',
 'picketing',
 '6-12',
 'Canada',
 'unless',
 'Only',
 '155-yarder',
 'designs',
 'sue',
 'Ter.',
 'legislature',
 'Princeton',
 'guardians',
 'sprained',
 'Kika',
 'Sam',
 'ways',
 'towards',
 'U-I',
 'battleground',
 'NATO',
 'rejoin',
 'Dawson',
 'Tree',
 'burr',
 'June',
 'O-B',
 'library',
 'mishap',
 'Writers',
 'suburbs',
 'Yankees',
 'Loving',
 "Gore's",
 'Cardinals',
 'make',
 '$39.5',
 'unworkable',
 'functionary',
 'Vientiane',
 'Cerebral',
 "Miller's",
 'operation',
 'First',


$$P(w)=U(w)^{3/4}/Z$$

In [20]:
len(vocabs)

14395

In [21]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 114,
         ',': 108,
         '.': 89,
         'of': 69,
         'to': 55,
         'and': 55,
         'a': 52,
         'in': 50,
         'for': 30,
         'that': 26,
         'The': 26,
         "''": 24,
         'was': 24,
         'is': 24,
         '``': 24,
         'on': 22,
         'at': 21,
         'be': 19,
         'with': 19,
         'by': 18,
         'as': 18,
         'he': 17,
         'will': 15,
         'said': 15,
         'his': 15,
         'from': 14,
         'it': 14,
         ';': 13,
         'are': 13,
         'had': 12,
         '--': 12,
         'an': 12,
         'has': 12,
         'Mrs.': 11,
         'this': 11,
         'were': 11,
         'have': 11,
         'who': 11,
         'not': 11,
         'which': 10,
         'their': 10,
         'would': 10,
         'been': 9,
         'they': 9,
         'He': 9,
         'more': 8,
         'or': 8,
         'I': 8,
         '(': 8,
         'one': 8,
         ')': 8,


## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [22]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [23]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [24]:
batch_size = 2
x, y = random_batch(batch_size, corpus, window_size)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [25]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [26]:
y_tensor[1]

tensor([7415])

In [27]:
neg_samples[1]

tensor([3086, 6914, 3086, 2306, 8440])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [28]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()

        self.word2index = word2index
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)
    
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.embedding_center(word)
        embed_o = self.embedding_outside(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()

In [29]:
#test your model
dim_size = 2
vocab_size = len(vocabs)
model = SkipgramNeg(vocab_size, dim_size, word2index)

In [30]:
loss = model(x_tensor, y_tensor, neg_samples)

In [31]:
loss

tensor(2.0599, grad_fn=<NegBackward0>)

## 5. Training

In [32]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [33]:
import time

num_epochs = 100

time_start = time.time()
for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

# Measure total training time and print results
training_time_total = time.time() - time_start

# Calculate minutes and seconds for readability
minutes, seconds = divmod(int(training_time_total), 60)

print(f"\nTraining Complete:\nTotal Loss: {loss:2.2f} | Time Taken: {minutes} minutes and {seconds} seconds")

Epoch     10 | Loss: 2.323517
Epoch     20 | Loss: 3.326450
Epoch     30 | Loss: 0.856759
Epoch     40 | Loss: 0.785152
Epoch     50 | Loss: 2.580120
Epoch     60 | Loss: 1.597835
Epoch     70 | Loss: 1.563215
Epoch     80 | Loss: 1.759705
Epoch     90 | Loss: 3.168742
Epoch    100 | Loss: 1.850110

Training Complete:
Total Loss: 1.85 | Time Taken: 0 minutes and 37 seconds


## 6. Testing

In [34]:
file_paths = {
    "semantic": "../word-testsemantic.v1.txt",
    "syntatic": "../word-testsyntatic.v1.txt"
}

def load_tests(file_path):
    with open(file_path, "r") as file:
        return [sent.strip() for sent in file]

# Load tests from files
semantic = load_tests(file_paths["semantic"])
syntatic = load_tests(file_paths["syntatic"])

In [35]:
semantic, syntatic

(['Athens Greece Baghdad Iraq',
  'Athens Greece Bangkok Thailand',
  'Athens Greece Beijing China',
  'Athens Greece Berlin Germany',
  'Athens Greece Bern Switzerland',
  'Athens Greece Cairo Egypt',
  'Athens Greece Canberra Australia',
  'Athens Greece Hanoi Vietnam',
  'Athens Greece Havana Cuba',
  'Athens Greece Helsinki Finland',
  'Athens Greece Islamabad Pakistan',
  'Athens Greece Kabul Afghanistan',
  'Athens Greece London England',
  'Athens Greece Madrid Spain',
  'Athens Greece Moscow Russia',
  'Athens Greece Oslo Norway',
  'Athens Greece Ottawa Canada',
  'Athens Greece Paris France',
  'Athens Greece Rome Italy',
  'Athens Greece Stockholm Sweden',
  'Athens Greece Tehran Iran',
  'Athens Greece Tokyo Japan',
  'Baghdad Iraq Bangkok Thailand',
  'Baghdad Iraq Beijing China',
  'Baghdad Iraq Berlin Germany',
  'Baghdad Iraq Bern Switzerland',
  'Baghdad Iraq Cairo Egypt',
  'Baghdad Iraq Canberra Australia',
  'Baghdad Iraq Hanoi Vietnam',
  'Baghdad Iraq Havana Cuba'

In [36]:
vector_space = []

for word in vocabs:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [37]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [38]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

In [39]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [40]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


In [41]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [42]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


In [43]:
file_path = "../wordsim_similarity_goldstandard.txt"

with open(file_path, 'r') as file:
            content = file.readlines()

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [44]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [45]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [46]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is 0.09.


## 7. Save model

In [47]:
import torch
import pickle


model_save_path = '../models/skipgramneg.model'
torch.save(model.state_dict(), model_save_path)

model_args = {
    'voc_size': vocab_size,
    'emb_size': dim_size,
    'word2index': word2index,
}

arg_file_path = '../models/skipgramneg.args'
with open(arg_file_path, 'wb') as f:
    pickle.dump(model_args, f)

In [48]:
with open(arg_file_path, 'rb') as f:
    loaded_args = pickle.load(f)


model_skipgramneg = SkipgramNeg(**loaded_args)
model_skipgramneg.load_state_dict(torch.load(model_save_path))


  model_skipgramneg.load_state_dict(torch.load(model_save_path))


<All keys matched successfully>

In [50]:
model_skipgramneg.get_embed('country')

(-0.7178047895431519, -0.16817757487297058)

In [51]:
model_skipgramneg.get_embed('nation')

(-0.8031865954399109, -0.23456528782844543)