# Word2Vec (Negative Sampling)

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [4]:
np.__version__, torch.__version__

('1.26.4', '2.5.1+cu121')

In [5]:
import matplotlib
matplotlib.__version__

'3.10.0'

In [37]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
#import os
import os

os.chdir('/content/drive/MyDrive/_NLP/NLP-A1-That-s-What-I-LIKE-st125553')

## 1. Load data

In [6]:
# Load nltk
import nltk

# download news category dataset from nltk
nltk.download('brown') # download brown corpus
nltk.download('punkt') # download punkt for tokenization
nltk.download('punkt_tab') # download punkt_tab for tokenization

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
#1. tokenization
# import the news category dataset
from nltk.corpus import brown
corpus = brown.sents(categories='news')
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [8]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

print(f"before vocabs_len: {len(vocabs)}")
vocabs.append('<UNK>')
print(f"after vocabs_len: {len(vocabs)}")

before vocabs_len: 14394
after vocabs_len: 14395


In [9]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['<UNK>']

14394

In [10]:
# vocabs.append('<UNK>')
# word2index['<UNK>'] = 6

In [11]:
index2word = {v:k for k, v in word2index.items()}
index2word[14394]

'<UNK>'

## 2. Prepare train data

In [12]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size=2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(1, len(doc)-window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            #outside = (word2index[doc[i-1]], word2index[doc[i+1]])
            outside = [word2index[doc[j]] for j in range(i - window_size, i + window_size + 1) if j != i]

            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2

    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)

    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])

    return np.array(inputs), np.array(labels)

window_size=2
x, y = random_batch(2, corpus, window_size)

In [13]:
x.shape  #batch_size, 1

(2, 1)

In [14]:
x

array([[14080],
       [   68]])

In [15]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [16]:
z = 0.001

In [17]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

100554

In [18]:
vocabs

['successive',
 'Lucille',
 'song',
 '1991',
 'ashes',
 'girls',
 'connotes',
 'Leave',
 'pledge',
 'economic',
 'congealed',
 'Confederacy',
 'continuation',
 'wealth',
 'absorb',
 'bang',
 'batted',
 'even',
 'drove',
 'Seeks',
 'Corcoran',
 'reconvention',
 'liable',
 "taxpayer's",
 'McLemore',
 'function',
 'vanilla',
 'Ocean',
 'zinc',
 'filibuster',
 '58th',
 'invests',
 'divulging',
 'low-wage',
 'seize',
 'Cedric',
 'Jacksonville',
 'pleasant',
 'workshops',
 'commissioner',
 'alert',
 'Chantilly',
 'promise',
 'bicameral',
 'Mercy',
 'earns',
 'abstention',
 'sympathetic',
 'Vic',
 "Kai-shek's",
 '130',
 'touchdown',
 'Stone',
 'outsider',
 'walnut',
 'various',
 'logging',
 'campaigned',
 'contemptuous',
 'moderate-income',
 'playoff',
 'absolute',
 'headboard',
 'toolmaker',
 'co-operation',
 'None',
 'Disapproval',
 'Rome',
 'retaliating',
 'Gen.',
 'excused',
 'Small',
 'eat',
 'scholastic',
 'Mustang',
 'Rickards',
 'moves',
 'Border',
 'recognizes',
 'Barr',
 'luxury',
 

$$P(w)=U(w)^{3/4}/Z$$

In [19]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

Counter(unigram_table)

Counter({'economic': 1,
         'even': 3,
         'Pittsburgh': 1,
         'president': 3,
         'test': 1,
         'indicated': 1,
         'Francisco': 1,
         'man': 4,
         'ruled': 1,
         'into': 6,
         'cent': 3,
         'women': 1,
         'attend': 1,
         'health': 1,
         'estimated': 1,
         'house': 2,
         'chief': 1,
         'personnel': 1,
         'hope': 1,
         'P.': 1,
         'Mantle': 3,
         'by': 18,
         'Congolese': 1,
         'keep': 1,
         "President's": 1,
         'unions': 1,
         'look': 1,
         'basis': 1,
         'Their': 1,
         'business': 2,
         '8': 1,
         'Co.': 2,
         'report': 1,
         'husband': 1,
         'population': 1,
         'final': 1,
         'spring': 1,
         'interested': 1,
         'nations': 1,
         'United': 3,
         'help': 2,
         'rather': 1,
         'Morton': 1,
         'union': 1,
         'according': 1,
        

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [20]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [21]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))

    return torch.cat(neg_samples) #batch_size, k

In [22]:
batch_size = 2
x, y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [23]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [24]:
y_tensor[1]

tensor([9059])

In [25]:
neg_samples[1]

tensor([12290,  4151, 12953,  9905,  9981])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [26]:
class SkipgramNeg(nn.Module):

    def __init__(self, voc_size, emb_size, word2index):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
        self.word2index        = word2index

    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)

        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)

        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)

        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)

        return -torch.mean(loss)

    def get_embed(self, word):
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']

        word = torch.LongTensor([index])

        embed_c = self.embedding_center(word)
        embed_o = self.embedding_outside(word)
        embed   = (embed_c + embed_o) / 2

        return embed[0][0].item(), embed[0][1].item()

In [27]:
#test your model
emb_size = 2
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size, word2index)

In [28]:
loss = model(x_tensor, y_tensor, neg_samples)

In [29]:
loss

tensor(4.3761, grad_fn=<NegBackward0>)

## 5. Training

In [30]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
import time
start_time = time.time()

num_epochs = 100

for epoch in range(num_epochs):
    start = time.time()
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)

    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)

    #backprogate
    optimizer.zero_grad()
    loss.backward()

    #update alpha
    optimizer.step()
    end = time.time()
    epoch_mins, epoch_secs = epoch_time(start, end)

    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch + 1} | Lost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

print(f"Final Loss: {loss:2.6f}")
end_time = time.time()
minutes, seconds = epoch_time(start_time, end_time)
print(f"Total time: {minutes:2.0f} minutes {seconds:2.0f} seconds")

Epoch: 10 | Lost: 1.195970 | time: 0m 0s
Epoch: 20 | Lost: 0.951451 | time: 0m 0s
Epoch: 30 | Lost: 1.052980 | time: 0m 1s
Epoch: 40 | Lost: 1.242271 | time: 0m 1s
Epoch: 50 | Lost: 0.877062 | time: 0m 0s
Epoch: 60 | Lost: 1.733832 | time: 0m 0s
Epoch: 70 | Lost: 1.011738 | time: 0m 0s
Epoch: 80 | Lost: 1.132386 | time: 0m 0s
Epoch: 90 | Lost: 0.733346 | time: 0m 0s
Epoch: 100 | Lost: 1.773133 | time: 0m 0s
Final Loss: 1.773133
Total time:  1 minutes 32 seconds


## 6. Testing

In [33]:
vect = []

for word in vocabs:
    vect.append(model.get_embed(word))
vect = np.array(vect)

In [34]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vect_space, target_vect):
    scores = []
    for each_vect in vect_space:
        each_vect = tuple(each_vect)
        target_vect=tuple(target_vect)
        scores.append(cos_sim(target_vect, each_vect))

    return np.array(scores)

In [35]:
def similarity(model, data):
    words = data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    sim_vect = embed1 - embed0 + embed2

    sim_scores = cos_sim_scores(vect, sim_vect)
    max_score_idx = np.argmax(sim_scores)
    sim_word = index2word[max_score_idx]

    result = False
    if sim_word == words[3]:
        result = True

    return result

### Semantic Test

In [39]:
semantic_file = "data/word-test-semantic.txt"
# open file
with open(semantic_file, "r") as file:
    sem_file = file.readlines()
    #send semantic into vector

semantic = []
for sent in sem_file:
    semantic.append(sent.strip())

#semantic

In [40]:
sem_count = len(semantic)
#sem_total
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [41]:
sem_accuracy = sem_correct / sem_count
print(f"Semantic accuracy: {sem_accuracy:2.2f}")
print(f"Semantic correct: {sem_correct}")
print(f"Semantic count: {sem_count}")

Semantic accuracy: 0.00
Semantic correct: 0
Semantic count: 506


### Syntatic Test

In [42]:
syntatic_file = "data/word-test-syntatic.txt"
# open file
with open(syntatic_file, "r") as file:
    syn_file = file.readlines()

syntatic = []
for sent in syn_file:
    syntatic.append(sent.strip())
#syntatic

In [43]:
syn_count = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [44]:
syn_accuracy = syn_correct / syn_count
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")
print(f"Syntatic correct: {syn_correct}")
print(f"Syntatic count: {syn_count}")

Syntatic accuracy: 0.00
Syntatic correct: 0
Syntatic count: 1560


### Similarity Test


In [45]:
similarity_file = "data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt"
# open file
with open(similarity_file, "r") as file:
    sim_file = file.readlines()

similarity = []
for sent in sim_file:
    similarity.append(sent.strip())
#syntatic

In [46]:
def similarity_test(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    model_result = embed1 @ embed0.T
    sim_result = float(words[2].strip())

    return sim_result, model_result

In [47]:
sim_scores = []
model_scores = []
for sent in similarity:
    sim_result, model_result = similarity_test(model, sent)

    sim_scores.append(sim_result)
    model_scores.append(model_result)

In [48]:
from scipy.stats import spearmanr

corr = spearmanr(sim_scores, model_scores)[0]

print(f"The correlation result is {corr:2.2f}.")

The correlation result is -0.01.


## 7. Saving the model

In [49]:
# Saving the model for testing
torch.save(model.state_dict(), 'app/models/w2v-skipgram-neg.model')

In [50]:
import pickle
neg_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}


In [51]:
import pickle
pickle.dump(neg_args, open('app/models/w2v-skipgram-neg.args', 'wb'))

In [52]:
neg_args = pickle.load(open('app/models/w2v-skipgram-neg.args', 'rb'))
model_neg = SkipgramNeg(**neg_args)
model_neg.load_state_dict(torch.load('app/models/w2v-skipgram-neg.model'))

  model_neg.load_state_dict(torch.load('app/models/w2v-skipgram-neg.model'))


<All keys matched successfully>

In [54]:
# Test the model
model_neg.get_embed('The')

(1.93407142162323, -1.9137628078460693)