# Word2Vec (Skipgram )

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load data

In [2]:
import ssl
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('brown')


[nltk_data] Downloading package brown to
[nltk_data]     /Users/soehtetnaing/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
from nltk.corpus import brown

brown.categories()


['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
corpus = brown.sents(categories="news")
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [5]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [6]:

flattened_list = flatten(corpus)
flattened_list

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.',
 'The',
 'jury',
 'further',
 'said',
 'in',
 'term-end',
 'presentments',
 'that',
 'the',
 'City',
 'Executive',
 'Committee',
 ',',
 'which',
 'had',
 'over-all',
 'charge',
 'of',
 'the',
 'election',
 ',',
 '``',
 'deserves',
 'the',
 'praise',
 'and',
 'thanks',
 'of',
 'the',
 'City',
 'of',
 'Atlanta',
 "''",
 'for',
 'the',
 'manner',
 'in',
 'which',
 'the',
 'election',
 'was',
 'conducted',
 '.',
 'The',
 'September-October',
 'term',
 'jury',
 'had',
 'been',
 'charged',
 'by',
 'Fulton',
 'Superior',
 'Court',
 'Judge',
 'Durwood',
 'Pye',
 'to',
 'investigate',
 'reports',
 'of',
 'possible',
 '``',
 'irregularities',
 "''",
 'in',
 'the',
 'hard-fought',
 'primary',
 'which',
 'was',
 'won',
 'by',
 'Mayor-nominate'

In [7]:
len(vocabs)

14394

In [8]:
vocabs.append('<UNK>')


In [9]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['Country']

9712

In [10]:
len(vocabs)

14395

In [11]:
index2word = {v:k for k, v in word2index.items()}
index2word[14394]

'<UNK>'

## 2. Prepare train data

In [12]:
window_size = 2
batch_size = 2
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, wdsize = 2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(wdsize, len(doc)-wdsize):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-wdsize]], word2index[doc[i+wdsize]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(batch_size, corpus, window_size)

In [13]:
x, y

(array([[  416],
        [10990]]),
 array([[ 8361],
        [10369]]))

In [14]:
x.shape  #batch_size, 1

(2, 1)

In [15]:
y.shape  #batch_size 1

(2, 1)

## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [16]:
vocab_size = len(vocabs)
vocab_size

14395

In [17]:
dim_size = 2

In [18]:
embedding = nn.Embedding(vocab_size, dim_size)

In [19]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape  #(batch_size, 1, emb_size)

torch.Size([2, 1, 2])

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

In [20]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.word2index = word2index
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
    
    def get_embed(self, word):
        
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.embedding_center(word)
        embed_o = self.embedding_outside(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()
        

In [21]:
#prepare all vocabs

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 14392, 14393, 14394],
        [    0,     1,     2,  ..., 14392, 14393, 14394]])

In [22]:
model = Skipgram(vocab_size, dim_size, word2index)
model

Skipgram(
  (embedding_center): Embedding(14395, 2)
  (embedding_outside): Embedding(14395, 2)
)

In [23]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [24]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [25]:
loss

tensor(10.9757, grad_fn=<NegBackward0>)

## 4. Training

In [26]:
batch_size = 2
dim_size   = 2
window_size = 2
model      = Skipgram(vocab_size, dim_size, word2index)
optimizer  = optim.Adam(model.parameters(), lr=0.001)
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)

In [27]:
import time

num_epochs = 100

time_start = time.time()
for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:5.0f} | Loss: {loss:5.2f}")

# Measure total training time and print results
training_time_total = time.time() - time_start

# Calculate minutes and seconds for readability
minutes, seconds = divmod(int(training_time_total), 60)

# Print summary
print(f"\nTraining Complete:\nTotal Loss: {loss:.2f} | Time Taken: {minutes} minutes and {seconds} seconds")


Epoch    10 | Loss: 12.82
Epoch    20 | Loss: 11.62
Epoch    30 | Loss: 11.30
Epoch    40 | Loss: 10.93
Epoch    50 | Loss:  9.70
Epoch    60 | Loss: 11.04
Epoch    70 | Loss: 12.62
Epoch    80 | Loss:  9.82
Epoch    90 | Loss: 10.78
Epoch   100 | Loss: 10.73

Training Complete:
Total Loss: 10.73 | Time Taken: 0 minutes and 42 seconds


## 5. Testing

In [30]:
file_paths = {
    "semantic": "../word-testsemantic.v1.txt",
    "syntatic": "../word-testsyntatic.v1.txt"
}

def load_tests(file_path):
    with open(file_path, "r") as file:
        return [sent.strip() for sent in file]

# Load tests from files
semantic = load_tests(file_paths["semantic"])
syntatic = load_tests(file_paths["syntatic"])

In [31]:
semantic, syntatic

(['Athens Greece Baghdad Iraq',
  'Athens Greece Bangkok Thailand',
  'Athens Greece Beijing China',
  'Athens Greece Berlin Germany',
  'Athens Greece Bern Switzerland',
  'Athens Greece Cairo Egypt',
  'Athens Greece Canberra Australia',
  'Athens Greece Hanoi Vietnam',
  'Athens Greece Havana Cuba',
  'Athens Greece Helsinki Finland',
  'Athens Greece Islamabad Pakistan',
  'Athens Greece Kabul Afghanistan',
  'Athens Greece London England',
  'Athens Greece Madrid Spain',
  'Athens Greece Moscow Russia',
  'Athens Greece Oslo Norway',
  'Athens Greece Ottawa Canada',
  'Athens Greece Paris France',
  'Athens Greece Rome Italy',
  'Athens Greece Stockholm Sweden',
  'Athens Greece Tehran Iran',
  'Athens Greece Tokyo Japan',
  'Baghdad Iraq Bangkok Thailand',
  'Baghdad Iraq Beijing China',
  'Baghdad Iraq Berlin Germany',
  'Baghdad Iraq Bern Switzerland',
  'Baghdad Iraq Cairo Egypt',
  'Baghdad Iraq Canberra Australia',
  'Baghdad Iraq Hanoi Vietnam',
  'Baghdad Iraq Havana Cuba'

In [32]:
vector_space = []

for word in vocabs:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [33]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [34]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

In [35]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [36]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


In [37]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [38]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


In [40]:
file_path = "../wordsim_similarity_goldstandard.txt"

with open(file_path, 'r') as file:
            content = file.readlines()

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [41]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [42]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [43]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation similarity is {corr:2.2f}.")

Correlation similarity is -0.02.


## 6. Save model


In [44]:
import torch
import pickle


model_save_path = '../models/skipgram.model'
torch.save(model.state_dict(), model_save_path)

model_args = {
    'voc_size': vocab_size,
    'emb_size': dim_size,
    'word2index': word2index,
}

arg_file_path = '../models/skipgram.args'
with open(arg_file_path, 'wb') as f:
    pickle.dump(model_args, f)

In [45]:
with open(arg_file_path, 'rb') as f:
    loaded_args = pickle.load(f)


model_skipgram = Skipgram(**loaded_args)
model_skipgram.load_state_dict(torch.load(model_save_path))


  model_skipgram.load_state_dict(torch.load(model_save_path))


<All keys matched successfully>

In [46]:
model_skipgram.get_embed('country')

(0.7995257377624512, -0.22141851484775543)