# Word2Vec

Let's work on skipgram-based implementation of word2vec.

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import reuters

## 1. Import datasets using nltk 

In [3]:
from nltk.corpus import brown
nltk.download('brown')
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\svrat\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [4]:
corpus = brown.sents(categories='news')
print(corpus[:5])  # Print first 5 sentences from the corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.'], ['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'rep

In [5]:
#get word sequences and unique words
word_sequences = corpus
vocab = set([word for sentence in word_sequences for word in sentence])




In [6]:
#numericalization
word2index = {word: idx for idx, word in enumerate(vocab)}
print(f"{list(word2index.items())[:5]}")

[('baked', 0), ('Bonn', 1), ('Moultons', 2), ('Bermuda', 3), ('$1,000,000,000', 4)]


In [7]:
#vocab size
voc_size = len(vocab)
print(f"Vocab size: {voc_size}")

Vocab size: 14394


In [8]:
#append UNK
vocab.add("<UNK>")

In [9]:
vocab

{'baked',
 'Bonn',
 'Moultons',
 'Bermuda',
 '$1,000,000,000',
 "conference's",
 'murder',
 'menu',
 'Trust',
 'Spangled',
 'drawing',
 '$2,000',
 'fifteenth',
 'promptly',
 'Pentagon',
 'equation',
 'announced',
 'campaign',
 'county-wide',
 'Building',
 'coaches',
 "A.A.U.'s",
 'Unlike',
 'citizens',
 'nationally',
 'Leopoldville',
 'CTA',
 'practices',
 'Chandler',
 'combating',
 'Willie',
 'arrests',
 'Ronald',
 'worth',
 'Harcourt',
 'belt',
 'mature',
 'good-will',
 "Hansen's",
 '16-22',
 'beads',
 'Ray',
 'pack',
 'describes',
 'chairs',
 'alleging',
 'Philharmonique',
 'errors',
 'Issue',
 'direct',
 'policeman',
 'silhouette',
 'Home',
 'respective',
 'Romantic',
 'colossus',
 'retired',
 'mushrooms',
 'Kegham',
 'councils',
 'staffing',
 "Simmons'",
 'haggling',
 'Besset',
 'Chiuchow',
 'recalled',
 'Denials',
 'claims',
 '13-5',
 'suffragettes',
 'Twenty-one-year-old',
 'certain',
 'realistic',
 '22',
 'eleventh',
 'expressing',
 'Guy',
 'bonanza',
 'populated',
 'greater',


In [10]:
word2index['<UNK>'] = 0

In [11]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Prepare train data

In [12]:
for c in corpus:
    print(c)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']
['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports

In [13]:
def random_batch(batch_size, corpus, window_size=2):
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        sent_len = len(sent)
        for i in range(sent_len):
            target = word2index.get(sent[i], word2index['<UNK>'])
            
            #dynamic context window
            start = max(0, i - window_size)
            end = min(sent_len, i + window_size + 1)  
            
            for j in range(start, end):
                if j != i:
                    context_word = word2index.get(sent[j], word2index['<UNK>'])
                    skip_grams.append([target, context_word])       
                                  

    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

### Testing the method

In [14]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus)

print("Input: ", input_batch)
print("Target: ", target_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[8424]
 [7012]]
Target:  [[5159]
 [6854]]


## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [15]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

## 4. Training

In [16]:
batch_size     = 2 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Skipgram(voc_size, embedding_size)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([2, 14395])

In [18]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:
import time


# Training
num_epochs = 5000
for epoch in range(num_epochs):
    
    start = time.time()
    # gradually increase window size
    if epoch < 2000:
        window_size = 2
    elif epoch < 4000:
        window_size = 3
    else:
        window_size = 4
    
  

    input_batch, target_batch = random_batch(batch_size, corpus)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 1000 | cost: 9.372866 | time: 0m 0s
Epoch: 2000 | cost: 10.239733 | time: 0m 0s
Epoch: 3000 | cost: 9.579178 | time: 0m 0s
Epoch: 4000 | cost: 9.826647 | time: 0m 0s
Epoch: 5000 | cost: 9.858641 | time: 0m 0s


## 5. Plotting the embeddings

In [28]:
#list of vocabs
vocab =  list(vocab)
vocab[:50]

['baked',
 'Bonn',
 'Moultons',
 'Bermuda',
 '$1,000,000,000',
 "conference's",
 'murder',
 'menu',
 'Trust',
 'Spangled',
 'drawing',
 '$2,000',
 'fifteenth',
 'promptly',
 'Pentagon',
 'equation',
 'announced',
 'campaign',
 'county-wide',
 'Building',
 'coaches',
 "A.A.U.'s",
 'Unlike',
 'citizens',
 'nationally',
 'Leopoldville',
 'CTA',
 'practices',
 'Chandler',
 'combating',
 'Willie',
 'arrests',
 'Ronald',
 'worth',
 'Harcourt',
 'belt',
 'mature',
 'good-will',
 "Hansen's",
 '16-22',
 'beads',
 'Ray',
 'pack',
 'describes',
 'chairs',
 'alleging',
 'Philharmonique',
 'errors',
 'Issue',
 'direct']

In [29]:
word = vocab[0]

In [30]:
#numericalization
id = word2index[word]
id

0

In [31]:
id_tensor = torch.LongTensor([id])
id_tensor

tensor([0])

In [32]:
#get the embedding by averaging
v_embed = model.embedding_v(id_tensor)
u_embed = model.embedding_u(id_tensor)

v_embed, u_embed

(tensor([[-0.5811, -1.3068]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.1344,  1.3527]], grad_fn=<EmbeddingBackward0>))

In [33]:
#average to get the word embedding
word_embed = (v_embed + u_embed) / 2
word_embed[0][1]

tensor(0.0230, grad_fn=<SelectBackward0>)

In [34]:
#let's write a function to get embedding given a word
def get_embed(word):
    idx = word2index.get(word, word2index["<UNK>"])
    id_tensor = torch.LongTensor([word2index[idx]])
    
    v_embed = model.embedding_v(id_tensor)
    u_embed = model.embedding_u(id_tensor) 
    
    word_embed = (v_embed + u_embed) / 2 
    return word_embed.detach().numpy().flatten()

In [35]:
def plot_embeddings_pca(words, get_embed, title="Word2Vec (PCA 2D)"):
    # 1) collect vectors
    X = np.vstack([get_embed(w) for w in words])   # shape: [n_words, emb_dim]

    # 2) PCA to 2D (no sklearn needed)
    Xc = X - X.mean(axis=0, keepdims=True)
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    X2 = Xc @ Vt[:2].T   # [n_words, 2]

    # 3) plot
    plt.figure(figsize=(10, 5))
    plt.scatter(X2[:, 0], X2[:, 1])
    for i, w in enumerate(words):
        plt.annotate(w, (X2[i, 0], X2[i, 1]), xytext=(5, 2), textcoords="offset points")
    plt.title(title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()

# Example: plot first 30 words in your vocab list
plot_words = vocab[:30]   # vocab should be a list of tokens
plot_embeddings_pca(plot_words, get_embed)


KeyError: 0