In [1]:
import nltk
# nltk.download()

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Define some very simple data for understanding

In [3]:
from nltk.corpus import brown

# Fetch sentences from the 'news' category of the Brown corpus
corpus = brown.sents(categories='news') 

# Convert sentences into a list of words for each sentence
corpus = [[word for word in sentence] for sentence in corpus]

# Print the first sentence as a list of words
print(corpus[0])


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [4]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))

vocab

['desk',
 'walloped',
 'meets',
 'art',
 'B',
 'everybody',
 'windows',
 'threat',
 'Pantas',
 'repaid',
 "Wednesday's",
 'oysters',
 'Anderlini',
 'blast',
 'McCluskey',
 'Sheraton-Biltmore',
 'fumble',
 'difficulty',
 'Brain',
 'nonsense',
 'remains',
 'purposely',
 'grandfather',
 'Producer',
 '$900',
 'appraisers',
 'grant-in-aid',
 'perennial',
 'peoples',
 'threesome',
 'camera',
 'protests',
 'Harlingen',
 'pictured',
 'exclusively',
 'bilateral',
 'imagine',
 'cashed',
 'ballets',
 'Brooklyn',
 'idol',
 '85',
 'pad',
 'celebrating',
 'ominous',
 'enrollment',
 'interviews',
 'Show',
 'dances',
 'Gunther',
 'Worth',
 'bill',
 'training',
 'assessors',
 'pennies',
 '17',
 'strengthened',
 'afford',
 'specialties',
 'escape',
 '637',
 'strategy',
 'under-developed',
 'Brett',
 'Piero',
 'Edna',
 'Deaf',
 "Throneberry's",
 'revenge',
 'Civil',
 'Michael',
 'aboard',
 'realty',
 'monotonous',
 'vitality',
 'convinced',
 'tilts',
 'forthcoming',
 'Initially',
 'sailing',
 'valuable',

In [5]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [6]:
#vocab size
voc_size = len(vocab)
print(voc_size)

14394


In [7]:
# append UNK
vocab.append('<UNK>')


In [8]:
vocab[-1:]

['<UNK>']

In [9]:
word2index['<UNK>'] = 0

In [10]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Prepare train data

In [11]:
for c in corpus:
    print(c)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']
['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports

In [12]:
def random_batch(batch_size, word_sequence):
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(1, len(sent) - 1):
            target = word2index[sent[i]]
            context = [word2index[sent[i - 1]], word2index[sent[i + 1]]]
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

In [13]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus)

print("Input: ", input_batch)
print("Target: ", target_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[13242]
 [ 5136]]
Target:  [[9096]
 [ 491]]


In [None]:
input_batch.shape, target_batch.shape

## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [14]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

## 4. Training

In [15]:
batch_size     = 2 # mini-batch size
embedding_size = 2 # so we can later plot
model          = Skipgram(voc_size, embedding_size)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([2, 14395])

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
import time

# Training
num_epochs = 50
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 10 | cost: 10.066965 | time: 0m 0s
Epoch: 20 | cost: 10.694195 | time: 0m 0s
Epoch: 30 | cost: 9.792861 | time: 0m 0s
Epoch: 40 | cost: 10.402643 | time: 0m 0s
Epoch: 50 | cost: 10.152073 | time: 0m 0s


## 5. Plotting the embeddings

In [19]:
#let's write a function to get embedding given a word
def get_embed(word):
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.embedding_v(id_tensor)
    u_embed = model.embedding_u(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [20]:
plt.figure(figsize=(6,10))
for i, word in enumerate(vocab[:]): #loop each unique vocab
    x, y = get_embed(word)
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
plt.show()

## 6. Cosine similarity

Formally the [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) $s$ between two vectors $p$ and $q$ is defined as:

$$s = \frac{p \cdot q}{||p|| ||q||}, \textrm{ where } s \in [-1, 1] $$ 

If $p$ and $q$ is super similar, the result is 1 otherwise 0.

In [None]:
with open('word-test_semantic.txt', 'r') as file:
    # Read the entire file content
    # dataset = file.read() 
    # print(dataset)

    # Read the file line by line
    # for line in file:
    #     print(line.strip()) 

    # Read all lines into a list
    data_sem = file.readlines()
    print(data_sem)

with open('word-test_syntactic.txt', 'r') as file:
    # Read all lines into a list
    data_syn = file.readlines()
    print(data_syn)

In [None]:
def clean_data(data):
    cleaned_data = []
    for line in data:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace, including \n and \t
        cleaned_data.append(cleaned_line)

    return cleaned_data

# usage:
data_sem = clean_data(data_sem)
print(data_sem)

data_syn = clean_data(data_syn)
print(data_syn)

In [26]:
def parse_analogies(file_data):
    """
    Parse analogy datasets.
    Each analogy should be in the format: word1 word2 word3 word4
    """
    analogies = []
    for line in file_data:
        words = line.split()
        if len(words) == 4:
            analogies.append(tuple(words))
    return analogies

# Parse the datasets
semantic_analogies = parse_analogies(data_sem)
syntactic_analogies = parse_analogies(data_syn)

In [None]:
semantic_analogies

In [None]:
syntactic_analogies

In [27]:
def predict_word(model, word2index, index2word, analogy):
    word1, word2, word3, _ = analogy
    if word1 not in word2index or word2 not in word2index or word3 not in word2index:
        return '<UNK>'  # Return UNK if any word is out of vocabulary

    word1_idx = torch.LongTensor([word2index[word1]])
    word2_idx = torch.LongTensor([word2index[word2]])
    word3_idx = torch.LongTensor([word2index[word3]])

    # Get embeddings
    word1_embed = model.embedding_v(word1_idx)
    word2_embed = model.embedding_v(word2_idx)
    word3_embed = model.embedding_v(word3_idx)

    # Vector math: word2 - word1 + word3
    target_vector = word2_embed - word1_embed + word3_embed

    # Compute cosine similarity with all vocabulary embeddings
    all_embeddings = model.embedding_v.weight.data
    similarities = torch.nn.functional.cosine_similarity(target_vector, all_embeddings, dim=1)

    # Find the index of the most similar word
    predicted_idx = torch.argmax(similarities).item()
    return index2word[predicted_idx]


## 7. Accuracy calculation: semantic and syntactic_accuracy

In [28]:
def calculate_accuracy(analogies, model, word2index, index2word):
    """
    Calculate accuracy for a given set of analogies.
    """
    correct = 0
    total = 0

    for analogy in analogies:
        total += 1
        predicted_word = predict_word(model, word2index, index2word, analogy)
        if predicted_word.lower() == analogy[3].lower():
            correct += 1

    return correct / total if total > 0 else 0

In [None]:
# Calculate syntactic and semantic accuracies
semantic_accuracy = calculate_accuracy(semantic_analogies, model, word2index, index2word)
syntactic_accuracy = calculate_accuracy(syntactic_analogies, model, word2index, index2word)

print(f"Semantic Accuracy: {semantic_accuracy * 100:.2f}%")
print(f"Syntactic Accuracy: {syntactic_accuracy * 100:.2f}%")

## 8. Correlation

In [34]:
from scipy.stats import spearmanr
import torch.nn.functional as F
import numpy as np

def parse_similarity_file(filepath):
    word_pairs = []
    human_scores = []

    with open(filepath, 'r') as file:
        for line in file:
            words = line.split()
            if len(words) == 3:
                word1, word2, score = words
                word_pairs.append((word1, word2))
                human_scores.append(float(score))
    
    return word_pairs, human_scores

In [None]:

def compute_model_similarity(model, word2index, word_pairs):
    similarities = []
    for word1, word2 in word_pairs:
        if word1 in word2index and word2 in word2index:
            word1_idx = torch.LongTensor([word2index[word1]])
            word2_idx = torch.LongTensor([word2index[word2]])
            
            word1_embed = model.embedding_v(word1_idx)
            word2_embed = model.embedding_v(word2_idx)
            
            # Cosine similarity
            sim = F.cosine_similarity(word1_embed, word2_embed, dim=1).item()
            similarities.append(sim)
        else:
            similarities.append(0.0)  # Assign 0 if either word is out of vocabulary
    
    return similarities


In [41]:
def calculate_metrics(human_scores, model_scores):
    # Spearman correlation
    spearman_corr, _ = spearmanr(human_scores, model_scores)
    
    # Mean Squared Error
    mse = np.mean((np.array(human_scores) - np.array(model_scores)) ** 2)
    
    return spearman_corr, mse

In [None]:
# usage
all_spearman_corrs = []
all_mses = []

filepaths = [
    'wordsim_relatedness_goldstandard.txt',
    'wordsim_similarity_goldstandard.txt',
    'wordsim353_agreed.txt',
    'wordsim353_annotator1.txt',
    'wordsim353_annotator2.txt',
]

for filepath in filepaths:
    # Parse the similarity dataset
    word_pairs, human_scores = parse_similarity_file(filepath)
    
    # Compute model similarities
    model_scores = compute_model_similarity(model, word2index, word_pairs)
    
    # Calculate metrics
    spearman_corr, mse = calculate_metrics(human_scores, model_scores)
    all_spearman_corrs.append(spearman_corr)
    all_mses.append(mse)
    
    print(f"File: {filepath}")
    print(f"Spearman Correlation: {spearman_corr:.4f}")
    print(f"MSE: {mse:.4f}\n")



In [None]:
# Average metrics across all files
avg_spearman_corr = np.mean(all_spearman_corrs)
avg_mse = np.mean(all_mses)

print(f"Average Spearman Correlation: {avg_spearman_corr:.4f}")
print(f"Average MSE: {avg_mse:.4f}")