In [1]:
# ====== SETUP (must run before Dataset / models) ======
import re, math, time, random
import numpy as np
import pandas as pd
import time


import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader  # <-- fixes Dataset error

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # <-- fixes device error
print("device:", device)



device: cpu


In [None]:
# download reuters dataset
nltk.download('reuters')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
from nltk.corpus import reuters

doc_ids = reuters.fileids()
docs_raw = [reuters.raw(doc_id) for doc_id in doc_ids]

print("num docs:", len(docs_raw))
print("first doc:\n", docs_raw[0])


num docs: 10788
first doc:
 ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict would hurt
  them in the long-run, in the short-term Tokyo's loss might be
  their gain.
      The U.S. Has said it will impose 300 mln dlrs of tariffs on
  imports of Japanese electronics goods on April 17, in
  retaliation for Japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      Unofficial Japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they 

Preprocess function

In [None]:
import re
from nltk.tokenize import word_tokenize, sent_tokenize

def preprocess(text: str):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  #remove except letters
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if len(t) >= 2]
    return tokens


In [None]:
all_sentences = []

for raw in docs_raw:
    for s in sent_tokenize(raw):
        toks = preprocess(s)
        if len(toks) >= 3:  # keep only sentences with 3 or more tokens
            all_sentences.append(toks)

print("num sentences:", len(all_sentences))
print("sample:", all_sentences[0][:20])


num sentences: 53232
sample: ['asian', 'exporters', 'fear', 'damage', 'from', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'and', 'japan', 'has', 'raised', 'fears', 'among', 'many', 'of']


In [6]:
avg_len = sum(len(s) for s in all_sentences) / len(all_sentences)
print("avg tokens per sentence:", round(avg_len, 2))

for i in range(3):
    print(i, all_sentences[i])


avg tokens per sentence: 23.96
0 ['asian', 'exporters', 'fear', 'damage', 'from', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'and', 'japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'asia', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', 'reaching', 'economic', 'damage', 'businessmen', 'and', 'officials', 'said']
1 ['they', 'told', 'reuter', 'correspondents', 'in', 'asian', 'capitals', 'move', 'against', 'japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'and', 'lead', 'to', 'curbs', 'on', 'american', 'imports', 'of', 'their', 'products']
2 ['but', 'some', 'exporters', 'said', 'that', 'while', 'the', 'conflict', 'would', 'hurt', 'them', 'in', 'the', 'long', 'run', 'in', 'the', 'short', 'term', 'tokyo', 'loss', 'might', 'be', 'their', 'gain']


Step 2: Vocabulary & numericalization

In [7]:
all_sentences[3]  # list of tokenized sentences


['the',
 'has',
 'said',
 'it',
 'will',
 'impose',
 'mln',
 'dlrs',
 'of',
 'tariffs',
 'on',
 'imports',
 'of',
 'japanese',
 'electronics',
 'goods',
 'on',
 'april',
 'in',
 'retaliation',
 'for',
 'japan',
 'alleged',
 'failure',
 'to',
 'stick',
 'to',
 'pact',
 'not',
 'to',
 'sell',
 'semiconductors',
 'on',
 'world',
 'markets',
 'at',
 'below',
 'cost']

Step 2.1: Build vocabulary

In [8]:
from collections import Counter

# flatten all sentences into one list
all_tokens = []
for sent in all_sentences:
    all_tokens.extend(sent)

# count word frequencies
word_counts = Counter(all_tokens)

print("Total unique words:", len(word_counts))
print(word_counts.most_common(10))


Total unique words: 29149
[('the', 69256), ('of', 36778), ('to', 36400), ('in', 29247), ('and', 25648), ('said', 25362), ('mln', 18623), ('vs', 14340), ('for', 13781), ('dlrs', 12407)]


Step 2.2: Limit vocabulary size

In [None]:
MAX_VOCAB_SIZE = 20000 
MIN_COUNT = 5  # ignore very rare words
UNK_TOKEN = "<UNK>"

vocab = [
    word for word, count in word_counts.items()
    if count >= MIN_COUNT
]

vocab = vocab[:MAX_VOCAB_SIZE]

print("Final vocab size:", len(vocab))


Final vocab size: 10398


In [10]:
vocab = [UNK_TOKEN] + vocab
print("Final vocab size (with UNK):", len(vocab))
print("First 10 vocab items:", vocab[:10])

Final vocab size (with UNK): 10399
First 10 vocab items: ['<UNK>', 'asian', 'exporters', 'fear', 'damage', 'from', 'japan', 'mounting', 'trade', 'friction']


Step 2.3: Create word ‚Üî id mapping

Map each word to a unique integer index so it can be used by embedding layers


In [11]:
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
print("UNK token id:", word2idx[UNK_TOKEN])


UNK token id: 0


In [12]:
vocab_size = len(word2idx)
print("Vocabulary size:", vocab_size)


Vocabulary size: 10399


In [13]:
print("Max word index:", max(word2idx.values()))


Max word index: 10398


In [14]:
word2idx["oil"] 


414

In [15]:
word2idx["<UNK>"] == 0

True

Step 2.4: Convert sentences to word IDs

In [16]:
[word2idx[w] for w in sent if w in word2idx]


[4301,
 3847,
 706,
 159,
 945,
 1162,
 2205,
 1163,
 2205,
 1162,
 2189,
 2205,
 1163,
 2205,
 945,
 1163,
 2251,
 69,
 1163,
 69]

In [17]:
sent_ids = [
    word2idx.get(w, word2idx["<UNK>"])
    for w in sent
]


In [18]:
sentences_ids = []

for sent in all_sentences:
    ids = [
        word2idx.get(w, word2idx[UNK_TOKEN])
        for w in sent
    ]
    if len(ids) >= 2:
        sentences_ids.append(ids)


In [19]:
print("Number of sentences (ID form):", len(sentences_ids))
print("Example sentence (words):", all_sentences[0][:15])
print("Example sentence (ids):", sentences_ids[0][:15])

# test unknown word explicitly
print("Test unknown word -> id:",
      word2idx.get("thisworddoesnotexist", word2idx[UNK_TOKEN]))

Number of sentences (ID form): 53232
Example sentence (words): ['asian', 'exporters', 'fear', 'damage', 'from', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'and', 'japan', 'has']
Example sentence (ids): [1, 2, 3, 4, 5, 6, 0, 7, 8, 9, 10, 11, 12, 6, 13]
Test unknown word -> id: 0


In [20]:
import pickle
import os
os.makedirs("artifacts", exist_ok=True)

with open("artifacts/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

with open("artifacts/idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)


print("Vocabulary files saved")

Vocabulary files saved


Skip-gram pair generation

In [21]:
def generate_skipgram_pairs(sentences_ids, window_size=2):
    """
    Generate (center, context) word pairs for Skip-gram training.

    Args:
        sentences_ids (list of list of int): tokenized sentences as word indices
        window_size (int): context window size on each side

    Returns:
        list of (int, int): (center_word, context_word) pairs
    """
    pairs = []

    for sentence in sentences_ids:
        sentence_length = len(sentence)

        for center_pos, center_word in enumerate(sentence):
            start = max(0, center_pos - window_size)
            end = min(sentence_length, center_pos + window_size + 1)

            for context_pos in range(start, end):
                if context_pos != center_pos:
                    context_word = sentence[context_pos]
                    pairs.append((center_word, context_word))

    return pairs


In [22]:
pairs = generate_skipgram_pairs(sentences_ids, window_size=2)

print("Number of training pairs:", len(pairs))
print("First 5 pairs:", pairs[:5])


Number of training pairs: 4782684
First 5 pairs: [(1, 2), (1, 3), (2, 1), (2, 3), (2, 4)]


Skip-gram Dataset

In [23]:
class SkipGramDataset(Dataset):
    """
    PyTorch Dataset for Skip-gram (center, context) word pairs
    """
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        center_word, context_word = self.pairs[idx]
        return (
            torch.tensor(center_word, dtype=torch.long),
            torch.tensor(context_word, dtype=torch.long)
        )


In [24]:
dataset = SkipGramDataset(pairs)

dataloader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=True
)

print("Number of batches:", len(dataloader))


Number of batches: 37365


Skip-gram model

In [25]:
class SkipGram(nn.Module):
    """
    Skip-gram model using two embedding matrices:
    - input embeddings for center words
    - output embeddings for context words
    """
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.input_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words):
        """
        center_words: Tensor of shape (batch_size,)
        returns: center word embeddings of shape (batch_size, embedding_dim)
        """
        return self.input_embeddings(center_words)


In [None]:
# Create Skip-gram model
embedding_dim = 100 

skipgram_model = SkipGram(vocab_size, embedding_dim)
optimizer = torch.optim.Adam(skipgram_model.parameters(), lr=0.001)

# Track losses
skipgram_losses = []


Skip-gram loss function

In [27]:
def skipgram_loss(center_vectors, context_words, model):
    """
    center_vectors: (batch_size, embedding_dim)
    context_words: (batch_size,)
    """
    context_vectors = model.output_embeddings(context_words)

    # dot product between center and context vectors
    scores = torch.sum(center_vectors * context_vectors, dim=1)

    # negative log likelihood
    loss = -torch.mean(torch.log(torch.sigmoid(scores)))
    return loss


Training loop

In [28]:
def train_skipgram_earlystop(
    model,
    dataloader,
    optimizer,
    max_epochs=10,
    patience=2,
    min_delta=1e-4
):
    model.train()
    losses = []

    best_loss = float("inf")
    no_improve = 0

    for epoch in range(max_epochs):
        total_loss = 0.0

        for center_words, context_words in dataloader:
            optimizer.zero_grad()
            center_vectors = model(center_words)
            loss = skipgram_loss(center_vectors, context_words, model)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        losses.append(avg_loss)

        print(f"[SG] Epoch {epoch+1}, Loss: {avg_loss:.6f}")

        if best_loss - avg_loss < min_delta:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping triggered (Skip-gram)")
                break
        else:
            best_loss = avg_loss
            no_improve = 0

    return losses


Train Skip-gram

In [29]:
import time

start_time = time.time()

skipgram_losses = train_skipgram_earlystop(
    skipgram_model,
    dataloader,
    optimizer,
    max_epochs=10
)

skipgram_time = time.time() - start_time


KeyboardInterrupt: 

In [None]:
import os
import pickle
with open("artifacts/skipgram_model.pkl", "wb") as f:
    pickle.dump({
        "model_state_dict": skipgram_model.state_dict(),
        "word2idx": word2idx,
        "idx2word": idx2word,
        "vocab_size": vocab_size,
        "embedding_dim": embedding_dim
    }, f)


Negative Sampling Dataset

In [None]:
class SkipGramNegDataset(Dataset):
    def __init__(self, pairs, vocab_size, num_negatives=5):
        self.pairs = pairs
        self.vocab_size = vocab_size
        self.num_negatives = num_negatives

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        center_word, pos_context = self.pairs[idx]

        neg_contexts = torch.randint(
            low=0,
            high=self.vocab_size,
            size=(self.num_negatives,),
            dtype=torch.long
        )

        return (
            torch.tensor(center_word, dtype=torch.long),
            torch.tensor(pos_context, dtype=torch.long),
            neg_contexts
        )


In [None]:
neg_dataset = SkipGramNegDataset(
    pairs,
    vocab_size=vocab_size,
    num_negatives=5
)


In [None]:
neg_dataloader = DataLoader(
    neg_dataset,
    batch_size=128,
    shuffle=True
)


Negative Sampling Loss

In [None]:
def neg_sampling_loss(center_vectors, pos_context, neg_contexts, model):
    pos_vectors = model.output_embeddings(pos_context)
    neg_vectors = model.output_embeddings(neg_contexts)

    pos_score = torch.sum(center_vectors * pos_vectors, dim=1)
    pos_loss = torch.log(torch.sigmoid(pos_score))

    neg_score = torch.bmm(
        neg_vectors, center_vectors.unsqueeze(2)
    ).squeeze()
    neg_loss = torch.sum(torch.log(torch.sigmoid(-neg_score)), dim=1)

    return -(pos_loss + neg_loss).mean()


Training Loop (NEG)

In [None]:
def train_skipgram_neg_earlystop(
    model,
    dataloader,
    optimizer,
    max_epochs=10,
    patience=2,
    min_delta=1e-3
):
    model.train()
    losses = []
    best_loss = float("inf")
    no_improve = 0

    for epoch in range(max_epochs):
        total_loss = 0.0

        for center_words, pos_context, neg_contexts in dataloader:
            optimizer.zero_grad()
            center_vectors = model(center_words)
            loss = neg_sampling_loss(
                center_vectors, pos_context, neg_contexts, model
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        losses.append(avg_loss)
        print(f"[NEG] Epoch {epoch+1}, Loss: {avg_loss:.4f}")

        if best_loss - avg_loss < min_delta:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping triggered (NEG)")
                break
        else:
            best_loss = avg_loss
            no_improve = 0

    return losses


In [None]:
skipgram_neg_model = SkipGram(vocab_size, embedding_dim)
optimizer_neg = torch.optim.Adam(skipgram_neg_model.parameters(), lr=0.001)


Train NEG model

In [None]:
from torch.utils.data import Dataset, DataLoader

start_time = time.time()
neg_losses = train_skipgram_neg_earlystop(
    skipgram_neg_model,
    neg_dataloader,
    optimizer_neg,
    max_epochs=10
)
neg_time = time.time() - start_time




[NEG] Epoch 1, Loss: 5.7840
[NEG] Epoch 2, Loss: 1.5392
[NEG] Epoch 3, Loss: 1.2280
[NEG] Epoch 4, Loss: 1.1168
[NEG] Epoch 5, Loss: 1.0584
[NEG] Epoch 6, Loss: 1.0215
[NEG] Epoch 7, Loss: 0.9972
[NEG] Epoch 8, Loss: 0.9797
[NEG] Epoch 9, Loss: 0.9670
[NEG] Epoch 10, Loss: 0.9570


In [None]:
with open("artifacts/skipgram_neg_model.pkl", "wb") as f:
    pickle.dump({
        "model_state_dict": skipgram_neg_model.state_dict(),
        "word2idx": word2idx,
        "idx2word": idx2word,
        "vocab_size": vocab_size,
        "embedding_dim": embedding_dim
    }, f)


In [None]:
print(f"Skip-gram training time: {skipgram_time:.2f} s")
print(f"NEG training time: {neg_time:.2f} s")

Skip-gram training time: 811.89 s
NEG training time: 1532.75 s


GloVe

In [None]:
import numpy as np
import time
from collections import defaultdict


Build co-occurrence matrix

In [None]:
def build_cooccurrence(sentences_ids, window_size=2):
    cooc = defaultdict(float)

    for sent in sentences_ids:
        for i, wi in enumerate(sent):
            start = max(0, i - window_size)
            end = min(len(sent), i + window_size + 1)

            for j in range(start, end):
                if i != j:
                    wj = sent[j]
                    cooc[(wi, wj)] += 1.0 / abs(i - j)

    return cooc


cooc_matrix = build_cooccurrence(sentences_ids, window_size=2)
print("Co-occurrence pairs:", len(cooc_matrix))


Co-occurrence pairs: 1030764


initialize GloVe parameters

In [None]:
embedding_dim = 100
vocab_size = len(word2idx)

X_MAX = 100
ALPHA = 0.75
LR = 0.001 

rng = np.random.default_rng(42)

W = rng.normal(0, 0.01, (vocab_size, embedding_dim))
W_tilde = rng.normal(0, 0.01, (vocab_size, embedding_dim))
b = np.zeros(vocab_size)
b_tilde = np.zeros(vocab_size)


GloVe weighting function and training loop

In [None]:
def glove_weight(x, x_max=X_MAX, alpha=ALPHA):
    if x < x_max:
        return (x / x_max) ** alpha
    return 1.0

def train_glove(
    cooc_matrix,
    W,
    W_tilde,
    b,
    b_tilde,
    epochs=10,
    lr=LR,
    grad_clip=5.0
):
    losses = []

    for epoch in range(epochs):
        total_loss = 0.0

        for (i, j), x in cooc_matrix.items():
            if x <= 0:
                continue

            w_ij = glove_weight(x)

            dot = np.dot(W[i], W_tilde[j])
            dot = np.clip(dot, -10, 10)      # üî• prevents overflow

            diff = dot + b[i] + b_tilde[j] - np.log(x)
            loss = w_ij * diff * diff
            total_loss += loss

            grad = w_ij * diff
            grad = np.clip(grad, -grad_clip, grad_clip)

            W_i_old = W[i].copy()

            W[i] -= lr * grad * W_tilde[j]
            W_tilde[j] -= lr * grad * W_i_old
            b[i] -= lr * grad
            b_tilde[j] -= lr * grad

        losses.append(total_loss)
        print(f"[GloVe] Epoch {epoch+1}, Loss: {total_loss:.2e}")

        if not np.isfinite(total_loss):
            print("NaN detected ‚Äî stopping")
            break

    return losses


In [None]:
start = time.time()
glove_losses = train_glove(
    cooc_matrix,
    W,
    W_tilde,
    b,
    b_tilde,
    epochs=10,
    lr=0.001
)
print("Training time:", time.time() - start)


[GloVe] Epoch 1, Loss: 2.33e+05
[GloVe] Epoch 2, Loss: 1.65e+05
[GloVe] Epoch 3, Loss: 1.40e+05
[GloVe] Epoch 4, Loss: 1.27e+05
[GloVe] Epoch 5, Loss: 1.19e+05
[GloVe] Epoch 6, Loss: 1.13e+05
[GloVe] Epoch 7, Loss: 1.09e+05
[GloVe] Epoch 8, Loss: 1.06e+05
[GloVe] Epoch 9, Loss: 1.04e+05
[GloVe] Epoch 10, Loss: 1.02e+05
Training time: 70.42591309547424


In [None]:
glove_embeddings = W + W_tilde


In [None]:
start_time = time.time()

glove_time = time.time() - start_time


In [None]:
with open("artifacts/glove_model.pkl", "wb") as f:
    pickle.dump({
        "embeddings": glove_embeddings,
        "word2idx": word2idx,
        "idx2word": idx2word
    }, f)


Task 2 Evaluation (ALL 3 MODELS)

In [None]:
# ===== Extract trained embeddings =====

# Skip-gram
sg_embeddings = skipgram_model.input_embeddings.weight.detach().cpu().numpy()

# Negative Sampling
neg_embeddings = skipgram_neg_model.input_embeddings.weight.detach().cpu().numpy()

# GloVe
glove_embeddings = W + W_tilde


In [None]:
def normalize_embeddings(embeddings):
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / (norms + 1e-10)

sg_embeddings_norm = normalize_embeddings(sg_embeddings)
neg_embeddings_norm = normalize_embeddings(neg_embeddings)
glove_embeddings_norm = normalize_embeddings(glove_embeddings)


In [None]:
def load_analogy_dataset(path):
    analogies = []
    current_category = None

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Category line
            if line.startswith(":"):
                current_category = line[2:]
                continue

            parts = line.lower().split()

            if len(parts) != 4:
                continue

            a, b, c, d = parts
            analogies.append((current_category, a, b, c, d))

    return analogies

analogy_data = load_analogy_dataset("word-test.v1.txt")


def analogy_predict(a, b, c, embeddings, word2idx, idx2word):
    for w in (a, b, c):
        if w not in word2idx:
            return None

    va = embeddings[word2idx[a]]
    vb = embeddings[word2idx[b]]
    vc = embeddings[word2idx[c]]

    target = vb - va + vc
    target /= np.linalg.norm(target) + 1e-10

    sims = embeddings @ target
    for w in (a, b, c):
        sims[word2idx[w]] = -np.inf

    return idx2word[np.argmax(sims)]


In [None]:
def evaluate_analogies(analogies, embeddings, word2idx, idx2word):
    semantic_correct = semantic_total = 0
    syntactic_correct = syntactic_total = 0

    for category, a, b, c, d in analogies:
        pred = analogy_predict(a, b, c, embeddings, word2idx, idx2word)
        if pred is None:
            continue

        if category.startswith("gram"):
            syntactic_total += 1
            if pred == d:
                syntactic_correct += 1
        else:
            semantic_total += 1
            if pred == d:
                semantic_correct += 1

    return {
        "semantic_acc": semantic_correct / max(1, semantic_total),
        "syntactic_acc": syntactic_correct / max(1, syntactic_total),
    }


Run evaluation for ALL models

In [None]:
sg_analogy = evaluate_analogies(
    analogy_data,
    sg_embeddings_norm,
    word2idx,
    idx2word
)

neg_analogy = evaluate_analogies(
    analogy_data,
    neg_embeddings_norm,
    word2idx,
    idx2word
)

glove_analogy = evaluate_analogies(
    analogy_data,
    glove_embeddings_norm,
    word2idx,
    idx2word
)

print("Skip-gram:", sg_analogy)
print("Skip-gram + NEG:", neg_analogy)
print("GloVe:", glove_analogy)


Skip-gram: {'semantic_acc': 0.0, 'syntactic_acc': 0.0007015902712815715}
Skip-gram + NEG: {'semantic_acc': 0.020955574182732608, 'syntactic_acc': 0.011693171188026192}
GloVe: {'semantic_acc': 0.0, 'syntactic_acc': 0.00023386342376052386}


In [None]:
import pandas as pd

wordsim_df = pd.read_csv("wordsim353.csv")
wordsim_df.head()


Unnamed: 0,Word 1,Word 2,Human (Mean)
0,admission,ticket,5.536
1,alcohol,chemistry,4.125
2,aluminum,metal,6.625
3,announcement,effort,2.0625
4,announcement,news,7.1875


Utility functions (dot product + evaluation)

In [None]:
import numpy as np
from scipy.stats import spearmanr

def get_dot_product(w1, w2, embeddings, word2idx):
    if w1 not in word2idx or w2 not in word2idx:
        return None
    return np.dot(embeddings[word2idx[w1]], embeddings[word2idx[w2]])


Compute Spearman correlation

In [None]:
def evaluate_wordsim(embeddings, word2idx, wordsim_df):
    model_scores = []
    human_scores = []

    for _, row in wordsim_df.iterrows():
        w1 = row["Word 1"].lower()
        w2 = row["Word 2"].lower()
        human_score = row["Human (Mean)"]

        score = get_dot_product(w1, w2, embeddings, word2idx)
        if score is not None:
            model_scores.append(score)
            human_scores.append(human_score)

    corr, _ = spearmanr(model_scores, human_scores)
    return corr


In [None]:
sg_wordsim = evaluate_wordsim(sg_embeddings, word2idx, wordsim_df)
neg_wordsim = evaluate_wordsim(neg_embeddings, word2idx, wordsim_df)
glove_wordsim = evaluate_wordsim(glove_embeddings, word2idx, wordsim_df)

print("WordSim Spearman Correlation:")
print("Skip-gram:", sg_wordsim)
print("Skip-gram + NEG:", neg_wordsim)
print("GloVe:", glove_wordsim)


WordSim Spearman Correlation:
Skip-gram: -0.07303293005723953
Skip-gram + NEG: 0.26210855852372505
GloVe: 0.13667895006713254


| Model           | Epochs          | Training Time (s) | Final Loss     | WordSim (œÅ) | Semantic Acc | Syntactic Acc |
| --------------- | --------------- | ----------------- | -------------- | ----------- | ------------ | ------------- |
| Skip-gram       | ‚â§7 (early stop) | **811.89**        | ~0.0000        | **‚àí0.073**  | 0.0000       | 0.0007        |
| Skip-gram + NEG | 10              | **1532.75**       | **0.9570**     | **0.262**   | 0.0210       | 0.0117        |
| GloVe           | 10              | **70.43**         | **1.02 √ó 10‚Åµ** | **0.137**   | 0.0000       | 0.0002        |


SG < NEG < GloVe in time

Skip-gram with negative sampling requires substantially more training time than vanilla Skip-gram due to additional negative samples processed for each training pair. GloVe is significantly faster in our implementation because it operates on a sparse co-occurrence matrix rather than iterating over all

Overall analysis 

Skip-gram trains fast and the loss becomes very small, but it does not learn good word meaning. This is shown by the negative WordSim correlation and very low accuracy. Without negative sampling, the model cannot separate correct and incorrect contexts well.

Skip-gram with negative sampling takes longer time to train, but it gives the best results. It has higher semantic and syntactic accuracy and the highest correlation with human similarity scores. This shows that negative sampling helps the model learn better word representations.

GloVe trains much faster than Skip-gram models and the loss decreases steadily. However, its performance on analogy tasks is still low. This may be because the model is trained for a small number of epochs and uses a simple optimization method.