In [1]:

import re
from collections import Counter
import numpy as np
import nltk

nltk.download("brown")

from nltk.corpus import brown

[nltk_data] Downloading package brown to C:\Users\Samir
[nltk_data]     Pokharel\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
# usinf news category from brown corpus
sentences = brown.sents(categories="news")
print("Number of sentences:", len(sentences))

Number of sentences: 4623


In [3]:
# clean + tokenize(already tokenized in brown corpus just need to normalize)
def clean_token(tok: str):
    tok = tok.lower()
    tok = re.sub(r"[^a-z']+", "", tok)  # remove non-letters
    return tok

docs = []
for sent in sentences:
    cleaned = [clean_token(t) for t in sent]
    cleaned = [t for t in cleaned if t]  # drop empty tokens
    if len(cleaned) > 0:
        docs.append(cleaned)

print("Processed sentences:", len(docs))
print("Example sentence tokens:", docs[0][:30])

Processed sentences: 4608
Example sentence tokens: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place']


In [4]:
all_tokens = [w for s in docs for w in s]
print("Total tokens:", len(all_tokens))
print("Unique tokens:", len(set(all_tokens)))

Total tokens: 87757
Unique tokens: 12286


# vocabulary building + word↔id encoding + a dynamic window (default=2) pair generator

In [5]:
#Build vocabulary + encode sentences + dynamic window pair generator

from collections import Counter
import numpy as np
import random

In [6]:
min_count = 5          # drop rare words (helps speed + quality)
window_default = 2     # requirement: default window = 2
seed = 1337
random.seed(seed)
np.random.seed(seed)

In [7]:
# Build vocab
word_counts = Counter([w for sent in docs for w in sent])

# Keep words >= min_count
vocab = [w for w, c in word_counts.items() if c >= min_count]
vocab = sorted(vocab)  # stable ordering

# Add special token for unknown words 
UNK = "<unk>"
vocab = [UNK] + vocab

stoi = {w: i for i, w in enumerate(vocab)}
itos = {i: w for w, i in stoi.items()}

vocab_size = len(vocab)
print("Vocab size:", vocab_size)
print("Most common words:", word_counts.most_common(10))

Vocab size: 2490
Most common words: [('the', 6386), ('of', 2861), ('and', 2187), ('a', 2170), ('to', 2147), ('in', 2020), ('for', 970), ('that', 829), ('is', 733), ('was', 717)]


In [8]:
# Encode sentences (words -> ids)
def encode_sentence(sent):
    return [stoi.get(w, stoi[UNK]) for w in sent]

encoded_docs = [encode_sentence(sent) for sent in docs]

print("Example encoded sentence:", encoded_docs[0][:20])
print("Example decoded back:", [itos[i] for i in encoded_docs[0][:20]])

Example encoded sentence: [2218, 921, 550, 987, 1208, 1931, 913, 96, 1161, 1534, 0, 1830, 1740, 724, 1754, 1506, 774, 2, 2216, 112]
Example decoded back: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', '<unk>', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', "''", 'that', 'any']


In [9]:
# Dynamic window pair generator
def generate_skipgram_pairs(encoded_sents, window=window_default, dynamic=True):
    """
    encoded_sents: list[list[int]]
    window: max window size
    dynamic: if True, random window size per center word in [1, window]
    """
    for sent in encoded_sents:
        n = len(sent)
        for i, center in enumerate(sent):
            w = random.randint(1, window) if dynamic else window
            left = max(0, i - w)
            right = min(n, i + w + 1)
            for j in range(left, right):
                if j != i:
                    context = sent[j]
                    yield center, context

In [10]:
pair_gen = generate_skipgram_pairs(encoded_docs[:2], window=2, dynamic=True)
for _ in range(10):
    c, ctx = next(pair_gen)
    print("center:", itos[c], "-> context:", itos[ctx])


center: the -> context: fulton
center: the -> context: county
center: fulton -> context: the
center: fulton -> context: county
center: county -> context: the
center: county -> context: fulton
center: county -> context: grand
center: county -> context: jury
center: grand -> context: fulton
center: grand -> context: county


# Skipgram Training

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [12]:
# Training hyperparameters
emb_dim = 100
window = 2              # requirement default=2
dynamic_window = True   # requirement: dynamic window support
batch_size = 2048
steps = 3000            # increase later for better quality (e.g., 20000+)
lr = 3e-3

# How many sentences to sample from each step (controls speed)
sentences_per_step = 64

In [13]:
# Model: Skip-gram (full softmax)

class SkipGram(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, emb_dim)
        self.out = nn.Linear(emb_dim, vocab_size, bias=False)

        # Good init
        nn.init.normal_(self.in_embed.weight, mean=0.0, std=0.01)

    def forward(self, center_ids):
        # center_ids: (B,)
        v = self.in_embed(center_ids)          # (B, D)
        logits = self.out(v)                   # (B, V)
        return logits
    
model = SkipGram(vocab_size, emb_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [14]:
# Helper: sample a batch of (center, context) pairs
def sample_pairs_batch(encoded_sents, batch_size, window=2, dynamic=True, sentences_per_step=64):
    # pick some random sentences
    chosen = random.sample(encoded_sents, k=min(sentences_per_step, len(encoded_sents)))
    gen = generate_skipgram_pairs(chosen, window=window, dynamic=dynamic)

    centers = []
    contexts = []
    # gather pairs until batch is full (or generator ends)
    while len(centers) < batch_size:
        try:
            c, ctx = next(gen)
            centers.append(c)
            contexts.append(ctx)
        except StopIteration:
            # if we ran out, re-sample new sentences
            chosen = random.sample(encoded_sents, k=min(sentences_per_step, len(encoded_sents)))
            gen = generate_skipgram_pairs(chosen, window=window, dynamic=dynamic)

    centers = torch.tensor(centers, dtype=torch.long, device=device)
    contexts = torch.tensor(contexts, dtype=torch.long, device=device)
    return centers, contexts

In [15]:
# Train loop
import time


model.train()

# IMPORTANT for GPU timing
if device == "cuda":
    torch.cuda.synchronize()

start_time = time.perf_counter()

for step in range(1, steps + 1):
    center_ids, target_ctx_ids = sample_pairs_batch(
        encoded_docs,
        batch_size=batch_size,
        window=window,
        dynamic=dynamic_window,
        sentences_per_step=sentences_per_step
    )

    logits = model(center_ids)  # (B, V)
    loss = F.cross_entropy(logits, target_ctx_ids)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % 200 == 0:
        print(f"step {step}/{steps} | loss {loss.item():.4f}")

# synchronize again before stopping timer
if device == "cuda":
    torch.cuda.synchronize()

end_time = time.perf_counter()

train_time_sec = end_time - start_time

print("Done training Skip-gram (full softmax).")
print(f"Training time: {train_time_sec:.2f} seconds")


step 200/3000 | loss 5.6644
step 400/3000 | loss 5.3228
step 600/3000 | loss 5.0530
step 800/3000 | loss 5.4900
step 1000/3000 | loss 5.1489
step 1200/3000 | loss 5.2842
step 1400/3000 | loss 4.9151
step 1600/3000 | loss 4.9734
step 1800/3000 | loss 5.0238
step 2000/3000 | loss 4.7883
step 2200/3000 | loss 4.6731
step 2400/3000 | loss 4.8176
step 2600/3000 | loss 4.8152
step 2800/3000 | loss 4.6766
step 3000/3000 | loss 4.6450
Done training Skip-gram (full softmax).
Training time: 8.65 seconds


In [16]:
# Metrics dictionary 
skipgram_metrics = {
    "model": "Word2Vec Skip-gram (full softmax)",
    "window": window,
    "dynamic_window": dynamic_window,
    "embedding_dim": emb_dim,
    "batch_size": batch_size,
    "steps": steps,
    "training_time_sec": train_time_sec,
    "final_loss": loss.item()
}

skipgram_metrics


{'model': 'Word2Vec Skip-gram (full softmax)',
 'window': 2,
 'dynamic_window': True,
 'embedding_dim': 100,
 'batch_size': 2048,
 'steps': 3000,
 'training_time_sec': 8.647381499999995,
 'final_loss': 4.644984245300293}

# Evaluate trained embeddings (nearest neighbors + similarity)

In [17]:
import torch
import torch.nn.functional as F

# Get embedding matrix (V, D)
with torch.no_grad():
    W = model.in_embed.weight.detach().to("cpu")  # move to CPU for easy use

# Normalize once for cosine similarity
W_norm = F.normalize(W, p=2, dim=1)

In [18]:

def most_similar(word, topk=10):
    if word not in stoi:
        print(f"'{word}' not in vocab. Try another word.")
        return []
    wid = stoi[word]
    query = W_norm[wid]  # (D,)

    # cosine sim with all words
    sims = torch.mv(W_norm, query)  # (V,)
    sims[wid] = -1e9  # exclude itself

    vals, idxs = torch.topk(sims, k=topk)
    results = [(itos[i.item()], vals[j].item()) for j, i in enumerate(idxs)]
    return results

In [19]:
def similarity(w1, w2):
    if w1 not in stoi or w2 not in stoi:
        print("One of the words is not in vocab.")
        return None
    v1 = W_norm[stoi[w1]]
    v2 = W_norm[stoi[w2]]
    return float(torch.dot(v1, v2))

# ---- Try a few words (edit these to whatever you want) ----
test_words = ["market", "bank", "oil", "government", "war", "money"]

for w in test_words:
    if w in stoi:
        print(f"\nMost similar to '{w}':")
        for ww, s in most_similar(w, topk=8):
            print(f"  {ww:15s} {s:.3f}")


Most similar to 'market':
  textile         0.491
  underwater      0.480
  enjoyed         0.467
  stock           0.423
  value           0.416
  exchange        0.415
  power           0.413
  wall            0.412

Most similar to 'bank':
  grand           0.503
  honors          0.491
  chester         0.488
  establishment   0.479
  freedom         0.476
  boston          0.466
  monthly         0.453
  lodge           0.445

Most similar to 'oil':
  teamsters       0.528
  electric        0.524
  katanga         0.510
  cotton          0.497
  palm            0.496
  transportation  0.496
  hollywood       0.488
  minister        0.483

Most similar to 'government':
  situation       0.461
  prowestern      0.460
  measure         0.453
  patient         0.442
  coalition       0.437
  appeals         0.431
  ordinance       0.427
  united          0.424

Most similar to 'war':
  cold            0.484
  power           0.460
  camera          0.459
  changed         0.451
  com

In [20]:
print("\nSimilarity examples:")
pairs = [("money", "bank"), ("oil", "market"), ("war", "government")]
for a, b in pairs:
    if a in stoi and b in stoi:
        print(f"sim({a}, {b}) = {similarity(a,b):.3f}")


Similarity examples:
sim(money, bank) = 0.288
sim(oil, market) = 0.196
sim(war, government) = 0.249


# Word2Vec Skip-gram NEG (Negative Sampling)

In [21]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


# Hyperparameters

emb_dim = 100
window = 2
dynamic_window = True
batch_size = 4096
steps = 8000           # NEG usually needs more steps than full softmax
lr = 3e-3

sentences_per_step = 64
num_negatives = 10     # K negatives per positive


# Build negative sampling distribution (unigram^0.75)

# Count words in encoded_docs (ids)
id_counts = Counter([wid for sent in encoded_docs for wid in sent])

counts = np.zeros(vocab_size, dtype=np.float64)
for wid, c in id_counts.items():
    counts[wid] = c

# avoid issues if vocab[0] = <unk> has low count
counts = np.maximum(counts, 1.0)

unigram = counts ** 0.75
neg_probs = unigram / unigram.sum()

# We'll sample negatives with torch.multinomial using probs tensor on GPU
neg_probs_t = torch.tensor(neg_probs, dtype=torch.float32, device=device)


# Model: two embedding tables (input + output)

class SkipGramNEG(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, emb_dim)
        self.out_embed = nn.Embedding(vocab_size, emb_dim)

        nn.init.normal_(self.in_embed.weight, mean=0.0, std=0.01)
        nn.init.normal_(self.out_embed.weight, mean=0.0, std=0.01)

    def forward(self, center_ids, pos_ctx_ids, neg_ctx_ids):
        """
        center_ids: (B,)
        pos_ctx_ids: (B,)
        neg_ctx_ids: (B, K)
        """
        v = self.in_embed(center_ids)               # (B, D)
        u_pos = self.out_embed(pos_ctx_ids)         # (B, D)
        u_neg = self.out_embed(neg_ctx_ids)         # (B, K, D)

        # Positive score: v dot u_pos -> (B,)
        pos_score = torch.sum(v * u_pos, dim=1)

        # Negative scores: v dot u_neg -> (B, K)
        neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)

        # Loss:
        # maximize log(sigmoid(pos)) + sum log(sigmoid(-neg))
        loss_pos = F.logsigmoid(pos_score).mean()
        loss_neg = F.logsigmoid(-neg_score).mean()
        loss = -(loss_pos + loss_neg)
        return loss

model_neg = SkipGramNEG(vocab_size, emb_dim).to(device)
optimizer = torch.optim.AdamW(model_neg.parameters(), lr=lr)


# Batch sampler: positives + negatives

def sample_pos_batch(encoded_sents, batch_size, window=2, dynamic=True, sentences_per_step=64):
    chosen = random.sample(encoded_sents, k=min(sentences_per_step, len(encoded_sents)))
    gen = generate_skipgram_pairs(chosen, window=window, dynamic=dynamic)

    centers, pos_ctx = [], []
    while len(centers) < batch_size:
        try:
            c, ctx = next(gen)
            centers.append(c)
            pos_ctx.append(ctx)
        except StopIteration:
            chosen = random.sample(encoded_sents, k=min(sentences_per_step, len(encoded_sents)))
            gen = generate_skipgram_pairs(chosen, window=window, dynamic=dynamic)

    centers = torch.tensor(centers, dtype=torch.long, device=device)
    pos_ctx = torch.tensor(pos_ctx, dtype=torch.long, device=device)
    return centers, pos_ctx

def sample_negatives(batch_size, K):
    # returns (B, K)
    return torch.multinomial(neg_probs_t, num_samples=batch_size * K, replacement=True).view(batch_size, K)


# Train loop with timing

model_neg.train()

if device == "cuda":
    torch.cuda.synchronize()
start_time = time.perf_counter()

for step in range(1, steps + 1):
    center_ids, pos_ctx_ids = sample_pos_batch(
        encoded_docs,
        batch_size=batch_size,
        window=window,
        dynamic=dynamic_window,
        sentences_per_step=sentences_per_step
    )
    neg_ctx_ids = sample_negatives(batch_size, num_negatives)

    loss = model_neg(center_ids, pos_ctx_ids, neg_ctx_ids)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % 500 == 0:
        print(f"step {step}/{steps} | loss {loss.item():.4f}")

if device == "cuda":
    torch.cuda.synchronize()
train_time_sec = time.perf_counter() - start_time

print("Done training Skip-gram NEG.")
print(f"Training time: {train_time_sec:.2f} seconds")

# Save metrics for Task 2 table
skipgram_neg_metrics = {
    "model": "Word2Vec Skip-gram NEG",
    "window": window,
    "dynamic_window": dynamic_window,
    "embedding_dim": emb_dim,
    "batch_size": batch_size,
    "steps": steps,
    "num_negatives": num_negatives,
    "training_time_sec": train_time_sec,
    "final_loss": loss.item()
}

skipgram_neg_metrics


Using device: cuda
step 500/8000 | loss 1.0873
step 1000/8000 | loss 0.9989
step 1500/8000 | loss 0.9317
step 2000/8000 | loss 0.8890
step 2500/8000 | loss 0.8651
step 3000/8000 | loss 0.8544
step 3500/8000 | loss 0.8275
step 4000/8000 | loss 0.8436
step 4500/8000 | loss 0.8355
step 5000/8000 | loss 0.8460
step 5500/8000 | loss 0.8296
step 6000/8000 | loss 0.8076
step 6500/8000 | loss 0.8160
step 7000/8000 | loss 0.7875
step 7500/8000 | loss 0.8186
step 8000/8000 | loss 0.7966
Done training Skip-gram NEG.
Training time: 70.81 seconds


{'model': 'Word2Vec Skip-gram NEG',
 'window': 2,
 'dynamic_window': True,
 'embedding_dim': 100,
 'batch_size': 4096,
 'steps': 8000,
 'num_negatives': 10,
 'training_time_sec': 70.8091306,
 'final_loss': 0.7965717315673828}

# Build GloVe co-occurrence matrix

In [22]:
from collections import defaultdict
import math
import time
import torch


# GloVe co-occurrence builder

window = 2
dynamic_window = True

# You can cap vocab if you want faster GloVe (optional)
# max_vocab = 20000
# (for now we'll keep full vocab)

def build_cooccurrence(encoded_sents, window=2, dynamic=True):
    """
    Builds sparse co-occurrence counts X(i,j) in a dict.
    Uses distance weighting: 1/distance (common in GloVe setups).
    """
    X = defaultdict(float)

    for sent in encoded_sents:
        n = len(sent)
        for i, wi in enumerate(sent):
            w = random.randint(1, window) if dynamic else window
            left = max(0, i - w)
            right = min(n, i + w + 1)

            for j in range(left, right):
                if i == j:
                    continue
                wj = sent[j]
                dist = abs(i - j)
                X[(wi, wj)] += 1.0 / dist  # distance weighting

    return X


# Build co-occurrence + time it

start = time.perf_counter()
X = build_cooccurrence(encoded_docs, window=window, dynamic=dynamic_window)
cooc_time_sec = time.perf_counter() - start

print("Co-occurrence pairs:", len(X))
print(f"Build time: {cooc_time_sec:.2f} seconds")

# Show a few example entries
for k in list(X.keys())[:5]:
    i, j = k
    print(f"X({itos[i]}, {itos[j]}) = {X[k]:.3f}")


Co-occurrence pairs: 86927
Build time: 0.32 seconds
X(the, fulton) = 6.500
X(fulton, the) = 6.000
X(fulton, county) = 6.000
X(county, fulton) = 6.000
X(county, grand) = 1.000


# Train GloVe

In [23]:
import torch
import torch.nn as nn
import time
import math

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


# Convert sparse X dict -> tensors

# Each entry: (i, j, x_ij)
ijs = []
xijs = []
for (i, j), v in X.items():
    # Optional: drop very tiny counts (speeds up)
    if v >= 0.1:
        ijs.append((i, j))
        xijs.append(v)

ijs = torch.tensor(ijs, dtype=torch.long)
xijs = torch.tensor(xijs, dtype=torch.float32)

print("Training pairs after filtering:", len(xijs))

# Move to device
ijs = ijs.to(device)
xijs = xijs.to(device)


# GloVe hyperparameters

emb_dim = 100
epochs = 5              # raise later (e.g., 20-50) for better embeddings
lr = 0.05
batch_size = 4096

x_max = 100.0
alpha = 0.75


# GloVe model

class GloVe(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.wi = nn.Embedding(vocab_size, emb_dim)
        self.wj = nn.Embedding(vocab_size, emb_dim)
        self.bi = nn.Embedding(vocab_size, 1)
        self.bj = nn.Embedding(vocab_size, 1)

        nn.init.normal_(self.wi.weight, mean=0.0, std=0.01)
        nn.init.normal_(self.wj.weight, mean=0.0, std=0.01)
        nn.init.zeros_(self.bi.weight)
        nn.init.zeros_(self.bj.weight)

    def forward(self, i_ids, j_ids, x_ij):
        vi = self.wi(i_ids)                 # (B, D)
        vj = self.wj(j_ids)                 # (B, D)
        bi = self.bi(i_ids).squeeze(1)      # (B,)
        bj = self.bj(j_ids).squeeze(1)      # (B,)

        # weight function f(x)
        # f(x) = (x/x_max)^alpha if x < x_max else 1
        fx = torch.where(x_ij < x_max, (x_ij / x_max) ** alpha, torch.ones_like(x_ij))

        # GloVe loss: f(x) * (vi·vj + bi + bj - log(x))^2
        pred = (vi * vj).sum(dim=1) + bi + bj
        logx = torch.log(x_ij)
        loss = (fx * (pred - logx) ** 2).mean()
        return loss

glove = GloVe(vocab_size, emb_dim).to(device)
opt = torch.optim.AdamW(glove.parameters(), lr=lr)


# Training loop with timing

num_samples = xijs.size(0)
num_batches = math.ceil(num_samples / batch_size)

def batch_iter():
    # shuffle indices each epoch
    idx = torch.randperm(num_samples, device=device)
    for b in range(num_batches):
        batch_idx = idx[b * batch_size : (b + 1) * batch_size]
        ij = ijs[batch_idx]
        x = xijs[batch_idx]
        yield ij[:, 0], ij[:, 1], x

glove.train()

if device == "cuda":
    torch.cuda.synchronize()
start_time = time.perf_counter()

last_loss = None
for ep in range(1, epochs + 1):
    ep_loss = 0.0
    for i_ids, j_ids, x_ij in batch_iter():
        loss = glove(i_ids, j_ids, x_ij)

        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        ep_loss += loss.item()
        last_loss = loss.item()

    print(f"epoch {ep}/{epochs} | avg loss {ep_loss/num_batches:.4f}")

if device == "cuda":
    torch.cuda.synchronize()
train_time_sec = time.perf_counter() - start_time

print("Done training GloVe.")
print(f"Training time: {train_time_sec:.2f} seconds")

# Final embedding = wi + wj (common choice)
with torch.no_grad():
    glove_W = (glove.wi.weight + glove.wj.weight).detach().to("cpu")
    glove_W_norm = F.normalize(glove_W, p=2, dim=1)

# Save metrics for Task 2 table
glove_metrics = {
    "model": "GloVe",
    "window": window,
    "dynamic_window": dynamic_window,
    "embedding_dim": emb_dim,
    "batch_size": batch_size,
    "epochs": epochs,
    "cooc_build_time_sec": cooc_time_sec,
    "training_time_sec": train_time_sec,
    "final_loss": float(last_loss) if last_loss is not None else None,
    "num_cooc_pairs": int(len(X)),
    "num_train_pairs": int(num_samples),
}

glove_metrics


Using device: cuda
Training pairs after filtering: 86927
epoch 1/5 | avg loss 0.0864
epoch 2/5 | avg loss 0.0430
epoch 3/5 | avg loss 0.0354
epoch 4/5 | avg loss 0.0300
epoch 5/5 | avg loss 0.0259
Done training GloVe.
Training time: 0.92 seconds


{'model': 'GloVe',
 'window': 2,
 'dynamic_window': True,
 'embedding_dim': 100,
 'batch_size': 4096,
 'epochs': 5,
 'cooc_build_time_sec': 0.3200208000000089,
 'training_time_sec': 0.9200410000000261,
 'final_loss': 0.0455159991979599,
 'num_cooc_pairs': 86927,
 'num_train_pairs': 86927}

# Task 2

In [24]:
# Task 2 - Step 1: Build comparison table + export CSV

import pandas as pd

# Put all metrics dicts into a list (only include ones that exist)
rows = []
for d in ["skipgram_metrics", "skipgram_neg_metrics", "glove_metrics"]:
    if d in globals():
        rows.append(globals()[d])
    else:
        print(f"Warning: {d} not found (did you run that model cell?)")

df = pd.DataFrame(rows)

# Reorder columns nicely (keep only columns that exist)
preferred_cols = [
    "model",
    "window",
    "dynamic_window",
    "embedding_dim",
    "batch_size",
    "steps",
    "epochs",
    "num_negatives",
    "cooc_build_time_sec",
    "training_time_sec",
    "final_loss",
    "num_cooc_pairs",
    "num_train_pairs",
]
df = df[[c for c in preferred_cols if c in df.columns]]

# Add a total-time column (useful for comparing GloVe fairly)
if "cooc_build_time_sec" in df.columns:
    df["total_time_sec"] = df.get("cooc_build_time_sec", 0).fillna(0) + df.get("training_time_sec", 0).fillna(0)
else:
    df["total_time_sec"] = df.get("training_time_sec", 0).fillna(0)

# Make times easier to read
for c in ["cooc_build_time_sec", "training_time_sec", "total_time_sec"]:
    if c in df.columns:
        df[c] = df[c].astype(float).round(2)

if "final_loss" in df.columns:
    df["final_loss"] = df["final_loss"].astype(float).round(4)

# Show the table
df


Unnamed: 0,model,window,dynamic_window,embedding_dim,batch_size,steps,epochs,num_negatives,cooc_build_time_sec,training_time_sec,final_loss,num_cooc_pairs,num_train_pairs,total_time_sec
0,Word2Vec Skip-gram (full softmax),2,True,100,2048,3000.0,,,,8.65,4.645,,,8.65
1,Word2Vec Skip-gram NEG,2,True,100,4096,8000.0,,10.0,,70.81,0.7966,,,70.81
2,GloVe,2,True,100,4096,,5.0,,0.32,0.92,0.0455,86927.0,86927.0,1.24


In [25]:
# Save as CSV for your report / submission
df.to_csv("task2_metrics_comparison.csv", index=False)
print("Saved: task2_metrics_comparison.csv")


Saved: task2_metrics_comparison.csv


In [26]:
# Task 2 - Step 2A: Clean report-style Markdown table

report_cols = [
    "model",
    "window",
    "embedding_dim",
    "training_time_sec",
    "total_time_sec",
    "final_loss",
]

report_cols = [c for c in report_cols if c in df.columns]
report_df = df[report_cols].copy()

# Rename columns for readability
report_df = report_df.rename(columns={
    "model": "Model",
    "window": "Window",
    "embedding_dim": "Dim",
    "training_time_sec": "Train Time (s)",
    "total_time_sec": "Total Time (s)",
    "final_loss": "Final Loss",
})

report_df


Unnamed: 0,Model,Window,Dim,Train Time (s),Total Time (s),Final Loss
0,Word2Vec Skip-gram (full softmax),2,100,8.65,8.65,4.645
1,Word2Vec Skip-gram NEG,2,100,70.81,70.81,0.7966
2,GloVe,2,100,0.92,1.24,0.0455


In [27]:
# Task 2 - Step 2C: Automatic analysis helpers

analysis = {}

# Fastest training
analysis["fastest_model"] = df.loc[df["training_time_sec"].idxmin(), "model"]

# Best (lowest) loss
analysis["best_loss_model"] = df.loc[df["final_loss"].idxmin(), "model"]

# Most expensive total time
analysis["slowest_total_model"] = df.loc[df["total_time_sec"].idxmax(), "model"]

analysis


{'fastest_model': 'GloVe',
 'best_loss_model': 'GloVe',
 'slowest_total_model': 'Word2Vec Skip-gram NEG'}

# Word Analogy Accuracy (semantic + syntactic)

In [29]:
import urllib.request
import numpy as np
import torch
import torch.nn.functional as F


# Download Mikolov analogy dataset (word-test.v1.txt)

analogy_url = "http://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt"
analogy_path = "word-test.v1.txt"

urllib.request.urlretrieve(analogy_url, analogy_path)
print("Downloaded:", analogy_path)


# Helper: build an embedding accessor for YOUR models

def build_embed_lookup_from_torch(itos, stoi, weight_tensor_cpu):
    """
    weight_tensor_cpu: torch.Tensor on CPU, shape (V, D)
    returns: dict-like accessor with:
      - vectors (torch tensor normalized)
      - stoi/itos
    """
    W = weight_tensor_cpu.float()
    Wn = F.normalize(W, p=2, dim=1)
    return {"stoi": stoi, "itos": itos, "W": W, "Wn": Wn}

# Skipgram full-softmax embeddings
skipgram_lookup = build_embed_lookup_from_torch(
    itos, stoi,
    model.in_embed.weight.detach().cpu()
)

# Skipgram NEG embeddings
skipgram_neg_lookup = build_embed_lookup_from_torch(
    itos, stoi,
    model_neg.in_embed.weight.detach().cpu()
)

# GloVe embeddings (we created glove_W_norm earlier)
# glove_W is (V,D) on CPU from Step 7
glove_lookup = build_embed_lookup_from_torch(
    itos, stoi,
    glove_W  # from your GloVe training cell
)


# Load GloVe (gensim) baseline (optional but requested)

import gensim.downloader as api

print("Loading GloVe (gensim) glove-wiki-gigaword-100 ...")
glove_gensim = api.load("glove-wiki-gigaword-100")  # 100d, 400K vocab :contentReference[oaicite:3]{index=3}
print("Loaded gensim model vocab:", len(glove_gensim))


# Parse analogy file sections
# Only use:
#  - semantic: : capital-common-countries
#  - syntactic: : gram8-past-tense

def load_analogy_questions(path, wanted_sections):
    data = {sec: [] for sec in wanted_sections}
    current = None
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(":"):
                name = line[1:].strip()
                current = name if name in wanted_sections else None
                continue
            if current is None:
                continue
            parts = line.split()
            if len(parts) == 4:
                a, b, c, d = parts
                data[current].append((a.lower(), b.lower(), c.lower(), d.lower()))
    return data

wanted = ["capital-common-countries", "gram8-past-tense"]
analogy_data = load_analogy_questions(analogy_path, wanted)
for sec in wanted:
    print(sec, "questions:", len(analogy_data[sec]))


# Analogy evaluation:
# 3CosAdd: argmax cos( b - a + c, w )
# using normalized vectors
# report accuracy on questions where all words are in vocab

def analogy_accuracy_torch(lookup, questions, topk=1):
    stoi_local = lookup["stoi"]
    Wn = lookup["Wn"]  # (V,D), normalized

    total = 0
    correct = 0
    skipped = 0

    for a, b, c, d in questions:
        if (a not in stoi_local) or (b not in stoi_local) or (c not in stoi_local) or (d not in stoi_local):
            skipped += 1
            continue

        va = Wn[stoi_local[a]]
        vb = Wn[stoi_local[b]]
        vc = Wn[stoi_local[c]]

        query = F.normalize(vb - va + vc, p=2, dim=0)

        sims = torch.mv(Wn, query)  # (V,)
        # exclude input words from being predicted
        sims[stoi_local[a]] = -1e9
        sims[stoi_local[b]] = -1e9
        sims[stoi_local[c]] = -1e9

        pred = torch.argmax(sims).item()
        pred_word = lookup["itos"][pred]

        total += 1
        if pred_word == d:
            correct += 1

    acc = (correct / total) if total > 0 else 0.0
    return acc, total, skipped

def analogy_accuracy_gensim(model, questions):
    total = 0
    correct = 0
    skipped = 0
    for a, b, c, d in questions:
        if any(w not in model for w in [a, b, c, d]):
            skipped += 1
            continue
        # most_similar(positive=[b,c], negative=[a])
        pred_word = model.most_similar(positive=[b, c], negative=[a], topn=1)[0][0]
        total += 1
        if pred_word == d:
            correct += 1
    acc = (correct / total) if total > 0 else 0.0
    return acc, total, skipped


# Evaluate for the two required sections

results_analogy = []

# Semantic: capital-common-countries
semantic_q = analogy_data["capital-common-countries"]

# Syntactic: gram8-past-tense
syntactic_q = analogy_data["gram8-past-tense"]

for name, lookup in [
    ("Skipgram", skipgram_lookup),
    ("Skipgram (NEG)", skipgram_neg_lookup),
    ("GloVe", glove_lookup),
]:
    sem_acc, sem_total, sem_skipped = analogy_accuracy_torch(lookup, semantic_q)
    syn_acc, syn_total, syn_skipped = analogy_accuracy_torch(lookup, syntactic_q)

    results_analogy.append({
        "model": name,
        "semantic_acc": sem_acc,
        "semantic_total": sem_total,
        "semantic_skipped": sem_skipped,
        "syntactic_acc": syn_acc,
        "syntactic_total": syn_total,
        "syntactic_skipped": syn_skipped,
    })

# gensim GloVe baseline
sem_acc, sem_total, sem_skipped = analogy_accuracy_gensim(glove_gensim, semantic_q)
syn_acc, syn_total, syn_skipped = analogy_accuracy_gensim(glove_gensim, syntactic_q)

results_analogy.append({
    "model": "GloVe (gensim)",
    "semantic_acc": sem_acc,
    "semantic_total": sem_total,
    "semantic_skipped": sem_skipped,
    "syntactic_acc": syn_acc,
    "syntactic_total": syn_total,
    "syntactic_skipped": syn_skipped,
})

import pandas as pd
analogy_df = pd.DataFrame(results_analogy)

# Convert to % for display
analogy_df["semantic_acc_%"] = (analogy_df["semantic_acc"] * 100).round(2)
analogy_df["syntactic_acc_%"] = (analogy_df["syntactic_acc"] * 100).round(2)

analogy_df[["model", "syntactic_acc_%", "semantic_acc_%", "syntactic_total", "semantic_total", "syntactic_skipped", "semantic_skipped"]]


Downloaded: word-test.v1.txt
Loading GloVe (gensim) glove-wiki-gigaword-100 ...
Loaded gensim model vocab: 400000
capital-common-countries questions: 506
gram8-past-tense questions: 0


Unnamed: 0,model,syntactic_acc_%,semantic_acc_%,syntactic_total,semantic_total,syntactic_skipped,semantic_skipped
0,Skipgram,0.0,0.0,0,20,0,486
1,Skipgram (NEG),0.0,5.0,0,20,0,486
2,GloVe,0.0,0.0,0,20,0,486
3,GloVe (gensim),0.0,93.87,0,506,0,0


# WordSim353 similarity correlation (Spearman)

In [33]:
import pandas as pd
from gensim.test.utils import datapath

ws_path = datapath("wordsim353.tsv")
print("Using:", ws_path)

# Read as raw text lines (most robust)
with open(ws_path, "r", encoding="utf-8", errors="ignore") as f:
    lines = [ln.strip() for ln in f if ln.strip()]

rows = []
for ln in lines:
    # split on whitespace or tabs
    parts = ln.split()
    if len(parts) < 3:
        continue
    w1, w2, score = parts[0], parts[1], parts[2]
    # skip header-like rows
    if w1.lower() in {"word1", "word_1"} or score.lower() in {"score", "similarity"}:
        continue
    rows.append((w1.lower(), w2.lower(), score))

ws = pd.DataFrame(rows, columns=["Word 1", "Word 2", "Human (mean)"])
ws["Human (mean)"] = pd.to_numeric(ws["Human (mean)"], errors="coerce")
ws = ws.dropna()

print("Final WordSim shape:", ws.shape)
ws.head()


Using: c:\Users\Samir Pokharel\OneDrive\Desktop\Natural Language Processing\.venv\lib\site-packages\gensim\test\test_data\wordsim353.tsv
Final WordSim shape: (354, 3)


Unnamed: 0,Word 1,Word 2,Human (mean)
1,#,word,1.0
2,love,sex,6.77
3,tiger,cat,7.35
4,tiger,tiger,10.0
5,book,paper,7.46


In [34]:
import numpy as np
import torch


def similarity_scores_torch(lookup, ws_df):
    """
    lookup: dict with keys {stoi, Wn}
    ws_df: DataFrame with columns ['Word 1', 'Word 2', 'Human (mean)']
    """
    stoi_local = lookup["stoi"]
    Wn = lookup["Wn"]  # normalized embeddings

    sims = []
    gold = []
    skipped = 0

    for _, row in ws_df.iterrows():
        w1 = row["Word 1"]
        w2 = row["Word 2"]
        score = row["Human (mean)"]

        if w1 not in stoi_local or w2 not in stoi_local:
            skipped += 1
            continue

        v1 = Wn[stoi_local[w1]]
        v2 = Wn[stoi_local[w2]]

        sim = torch.dot(v1, v2).item()  # cosine similarity
        sims.append(sim)
        gold.append(score)

    return np.array(sims), np.array(gold), skipped





def similarity_scores_gensim(model, ws_df):
    sims = []
    gold = []
    skipped = 0

    for _, row in ws_df.iterrows():
        w1 = row["Word 1"]
        w2 = row["Word 2"]
        score = row["Human (mean)"]

        if w1 not in model or w2 not in model:
            skipped += 1
            continue

        sim = model.similarity(w1, w2)
        sims.append(sim)
        gold.append(score)

    return np.array(sims), np.array(gold), skipped


In [35]:
from scipy.stats import spearmanr
import pandas as pd

results_similarity = []

# Skipgram
sims, gold, skipped = similarity_scores_torch(skipgram_lookup, ws)
rho, _ = spearmanr(sims, gold)
mse = np.mean((sims - gold) ** 2)
results_similarity.append({"model": "Skipgram", "spearman": rho, "mse": mse, "skipped": skipped})

# Skipgram NEG
sims, gold, skipped = similarity_scores_torch(skipgram_neg_lookup, ws)
rho, _ = spearmanr(sims, gold)
mse = np.mean((sims - gold) ** 2)
results_similarity.append({"model": "Skipgram (NEG)", "spearman": rho, "mse": mse, "skipped": skipped})

# GloVe (your training)
sims, gold, skipped = similarity_scores_torch(glove_lookup, ws)
rho, _ = spearmanr(sims, gold)
mse = np.mean((sims - gold) ** 2)
results_similarity.append({"model": "GloVe", "spearman": rho, "mse": mse, "skipped": skipped})

# GloVe (gensim baseline)
sims, gold, skipped = similarity_scores_gensim(glove_gensim, ws)
rho, _ = spearmanr(sims, gold)
mse = np.mean((sims - gold) ** 2)
results_similarity.append({"model": "GloVe (gensim)", "spearman": rho, "mse": mse, "skipped": skipped})

sim_df = pd.DataFrame(results_similarity)
sim_df["spearman"] = sim_df["spearman"].round(3)
sim_df["mse"] = sim_df["mse"].round(3)

sim_df


Unnamed: 0,model,spearman,mse,skipped
0,Skipgram,0.277,33.791,259
1,Skipgram (NEG),0.042,33.911,259
2,GloVe,0.189,35.915,259
3,GloVe (gensim),0.536,33.298,0


In [36]:
import pandas as pd
import numpy as np


base = df.copy()


keep = ["model", "window", "training_time_sec", "final_loss"]
base = base[[c for c in keep if c in base.columns]].copy()


base = base.rename(columns={
    "model": "Model",
    "window": "Window Size",
    "training_time_sec": "Training time",
    "final_loss": "Training Loss",
})



ana = analogy_df.copy()
ana = ana[["model", "syntactic_acc_%", "semantic_acc_%"]].rename(columns={
    "model": "Model",
    "syntactic_acc_%": "Syntactic Accuracy",
    "semantic_acc_%": "Semantic accuracy",
})



sim = sim_df.copy()
sim = sim[["model", "spearman", "mse"]].rename(columns={
    "model": "Model",
    "spearman": "Spearman",
    "mse": "MSE",
})

final_table = base.merge(ana, on="Model", how="left").merge(sim, on="Model", how="left")


for col in ["Training Loss", "Training time", "Spearman", "MSE"]:
    if col in final_table.columns:
        final_table[col] = pd.to_numeric(final_table[col], errors="coerce")

final_table["Training Loss"] = final_table["Training Loss"].round(4)
final_table["Training time"] = final_table["Training time"].round(2)
final_table["Spearman"] = final_table["Spearman"].round(3)
final_table["MSE"] = final_table["MSE"].round(3)


final_table["Syntactic Accuracy"] = final_table["Syntactic Accuracy"].round(2)
final_table["Semantic accuracy"] = final_table["Semantic accuracy"].round(2)


final_cols = [
    "Model",
    "Window Size",
    "Training Loss",
    "Training time",
    "Syntactic Accuracy",
    "Semantic accuracy",
    "Spearman",
    "MSE"
]
final_table = final_table[[c for c in final_cols if c in final_table.columns]]

final_table


Unnamed: 0,Model,Window Size,Training Loss,Training time,Syntactic Accuracy,Semantic accuracy,Spearman,MSE
0,Word2Vec Skip-gram (full softmax),2,4.645,8.65,,,,
1,Word2Vec Skip-gram NEG,2,0.7966,70.81,,,,
2,GloVe,2,0.0455,0.92,0.0,0.0,0.189,35.915


In [37]:
final_table.to_csv("task2_final_table.csv", index=False)
print("Saved: task2_final_table.csv")

Saved: task2_final_table.csv


In [38]:
# Task 3 - Step 1: Build searchable contexts (sentences)

import re
import numpy as np
import torch
import nltk
from nltk.corpus import brown

nltk.download("brown")

# Use Brown news sentences (already tokenized)
raw_sents = brown.sents(categories="news")

def clean_token(tok: str):
    tok = tok.lower()
    tok = re.sub(r"[^a-z']+", "", tok)
    return tok

# Clean sentences and also keep original text for display
contexts_tokens = []
contexts_text = []

for sent in raw_sents:
    cleaned = [clean_token(t) for t in sent]
    cleaned = [t for t in cleaned if t]
    if len(cleaned) >= 5:  # keep non-trivial sentences
        contexts_tokens.append(cleaned)
        contexts_text.append(" ".join(cleaned))

print("Contexts:", len(contexts_text))
print("Example:", contexts_text[0])


[nltk_data] Downloading package brown to C:\Users\Samir
[nltk_data]     Pokharel\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Contexts: 4211
Example: the fulton county grand jury said friday an investigation of atlanta's recent primary election produced no evidence '' that any irregularities took place


In [39]:
# Task 3 - Step 2: Choose embedding + precompute sentence vectors

import numpy as np
import torch
import torch.nn.functional as F

# ---- Choose which embeddings to use ----
# Options: "skipgram", "skipgram_neg", "glove"
EMBED_MODEL = "skipgram_neg"

# Get embedding matrix from your trained models
if EMBED_MODEL == "skipgram":
    W = model.in_embed.weight.detach().cpu()
elif EMBED_MODEL == "skipgram_neg":
    W = model_neg.in_embed.weight.detach().cpu()
elif EMBED_MODEL == "glove":
    W = glove_W.detach().cpu()  # from your GloVe training
else:
    raise ValueError("Invalid EMBED_MODEL")

# Normalize word embeddings so dot-product == cosine similarity
Wn = F.normalize(W.float(), p=2, dim=1)

# ---- Helpers ----
UNK_ID = stoi.get("<unk>", 0)

def sent_to_vec(tokens):
    ids = [stoi.get(w, UNK_ID) for w in tokens]
    vecs = Wn[ids]  # (L, D)
    v = vecs.mean(dim=0)
    v = F.normalize(v, p=2, dim=0)
    return v

# ---- Precompute context vectors (CPU tensor) ----
# (this makes search fast)
context_vecs = torch.stack([sent_to_vec(toks) for toks in contexts_tokens])  # (N, D)

print("context_vecs shape:", context_vecs.shape)
print("Using model:", EMBED_MODEL)


context_vecs shape: torch.Size([4211, 100])
Using model: skipgram_neg


In [40]:
# Task 3 - Step 3: Dot-product search (Top 10)

import re
import torch
import torch.nn.functional as F

def tokenize_query(q: str):
    q = q.lower()
    return re.findall(r"[a-z]+(?:'[a-z]+)?", q)

def query_to_vec(query: str):
    toks = tokenize_query(query)
    if len(toks) == 0:
        return None
    ids = [stoi.get(w, UNK_ID) for w in toks]
    v = Wn[ids].mean(dim=0)
    v = F.normalize(v, p=2, dim=0)
    return v

def search_topk(query: str, k=10):
    qv = query_to_vec(query)
    if qv is None:
        return []

    # Dot product (cosine because normalized)
    scores = torch.mv(context_vecs, qv)  # (N,)
    vals, idxs = torch.topk(scores, k=k)

    results = []
    for score, idx in zip(vals.tolist(), idxs.tolist()):
        results.append({
            "score": float(score),
            "text": contexts_text[idx]
        })
    return results

# ---- Quick test ----
test_query = "oil prices"
hits = search_topk(test_query, k=10)

print("Query:", test_query)
for i, h in enumerate(hits, 1):
    print(f"{i:02d}. ({h['score']:.3f}) {h['text']}")


Query: oil prices
01. (0.591) union oil co of california tuesday offered million in debentures to the public through a group of underwriters headed by dillon read co to raise money to retire a similar amount held by gulf oil corp
02. (0.582) they provided a social security system which covered all their african employes
03. (0.574) the company began operation in with hardware and oil mill supplies
04. (0.574) american stock exchange prices enjoyed a fairly solid rise but here also trading dwindled
05. (0.571) higher toll rates also are helping boost revenues
06. (0.566) the higher price supports provided by the new legislation together with rising prices for farm products are pushing up farm income making it possible for farmers to afford the new machinery
07. (0.562) the british coal industry is unprofitable has large coal stocks it can't sell
08. (0.562) within a year without reducing wages underwood's production costs were cut one third prices were slashed
09. (0.553) a large wellst

In [41]:
import torch
import torch.nn.functional as F

def build_Wn_from_weight(weight_cpu):
    return F.normalize(weight_cpu.float(), p=2, dim=1)

def sent_to_vec_with_Wn(tokens, Wn):
    ids = [stoi.get(w, UNK_ID) for w in tokens]
    v = Wn[ids].mean(dim=0)
    return F.normalize(v, p=2, dim=0)

# Build normalized word vectors for each model
Wn_skipgram = build_Wn_from_weight(model.in_embed.weight.detach().cpu())
Wn_neg      = build_Wn_from_weight(model_neg.in_embed.weight.detach().cpu())
Wn_glove    = build_Wn_from_weight(glove_W.detach().cpu())

# Precompute context vectors for each model
context_vecs_skipgram = torch.stack([sent_to_vec_with_Wn(toks, Wn_skipgram) for toks in contexts_tokens])
context_vecs_neg      = torch.stack([sent_to_vec_with_Wn(toks, Wn_neg)      for toks in contexts_tokens])
context_vecs_glove    = torch.stack([sent_to_vec_with_Wn(toks, Wn_glove)    for toks in contexts_tokens])

torch.save({
    "stoi": stoi,
    "UNK_ID": UNK_ID,
    "contexts_text": contexts_text,

    "models": {
        "skipgram": {"Wn": Wn_skipgram, "context_vecs": context_vecs_skipgram},
        "neg":      {"Wn": Wn_neg,      "context_vecs": context_vecs_neg},
        "glove":    {"Wn": Wn_glove,    "context_vecs": context_vecs_glove},
    }
}, "search_data.pt")

print("Saved search_data.pt with models: skipgram, neg, glove")
print("Context vecs shape:", context_vecs_skipgram.shape)


Saved search_data.pt with models: skipgram, neg, glove
Context vecs shape: torch.Size([4211, 100])


In [42]:
%%writefile app.py
from flask import Flask, request, render_template_string
import re
import torch
import torch.nn.functional as F

DATA_PATH = "search_data.pt"
app = Flask(__name__)

TEMPLATE = """
<!doctype html>
<html>
  <head>
    <title>NLP Search Engine</title>
    <style>
      body { font-family: Arial, sans-serif; margin: 40px; }
      input[type=text] { width: 420px; padding: 10px; font-size: 16px; }
      select { padding: 10px; font-size: 16px; margin-left: 8px; }
      button { padding: 10px 16px; font-size: 16px; margin-left: 8px; }
      .result { margin-top: 14px; padding: 10px; border: 1px solid #ddd; border-radius: 8px; }
      .meta { color: #555; font-size: 14px; margin-top: 6px; }
      .score { color: #555; font-size: 14px; }
    </style>
  </head>
  <body>
    <h2>Search Similar Context (Dot Product)</h2>

    <form method="GET" action="/">
      <input type="text" name="q" placeholder="Type your query..." value="{{q|default('')}}" />

      <select name="m">
        {% for key, label in model_options %}
          <option value="{{key}}" {% if key == model_selected %}selected{% endif %}>{{label}}</option>
        {% endfor %}
      </select>

      <button type="submit">Search</button>
    </form>

    {% if results is not none %}
      <div class="meta">
        Model: <b>{{ model_selected }}</b>
      </div>

      <h3>Top 10 Results</h3>
      {% if results|length == 0 %}
        <p>No results (empty query).</p>
      {% endif %}

      {% for r in results %}
        <div class="result">
          <div class="score">Score: {{ "%.3f"|format(r.score) }}</div>
          <div>{{ r.text }}</div>
        </div>
      {% endfor %}
    {% endif %}
  </body>
</html>
"""

def tokenize_query(q: str):
    q = q.lower()
    return re.findall(r"[a-z]+(?:'[a-z]+)?", q)

DATA = torch.load(DATA_PATH, map_location="cpu")
stoi = DATA["stoi"]
UNK_ID = DATA["UNK_ID"]
contexts_text = DATA["contexts_text"]
MODEL_STORE = DATA["models"]  # dict: skipgram/neg/glove

MODEL_OPTIONS = [
    ("skipgram", "Skipgram"),
    ("neg", "Skipgram (NEG)"),
    ("glove", "GloVe"),
]

def query_to_vec(query: str, Wn):
    toks = tokenize_query(query)
    if len(toks) == 0:
        return None
    ids = [stoi.get(w, UNK_ID) for w in toks]
    v = Wn[ids].mean(dim=0)
    v = F.normalize(v, p=2, dim=0)
    return v

def search_topk(query: str, model_key: str, k=10):
    model_key = model_key if model_key in MODEL_STORE else "neg"
    Wn = MODEL_STORE[model_key]["Wn"]
    context_vecs = MODEL_STORE[model_key]["context_vecs"]

    qv = query_to_vec(query, Wn)
    if qv is None:
        return []

    scores = torch.mv(context_vecs, qv)
    vals, idxs = torch.topk(scores, k=min(k, scores.numel()))

    out = []
    for s, idx in zip(vals.tolist(), idxs.tolist()):
        out.append(type("R", (), {"score": float(s), "text": contexts_text[idx]}))
    return out

@app.route("/", methods=["GET"])
def index():
    q = request.args.get("q", "").strip()
    m = request.args.get("m", "neg").strip()

    results = None
    if q != "":
        results = search_topk(q, model_key=m, k=10)
    elif "q" in request.args:
        results = []

    return render_template_string(
        TEMPLATE,
        q=q,
        results=results,
        model_selected=m if m in MODEL_STORE else "neg",
        model_options=MODEL_OPTIONS
    )

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=False)


Writing app.py


# Observation

As observed, for a window size of 2, the Word2Vec Skip-gram (full softmax) model recorded the highest training loss (4.6450), indicating difficulty in learning meaningful representations using full softmax on a relatively small corpus. In contrast, Skip-gram with Negative Sampling achieved a significantly lower loss of 0.7966, while GloVe obtained the lowest training loss (0.0455), showing faster and more stable convergence.

In terms of training time, Skip-gram with Negative Sampling was the slowest model (70.81s) due to repeated negative sampling operations. Skip-gram (full softmax) required 8.65s, while GloVe was the fastest, completing training in 0.92s, even when accounting for co-occurrence matrix construction (total time ≈ 1.24s).

For word analogy evaluation, all three models trained from scratch (Skip-gram, Skip-gram NEG, and GloVe) achieved 0% syntactic and semantic accuracy. This result was expected and can be attributed to the limited size of the Brown news corpus, small window size, and relatively low embedding dimension. These limitations prevent the models from learning robust relational patterns required for analogy tasks.

In contrast, pre-trained GloVe (Gensim) significantly outperformed the custom-trained models. For word similarity evaluation (WordSim353), GloVe (Gensim) achieved the highest Spearman correlation (0.536) with no skipped word pairs, indicating strong alignment with human judgment. The scratch-trained models showed much lower correlations (Skip-gram: 0.277, Skip-gram NEG: 0.042, GloVe: 0.189) and skipped many word pairs due to vocabulary mismatch, suggesting weak semantic consistency.

Overall, these results demonstrate that embedding models trained from scratch on small corpora perform poorly when evaluated on intrinsic benchmarks. However, with larger datasets, higher embedding dimensions, better hyperparameter tuning, and longer training, the performance of these models can be significantly improved. Pre-trained embeddings remain a strong baseline for semantic tasks under limited data conditions.