<a href="https://colab.research.google.com/github/ParvG2005/Parv/blob/main/glove_trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import re
from collections import Counter, defaultdict
from tqdm import tqdm, trange

# -----------------------------
# 1. Load & preprocess dataset
# -----------------------------
def load_tokens(filename, min_count=3):
    """Read a UTF-8 text file and return its lowercased word tokens.

    The whole file is read at once, lowercased, and split into runs of word
    characters. Every token that occurs fewer than ``min_count`` times *in this
    file* is replaced by the placeholder ``"<UNK>"``.

    Args:
        filename: Path of the text file to read.
        min_count: Minimum number of occurrences a token needs to be kept.

    Returns:
        The list of tokens in file order, with rare tokens replaced.
    """
    with open(filename, encoding="utf-8") as handle:
        lowered = handle.read().lower()

    words = re.findall(r"\b\w+\b", lowered)
    counts = Counter(words)

    # Replace rare words with <UNK>
    kept = []
    for word in words:
        if counts[word] < min_count:
            kept.append("<UNK>")
        else:
            kept.append(word)
    return kept

# NOTE: each call applies the rare-word filter using that file's own counts, so a
# word can become "<UNK>" in the validation/test tokens even if it is in the
# training vocabulary.
train_tokens = load_tokens("wiki_movie_plots_deduped.csv")
valid_tokens = load_tokens("english_15000.txt")
test_tokens  = load_tokens("english_test.txt")

# Vocabulary
# Sorted so that word ids are reproducible from run to run: iterating a set of
# strings follows hash order, which Python randomises per process.
# "<UNK>" is always included because perplexity() falls back to word2id["<UNK>"]
# for tokens that are not in the training vocabulary.
vocab = sorted(set(train_tokens) | {"<UNK>"})
word2id = {w: i for i, w in enumerate(vocab)}
id2word = dict(enumerate(vocab))
V = len(vocab)
print("started training")
# -----------------------------
# 2. Build co-occurrence matrix
# -----------------------------
def build_cooccurrence(tokens, window_size=5):
    """Accumulate distance-weighted co-occurrence counts for a token sequence.

    For every position, each other token at most ``window_size`` positions away
    (on either side) contributes ``1 / distance`` to the entry keyed by
    ``(center_id, context_id)``. Ids come from the module-level ``word2id``;
    tokens missing from it are skipped, both as centre and as context.

    Args:
        tokens: Sequence of string tokens.
        window_size: Maximum distance between a centre word and its context.

    Returns:
        A ``defaultdict(float)`` mapping ``(center_id, context_id)`` to the
        accumulated weight.
    """
    weights = defaultdict(float)
    n_tokens = len(tokens)

    for center_pos, center in enumerate(tokens):
        if center not in word2id:   # safety
            continue
        center_id = word2id[center]

        lo = max(0, center_pos - window_size)
        hi = min(n_tokens, center_pos + window_size + 1)
        for ctx_pos in range(lo, hi):
            if ctx_pos == center_pos:
                continue
            neighbour = tokens[ctx_pos]
            if neighbour not in word2id:
                continue
            # Closer neighbours count more: weight is the inverse distance.
            weights[(center_id, word2id[neighbour])] += 1.0 / abs(ctx_pos - center_pos)

    return weights

# Co-occurrence weights for the training corpus, using a window of 5 tokens on each side.
cooc = build_cooccurrence(tokens=train_tokens, window_size=5)
print("started training")
# -----------------------------
# 3. GloVe model training
# -----------------------------
def glove_train_optimized(cooc, vector_size=50, iterations=50, x_max=100, alpha=0.75, lr=0.05, batch_size=2048, vocab_size=None):
    """Fit GloVe-style embeddings to a sparse co-occurrence table with mini-batch AdaGrad.

    Each mini-batch evaluates the weighted squared error
    ``0.5 * f(x) * (w_i . w~_j + b_i + b~_j - log x)^2`` for its pairs, with
    ``f(x) = (x / x_max) ** alpha`` for ``x < x_max`` and ``1`` otherwise, and
    applies one AdaGrad step per pair. All steps in a batch are computed from
    the parameters as they were at the start of the batch.

    Args:
        cooc: Mapping ``(i, j) -> weight`` of word id ``i`` with context id ``j``.
            Weights go through ``np.log``, so they should be strictly positive.
        vector_size: Dimensionality of the embeddings.
        iterations: Number of passes over all pairs.
        x_max: Cut-off of the weighting function.
        alpha: Exponent of the weighting function.
        lr: AdaGrad base learning rate.
        batch_size: Number of pairs per mini-batch.
        vocab_size: Number of rows in the parameter matrices. When ``None``
            (the default) the module-level ``V`` is used.

    Returns:
        Array of shape ``(vocab_size, vector_size)``: the sum of the word and
        context matrices.

    Raises:
        ValueError: If ``cooc`` is empty.
    """
    if not cooc:
        raise ValueError("cooc must contain at least one co-occurrence pair")

    n_words = V if vocab_size is None else vocab_size

    i_ids, j_ids, Xij = zip(*[(i, j, x) for (i, j), x in cooc.items()])
    i_ids, j_ids, Xij = np.array(i_ids), np.array(j_ids), np.array(Xij, dtype=np.float32)
    logX = np.log(Xij)

    # Initialize parameters
    W = np.random.randn(n_words, vector_size) / np.sqrt(vector_size)
    W_tilde = np.random.randn(n_words, vector_size) / np.sqrt(vector_size)
    b = np.zeros(n_words)
    b_tilde = np.zeros(n_words)

    # AdaGrad accumulators (start at 1 so the first step is exactly lr * grad)
    grad_sq_W = np.ones_like(W)
    grad_sq_Wt = np.ones_like(W_tilde)
    grad_sq_b = np.ones_like(b)
    grad_sq_bt = np.ones_like(b_tilde)

    n_pairs = len(Xij)

    for it in range(iterations):
        perm = np.random.permutation(n_pairs)
        total_loss = 0.0

        for start in range(0, n_pairs, batch_size):
            end = min(start + batch_size, n_pairs)
            idx = perm[start:end]

            i, j, x, lx = i_ids[idx], j_ids[idx], Xij[idx], logX[idx]

            # f(x)
            f = np.where(x < x_max, (x / x_max) ** alpha, 1.0)

            # Predictions
            dot = np.sum(W[i] * W_tilde[j], axis=1) + b[i] + b_tilde[j]
            cost = dot - lx
            total_loss += np.sum(0.5 * f * cost**2)

            # Gradients
            grad_common = f * cost
            grad_Wi = grad_common[:, None] * W_tilde[j]
            grad_Wj = grad_common[:, None] * W[i]
            grad_bi = grad_common
            grad_bj = grad_common

            # AdaGrad steps, scaled by the accumulators as they were before this batch.
            step_Wi = (lr / np.sqrt(grad_sq_W[i])) * grad_Wi
            step_Wj = (lr / np.sqrt(grad_sq_Wt[j])) * grad_Wj
            step_bi = (lr / np.sqrt(grad_sq_b[i])) * grad_bi
            step_bj = (lr / np.sqrt(grad_sq_bt[j])) * grad_bj

            # A word id usually occurs several times in one batch. `W[i] -= step`
            # is buffered and would keep only one of those updates per id; the
            # unbuffered ufunc.at form applies every one of them.
            np.subtract.at(W, i, step_Wi)
            np.subtract.at(W_tilde, j, step_Wj)
            np.subtract.at(b, i, step_bi)
            np.subtract.at(b_tilde, j, step_bj)

            # Accumulate squared gradients (also unbuffered, for the same reason)
            np.add.at(grad_sq_W, i, grad_Wi**2)
            np.add.at(grad_sq_Wt, j, grad_Wj**2)
            np.add.at(grad_sq_b, i, grad_bi**2)
            np.add.at(grad_sq_bt, j, grad_bj**2)

        print(f"Iteration {it+1}, Loss={total_loss/len(Xij):.4f}")

    return W + W_tilde
print("started training")
# Fit 50-dimensional embeddings with 10 passes over the co-occurrence pairs
embeddings = glove_train_optimized(cooc=cooc, iterations=10, vector_size=50)

# -----------------------------
# 4. Perplexity evaluation
# -----------------------------
def perplexity(tokens, embeddings, word2id, window_size=2):
    """Compute the perplexity of a token sequence under an embedding-based predictor.

    Each token from position ``window_size`` onwards is predicted from the mean
    embedding of the ``window_size`` tokens before it: the scores are the dot
    products of that mean with every embedding row, turned into probabilities
    by a softmax.

    Args:
        tokens: Sequence of string tokens.
        embeddings: Array-like of shape ``(vocab, dim)``, indexed by word id.
        word2id: Mapping from token to row index. Tokens missing from it are
            mapped to the ``"<UNK>"`` entry when one exists.
        window_size: Number of preceding tokens used as context (at least 1).

    Returns:
        ``exp`` of the negative mean log-probability of the predicted tokens.

    Raises:
        ValueError: If ``window_size < 1`` or there are no tokens to predict
            (``len(tokens) <= window_size``).
        KeyError: If a token is out of vocabulary and ``word2id`` has no
            ``"<UNK>"`` entry.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    n_tokens = len(tokens)
    count = n_tokens - window_size
    if count <= 0:
        raise ValueError(
            f"need more than window_size={window_size} tokens to compute perplexity, got {n_tokens}"
        )

    embeddings = np.asarray(embeddings)

    # Resolve every token to an id once. "<UNK>" is only required when an
    # out-of-vocabulary token actually occurs.
    unk_id = word2id.get("<UNK>")
    token_ids = []
    for tok in tokens:
        tok_id = word2id.get(tok, unk_id)
        if tok_id is None:
            raise KeyError(f"token {tok!r} is out of vocabulary and word2id has no '<UNK>' entry")
        token_ids.append(tok_id)
    ids = np.asarray(token_ids, dtype=np.intp)

    log_prob = 0.0
    for pos in range(window_size, n_tokens):
        context_vec = embeddings[ids[pos - window_size:pos]].mean(axis=0)

        # Stable softmax: shifting by the max leaves the probabilities unchanged
        scores = embeddings @ context_vec
        scores = scores - np.max(scores)
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores)

        # The small epsilon keeps log() finite if a probability underflows to 0
        log_prob += np.log(probs[ids[pos]] + 1e-10)

    return np.exp(-log_prob / count)
print("started training")
# Score the held-out splits with the trained embeddings
val_perp, test_perp = (
    perplexity(split_tokens, embeddings, word2id)
    for split_tokens in (valid_tokens, test_tokens)
)

for label, score in (("Validation Perplexity:", val_perp), ("Test Perplexity:", test_perp)):
    print(label, score)


started training
started training
started training
Iteration 1, Loss=0.0386
Iteration 2, Loss=0.0291
Iteration 3, Loss=0.0225
Iteration 4, Loss=0.0177
Iteration 5, Loss=0.0147
Iteration 6, Loss=0.0130
Iteration 7, Loss=0.0120
Iteration 8, Loss=0.0113
Iteration 9, Loss=0.0107
Iteration 10, Loss=0.0103
started training
Validation Perplexity: 6948.649944028533
Test Perplexity: 5756.309410452827
