In [None]:
import numpy as np
from collections import Counter, defaultdict
import re
import random
import wandb
import torch
import pandas as pd
from scipy.stats import spearmanr
import torch.nn.functional as F
import os
import torch.nn as nn
import torch.optim as optim

In [None]:
# Install/upgrade the Kaggle command-line client used by the download below.
!pip install kaggle --upgrade
# Kaggle API credentials, exposed through environment variables.
# The values here are redacted placeholders and must be replaced to run the cell.
# NOTE(review): credentials should not be committed inside the notebook; prefer
# setting these variables outside of it (shell, secrets manager, getpass).
os.environ['KAGGLE_USERNAME'] = "XXXXX"
os.environ['KAGGLE_KEY'] = "XXXXXXXXXXXXXX"
# Fetch the WordSim-353 (crowd) dataset archive and unpack it into the working directory.
!kaggle datasets download julianschelb/wordsim353-crowd
!unzip wordsim353-crowd.zip

In [None]:
class CFG:
    """Static run configuration: data sampling, seeding and GloVe hyperparameters.

    Only plain (non-dunder, non-callable) class attributes are meant to live
    here, so the whole class can be dumped into a flat config dict.
    """

    # --- data / reproducibility ---
    num_captions = 1      # captions sampled per id when the corpus is loaded
    seed = 42             # seed handed to set_seed

    # --- optimisation ---
    window_size = 2
    lr = 1e-4
    train_epochs = 1000

    # --- model ---
    emb_size = 512        # embedding dimensionality

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs with ``seed``.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed every RNG with the configured seed before any sampling or model init happens below.
set_seed(CFG.seed)

In [None]:
def load_captions(file_path, n):
    """Load captions grouped by id and return a random, tokenized subset per id.

    Every line of the UTF-8 file is expected to look like ``"<id> <caption>"``;
    lines without a space after stripping are ignored.

    Args:
        file_path: Path of the annotation text file.
        n: Maximum number of captions to draw (via ``random.sample``) per id.

    Returns:
        A flat list of captions, each one a list of whitespace-separated
        tokens. Ids appear in first-seen order.
    """
    grouped = defaultdict(list)
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            key, separator, text = raw_line.strip().partition(' ')
            if not separator:
                # No "<id> <caption>" structure on this line.
                continue
            grouped[key].append(text)

    tokenized_captions = []
    for texts in grouped.values():
        how_many = min(n, len(texts))
        tokenized_captions += [text.split() for text in random.sample(texts, how_many)]
    return tokenized_captions

In [None]:
# Build the training corpus: up to CFG.num_captions tokenized captions per id from annotations.txt.
corpus = load_captions("annotations.txt", CFG.num_captions)

In [None]:
# Number of tokenized captions that ended up in the corpus.
len(corpus)

In [None]:
def evaluate_benchmarks(model, word2idx):
    """Score the model's word vectors against two human-similarity benchmarks.

    Reads ``simverb-3500.csv`` (columns ``word1``, ``word2``, ``similarity``)
    and ``wordsim353crowd.csv`` (columns ``Word 1``, ``Word 2``,
    ``Human (Mean)``) from the working directory. For every pair whose two
    words are both in ``word2idx``, the cosine similarity of the summed
    ``wi + wj`` embeddings is compared with the human score; pairs with an
    out-of-vocabulary word are skipped.

    Each benchmark is scored on its own pairs only.

    Args:
        model: Module exposing ``wi`` and ``wj`` embedding tables.
        word2idx: Mapping from word to embedding row index.

    Returns:
        dict with keys ``"simverb"`` and ``"wordsim"`` holding the Spearman
        correlation for each benchmark, or NaN when fewer than two pairs of a
        benchmark are covered by the vocabulary.

    Note:
        The model is switched to eval mode and left that way; callers that
        continue training must call ``model.train()`` again.
    """
    # Look words up on the device that holds the embedding tables, so the
    # evaluation also works when the model lives on CPU.
    device = model.wi.weight.device

    def get_embedding(word):
        if word not in word2idx:
            raise KeyError(f"word '{word}' not in vocab")
        idx = torch.tensor([word2idx[word]], device=device)
        with torch.no_grad():
            emb = model.wi(idx) + model.wj(idx)
        return emb.squeeze(0)

    def cosine_similarity(emb1, emb2):
        return F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()

    def benchmark_correlation(frame, word1_col, word2_col, score_col):
        # Fresh score lists per benchmark: results must not leak between datasets.
        human_scores = []
        model_scores = []
        for w1, w2, human_sim in zip(frame[word1_col], frame[word2_col], frame[score_col]):
            if w1 not in word2idx or w2 not in word2idx:
                continue
            human_scores.append(human_sim)
            model_scores.append(cosine_similarity(get_embedding(w1), get_embedding(w2)))

        # A rank correlation needs at least two observations.
        if len(human_scores) < 2:
            return float("nan")
        corr, _ = spearmanr(human_scores, model_scores)
        return float(corr)

    simverb_df = pd.read_csv("simverb-3500.csv")
    wordsim_df = pd.read_csv("wordsim353crowd.csv")
    model.eval()

    return {
        "simverb": benchmark_correlation(simverb_df, 'word1', 'word2', 'similarity'),
        "wordsim": benchmark_correlation(wordsim_df, 'Word 1', 'Word 2', 'Human (Mean)'),
    }

In [None]:
def build_vocab(corpus, min_freq=1):
    """Create alphabetical word/index lookups from a tokenized corpus.

    Args:
        corpus: Iterable of sentences, each a list of tokens.
        min_freq: Tokens seen fewer than this many times are dropped.

    Returns:
        ``(word2idx, idx2word)`` where indices follow the sorted token order.
    """
    frequencies = Counter(token for sentence in corpus for token in sentence)
    kept_tokens = sorted(tok for tok, freq in frequencies.items() if freq >= min_freq)

    word2idx = {}
    idx2word = {}
    for position, tok in enumerate(kept_tokens):
        word2idx[tok] = position
        idx2word[position] = tok
    return word2idx, idx2word

def build_cooccurrence(corpus, word2idx, window_size=5):
    """Accumulate distance-weighted co-occurrence counts within a symmetric window.

    Out-of-vocabulary tokens are removed from a sentence before windows are
    taken. Each (center, context) pair contributes ``1 / distance``.

    Args:
        corpus: Iterable of tokenized sentences.
        word2idx: Mapping from token to integer id.
        window_size: Number of positions considered on each side of the center.

    Returns:
        ``defaultdict(float)`` keyed by ``(center_id, context_id)``.
    """
    table = defaultdict(float)
    for tokens in corpus:
        ids = [word2idx[tok] for tok in tokens if tok in word2idx]
        sentence_len = len(ids)
        for pos, center_id in enumerate(ids):
            lo = max(0, pos - window_size)
            hi = min(sentence_len, pos + window_size + 1)
            # Left neighbours first, then right neighbours, skipping the center itself.
            for ctx_pos in [*range(lo, pos), *range(pos + 1, hi)]:
                table[(center_id, ids[ctx_pos])] += 1.0 / abs(pos - ctx_pos)
    return table

def weighting_func(x, x_max=100, alpha=0.75):
    """Elementwise weight ``(x / x_max) ** alpha`` below ``x_max``, and 1 otherwise.

    Args:
        x: Tensor of co-occurrence counts.
        x_max: Count at which the weight saturates.
        alpha: Exponent applied to the scaled count.

    Returns:
        Tensor of weights with the same shape as ``x``.
    """
    below_cap = x < x_max
    scaled = (x / x_max) ** alpha
    saturated = torch.ones_like(x)
    return torch.where(below_cap, scaled, saturated)

class GloVe(nn.Module):
    """GloVe scorer: two embedding tables plus a scalar bias per word in each role.

    ``wi``/``bi`` are indexed by the first index tensor and ``wj``/``bj`` by
    the second one.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the four tables; vectors get Xavier-uniform init, biases start at zero."""
        super().__init__()
        self.wi = nn.Embedding(vocab_size, embedding_dim)
        self.wj = nn.Embedding(vocab_size, embedding_dim)
        self.bi = nn.Embedding(vocab_size, 1)
        self.bj = nn.Embedding(vocab_size, 1)

        for vectors in (self.wi, self.wj):
            nn.init.xavier_uniform_(vectors.weight)
        for biases in (self.bi, self.bj):
            nn.init.zeros_(biases.weight)

    def forward(self, i_idx, j_idx):
        """Return ``w_i . w_j + b_i + b_j`` for each index pair.

        Args:
            i_idx: 1-D tensor of row indices into ``wi``/``bi``.
            j_idx: 1-D tensor of row indices into ``wj``/``bj``.
        """
        dot_product = torch.sum(self.wi(i_idx) * self.wj(j_idx), dim=1)
        return dot_product + self.bi(i_idx).squeeze() + self.bj(j_idx).squeeze()

def train_glove(corpus, embedding_dim=256, window_size=5, epochs=50, lr=0.005):
    """Fit GloVe embeddings on ``corpus`` with full-batch Adam and log to wandb.

    The vocabulary and the distance-weighted co-occurrence table are built
    from ``corpus``; every epoch is one optimizer step over all observed
    pairs, minimising the weighted squared error between the model score and
    the log co-occurrence count. On the first epoch and on every 10th epoch
    the loss is printed, the embeddings are scored with
    ``evaluate_benchmarks`` and the metrics are sent to ``wandb.log``.

    ``wandb.init`` is not called here, so a run is expected to be active
    already; the run is closed with ``wandb.finish()`` before returning.

    Args:
        corpus: Iterable of tokenized sentences (lists of tokens).
        embedding_dim: Size of each embedding vector.
        window_size: Context positions on each side used for co-occurrences.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.

    Returns:
        ``(embeddings, word2idx)``: the detached sum of the two embedding
        tables (one row per vocabulary entry, on the training device) and the
        word-to-row mapping needed to index it.

    Raises:
        ValueError: If ``corpus`` produces no co-occurrence pairs.
    """
    word2idx, _ = build_vocab(corpus)
    cooccurrences = build_cooccurrence(corpus, word2idx, window_size)
    if not cooccurrences:
        # The mean over an empty batch would silently turn the loss into NaN.
        raise ValueError("corpus produced no co-occurrence pairs; nothing to train on")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    pairs = list(cooccurrences.keys())
    i_idx = torch.tensor([i for i, _ in pairs], dtype=torch.long).to(device)
    j_idx = torch.tensor([j for _, j in pairs], dtype=torch.long).to(device)
    counts = torch.tensor(list(cooccurrences.values()), dtype=torch.float32).to(device)

    model = GloVe(len(word2idx), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Targets and weights depend only on the counts, so compute them once.
    log_counts = torch.log(counts + 1e-8)
    weights = weighting_func(counts)

    for epoch in range(epochs):
        # evaluate_benchmarks leaves the model in eval mode; switch back every epoch.
        model.train()
        optimizer.zero_grad()
        preds = model(i_idx, j_idx)

        loss = (weights * (preds - log_counts) ** 2).mean()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0 or epoch == 0:
            loss_value = loss.item()
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_value:.4f}")
            results = evaluate_benchmarks(model, word2idx)
            print("wordsim: ", results["wordsim"])
            print("simverb: ", results["simverb"])

            wandb.log({
              "epoch": epoch+1,
              "train_loss": loss_value,
              "wordsim_corr": results["wordsim"],
              "simverb_corr": results["simverb"]
            })

    # Final word vectors: sum of both tables, matching what evaluate_benchmarks scores.
    embeddings = (model.wi.weight + model.wj.weight).detach()

    wandb.finish()
    return embeddings, word2idx

In [None]:
# Authenticate the wandb CLI for this environment.
!wandb login
# Collect CFG's plain attributes (dunder entries and callables are skipped)
# so they are stored as the config of the wandb run.
cfg = {k: v for k, v in vars(CFG).items() if not k.startswith('__') and not callable(v)}
wandb.init(project="CLIP", config=cfg)

In [None]:
# Pass every hyperparameter declared in CFG explicitly, so the run actually
# uses the window size and learning rate that are logged as the wandb config
# instead of train_glove's built-in defaults.
glove_result = train_glove(
    corpus,
    embedding_dim=CFG.emb_size,
    window_size=CFG.window_size,
    epochs=CFG.train_epochs,
    lr=CFG.lr,
)