# Name : Suryansh Srivastava
# ID   : 124997

## Task 2


In [2]:
import numpy as np
import pickle

In [4]:
import sys
import os
sys.path.append(os.path.abspath('../'))
from models.skipgram import Skipgram
from models.skipgram_negSampling import SkipgramNeg
from models.glove import Glove

In [5]:
# Load models from pickle files
def load_model(filepath):
    """Return the object deserialized from the pickle file at ``filepath``.

    Args:
        filepath: Path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    with open(filepath, mode='rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored

In [7]:
# Display name of each model -> path of its pickled checkpoint.
models = dict(
    Skipgram="./models/skipgram_model.pkl",
    SkipgramNEG="./models/skipgram_negSampling_model.pkl",
    Glove="./models/glove_model.pkl",
    GloveGensim="./models/glove_gensim_model.pkl",
)

In [8]:
# Unpickle every registered checkpoint, keyed by the same display names as `models`.
loaded_models = dict(zip(models.keys(), map(load_model, models.values())))

In [24]:
# Load the two lookup tables pickled next to the GloVe checkpoint.
# presumably word2index maps word -> row index and index2word is its inverse;
# verify against the training notebook that wrote these files.
with open('./models/glove_word2index.pkl', 'rb') as f:
    word2index = pickle.load(f)

with open('./models/glove_index2word.pkl', 'rb') as f:
    index2word = pickle.load(f)

# Raw weight tensors of the GloVe model's two embedding tables.
# NOTE(review): outside_embeddings is not used by any cell visible in this notebook.
center_embeddings = loaded_models["Glove"].center_embedding.weight.data
outside_embeddings = loaded_models["Glove"].outside_embedding.weight.data

# Aliases under the parameter names used by the analogy helpers below.
word_to_idx = word2index
idx_to_word = index2word

In [25]:
# URL of the word-analogy question file; it is passed to both analogy loader
# functions, which download it over HTTP.
wordAnalogy_url = "https://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt"

In [26]:
import requests
# syntactic 
def load_word_analogy_data_for_syntactic_accuracy(url, timeout=30):
    """Download the analogy file and return the questions of the past-tense section.

    The file is expected to consist of sections, each introduced by a header
    line starting with ``':'``. Lines after the ``': gram7-past-tense'`` header
    and before the first line starting with ``': gram8-plural'`` are collected.

    Args:
        url: Address of the analogy question file.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A list of token lists, one per non-blank question line in the section.
        Header lines and blank lines are never included. The list is empty
        when the section is not present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    section_start = ': gram7-past-tense'
    section_end = ': gram8-plural'
    questions = []
    in_section = False

    # splitlines() copes with both "\n" and "\r\n" line endings.
    for line in response.text.splitlines():
        if line.startswith(section_start):
            in_section = True
            continue
        if line.startswith(section_end):
            break
        # Skip everything outside the section and any stray header line inside it.
        if not in_section or line.startswith(':'):
            continue
        tokens = line.split()
        if tokens:  # drop blank / whitespace-only lines
            questions.append(tokens)

    return questions

In [27]:
# semantic
def load_word_analogy_data_for_semantic_accuracy(url, timeout=30):
    """Download the analogy file and return the capital-city (semantic) questions.

    The file is expected to consist of sections, each introduced by a header
    line starting with ``':'``. Lines after the ``': capital-common-countries'``
    header and before the first line starting with ``': currency'`` are
    collected. Any other section header that falls inside that range is
    skipped, so only real question lines are returned.

    Args:
        url: Address of the analogy question file.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A list of token lists, one per non-blank question line in the range.
        The list is empty when the starting header is not present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    section_start = ': capital-common-countries'
    section_end = ': currency'
    questions = []
    in_section = False

    # splitlines() copes with both "\n" and "\r\n" line endings.
    for line in response.text.splitlines():
        if line.startswith(section_start):
            in_section = True
            continue
        if line.startswith(section_end):
            break
        # Skip everything outside the range and intermediate headers inside it,
        # which would otherwise be returned as two-token pseudo-questions.
        if not in_section or line.startswith(':'):
            continue
        tokens = line.split()
        if tokens:  # drop blank / whitespace-only lines
            questions.append(tokens)

    return questions

In [28]:
# Build the two question lists used for evaluation.
# NOTE(review): each loader issues its own HTTP request, so the same file is
# downloaded twice here.
syntactic_analogy_data = load_word_analogy_data_for_syntactic_accuracy(wordAnalogy_url)
semantic_analogy_data = load_word_analogy_data_for_semantic_accuracy(wordAnalogy_url)

In [33]:
import torch
def predict_analogy(word_a, word_b, word_c, embeddings, word_to_idx, idx_to_word):
    """Answer the analogy "word_a is to word_b as word_c is to ?".

    The answer is the vocabulary word whose embedding has the highest cosine
    similarity to ``vec(word_b) - vec(word_a) + vec(word_c)``. The three query
    words themselves are excluded from the candidates; otherwise the nearest
    neighbour of the target vector is typically one of the inputs and the
    prediction can never match the expected fourth word.

    Args:
        word_a, word_b, word_c: The three known words of the analogy.
        embeddings: 2-D torch tensor with one embedding per row.
        word_to_idx: Mapping from word to its row index in ``embeddings``.
        idx_to_word: Lookup from row index back to the word.

    Returns:
        The predicted word, or ``None`` when a query word is missing from
        ``word_to_idx``, when no candidate other than the query words exists,
        or when the winning index is missing from ``idx_to_word``.
    """
    try:
        query_indices = [word_to_idx[word] for word in (word_a, word_b, word_c)]
    except KeyError:
        return None  # at least one query word is out of vocabulary

    # Nothing left to predict once the query words are removed.
    if embeddings.shape[0] <= len(set(query_indices)):
        return None

    idx_a, idx_b, idx_c = query_indices
    target_vec = embeddings[idx_b] - embeddings[idx_a] + embeddings[idx_c]

    # Cosine similarity of every row with the target; the epsilon guards
    # against division by zero for all-zero vectors.
    similarities = torch.matmul(embeddings, target_vec) / (
        torch.norm(embeddings, dim=1) * torch.norm(target_vec) + 1e-8
    )

    # Mask the query words so they can never be chosen as the answer.
    for idx in set(query_indices):
        similarities[idx] = float('-inf')

    best_match_idx = torch.argmax(similarities).item()
    try:
        return idx_to_word[best_match_idx]
    except KeyError:
        return None

In [34]:
def evaluate_semantic_accuracy(analogy_data, embeddings, word_to_idx, idx_to_word):
    """Return the fraction of analogy questions that ``predict_analogy`` answers correctly.

    Args:
        analogy_data: Iterable of token lists; entries that do not have
            exactly four tokens are ignored.
        embeddings, word_to_idx, idx_to_word: Forwarded unchanged to
            ``predict_analogy``.

    Returns:
        ``correct / total`` over the four-token entries, or ``0`` when there
        are none. A question for which ``predict_analogy`` returns ``None``
        still counts toward the total.
    """
    outcomes = []
    for entry in analogy_data:
        if len(entry) != 4:
            continue
        first, second, third, expected = entry
        guess = predict_analogy(first, second, third, embeddings, word_to_idx, idx_to_word)
        outcomes.append(guess == expected)

    if not outcomes:
        return 0
    return sum(outcomes) / len(outcomes)

In [35]:
def evaluate_syntactic_accuracy(analogy_data, embeddings, word_to_idx, idx_to_word):
    """Return the ``predict_analogy`` accuracy on questions whose first word ends in "ing" or "ed".

    Args:
        analogy_data: Iterable of token lists; entries that do not have
            exactly four tokens are ignored.
        embeddings, word_to_idx, idx_to_word: Forwarded unchanged to
            ``predict_analogy``.

    Returns:
        ``correct / total`` over the eligible questions, or ``0`` when no
        question is eligible. A question for which ``predict_analogy`` returns
        ``None`` still counts toward the total.
    """
    hits = 0
    attempted = 0

    for entry in analogy_data:
        if len(entry) != 4:
            continue
        first, second, third, expected = entry
        # Only questions whose first word carries one of these suffixes are scored.
        if not first.endswith(("ing", "ed")):
            continue
        guess = predict_analogy(first, second, third, embeddings, word_to_idx, idx_to_word)
        attempted += 1
        if guess == expected:
            hits += 1

    return hits / attempted if attempted > 0 else 0

In [38]:
# Score the GloVe center embeddings on both question sets.
syntactic_accuracy = evaluate_syntactic_accuracy(syntactic_analogy_data, center_embeddings, word_to_idx, idx_to_word)
semantic_accuracy = evaluate_semantic_accuracy(semantic_analogy_data, center_embeddings, word_to_idx, idx_to_word)

# The evaluators return fractions in [0, 1]; report them as percentages.
print(f"Syntactic Accuracy: {syntactic_accuracy * 100:.2f}%")
print(f"Semantic Accuracy: {semantic_accuracy * 100:.2f}%")

Syntactic Accuracy: 0.00%
Semantic Accuracy: 0.00%


### 1. Compare the Skip-gram, Skip-gram with negative sampling, and GloVe models on training loss and training time

- The training loss and training time for each model are taken from the four notebooks used to train them: `01 - Word2Vec (Skipgram).ipynb`, `02 - Word2Vec (Neg Sampling).ipynb`, `03 - GloVe from Scratch.ipynb`, and `04 - GloVe (Gensim).ipynb`.



| Model            | Window Size | Training Loss | Training time | Syntactic Accuracy | Semantic Accuracy |
|-----------------|-------------|--------------|---------------|--------------------|-------------------|
| Skipgram       |             |              |               |                    |                   |
| Skipgram (NEG) |             |              |               |                    |                   |
| Glove          |             |              |               |                    |                   |
| Glove (Gensim) |             |      -       |       -       |                    |                   |
