# Name : Suryansh Srivastava
# ID   : 124997

## Task 2


In [1]:
import numpy as np
import pickle

In [2]:
import sys
import os
sys.path.append(os.path.abspath('../'))

# importing classes for the respective models
from models.skipgram import Skipgram
from models.skipgram_negSampling import SkipgramNeg
from models.glove import Glove

In [3]:
# Load pickle files
def load(filepath):
    with open(filepath, 'rb') as f:
        return pickle.load(f)

In [4]:
# models name and path

models = {
    "Skipgram": "./models/skipgram_model.pkl",
    "SkipgramNEG": "./models/skipgram_negSampling_model.pkl",
    "Glove": "./models/glove_model.pkl",
    "GloveGensim": "./models/glove_gensim_model.pkl"
}

In [5]:
# index2word name and path for the respective models
index2word={
    "Skipgram": "./models/skipgram_index2word.pkl",
    "SkipgramNEG": "./models/skipgram_negSampling_index2word.pkl",
    "Glove": "./models/glove_index2word.pkl",
}

In [6]:
# word2index name and path for the respective models
word2index={
    "Skipgram": "./models/skipgram_word2index.pkl",
    "SkipgramNEG": "./models/skipgram_negSampling_word2index.pkl",
    "Glove": "./models/glove_word2index.pkl",
}

In [7]:
# Load models
loaded_models = {name: load(path) for name, path in models.items()}

In [8]:
# Load index2word
loaded_index2word = {name: load(path) for name, path in index2word.items()}

In [9]:
# load word2index
loaded_word2index = {name: load(path) for name, path in word2index.items()}

In [10]:
# Word analogy dataset url
wordAnalogy_url = "https://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt"

In [11]:
import requests

# syntactic 
def fetch_data_syntactic_analogy(url):
    response = requests.get(url)
    response.raise_for_status()
    lines = response.text.strip().split('\n')
    
    # Extract specific section
    section_start = ': gram7-past-tense'
    section_end = ': gram8-plural'
    extract_lines = []
    in_section = False

    for line in lines:
        if line.startswith(section_start):
            in_section = True
            continue
        elif line.startswith(section_end):
            break

        if in_section:
            extract_lines.append(line)

    return [line.split() for line in extract_lines if line]

In [12]:
# semantic
def fetch_data_semantic_analogy(url):
    response = requests.get(url)
    response.raise_for_status()
    lines = response.text.strip().split('\n')
    
    # Extract specific section
    section_start = ': capital-common-countries'
    section_end = ': currency'
    extract_lines = []
    in_section = False

    for line in lines:
        if line.startswith(section_start):
            in_section = True
            continue
        elif line.startswith(section_end):
            break

        if in_section:
            extract_lines.append(line)

    return [line.split() for line in extract_lines if line]

In [13]:
syntactic_analogy_data = fetch_data_syntactic_analogy(wordAnalogy_url)
semantic_analogy_data = fetch_data_semantic_analogy(wordAnalogy_url)

In [14]:
import torch

def predict_analogy(model_name,word_a, word_b, word_c, embeddings=None, word_to_idx=None, idx_to_word=None):
    if(model_name == "GloveGensim"):
        result = loaded_models['GloveGensim'].most_similar(positive=[word_c, word_b], negative=[word_a])
        return result[0][0]

    try:
        vec_a = embeddings[word_to_idx[word_a]]
        vec_b = embeddings[word_to_idx[word_b]]
        vec_c = embeddings[word_to_idx[word_c]]
        target_vec = vec_b - vec_a + vec_c

        similarities = torch.matmul(embeddings, target_vec) / (
            torch.norm(embeddings, dim=1) * torch.norm(target_vec) + 1e-8
        )
        best_match_idx = torch.argmax(similarities).item()
        return idx_to_word[best_match_idx]
    except KeyError as e:
        return None  # Return None if any word is not in the vocabulary
    

In [15]:
def semantic_accuracy(model_name,analogy_data, embeddings=None, word_to_idx=None, idx_to_word=None):
    correct = 0
    total = 0

    for question in analogy_data:
        if len(question) != 4:
            continue
        word_a, word_b, word_c, word_d = question
        predicted_word=None
        if(model_name == "GloveGensim"):
            try:
                predicted_word = predict_analogy(model_name,word_a, word_b, word_c)
            except:
                predicted_word = None
        else:
            predicted_word = predict_analogy(model_name,word_a, word_b, word_c, embeddings, word_to_idx, idx_to_word)

        if predicted_word == word_d:
            correct += 1

        total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [16]:
def syntactic_accuracy(model_name,analogy_data, embeddings=None, word_to_idx=None, idx_to_word=None):
    correct = 0
    total = 0

    for question in analogy_data:
        if len(question) != 4:
            continue
        word_a, word_b, word_c, word_d = question
        # Process syntactic relationships directly from the dataset
        if word_a.endswith("ing") or word_a.endswith("ed"):
            predicted_word=None
            if(model_name == "GloveGensim"):
                try:
                    predicted_word = predict_analogy(model_name,word_a, word_b, word_c)
                except:
                    predicted_word = None
            else:
                predicted_word = predict_analogy(model_name,word_a, word_b, word_c, embeddings, word_to_idx, idx_to_word)

            if predicted_word == word_d:
                correct += 1

            total += 1

    syntactic_accuracy = correct / total if total > 0 else 0
    return syntactic_accuracy

In [17]:
# for model_name, model in loaded_models.items():
#     syntactic_acc = None
#     semantic_acc = None
#     if(model_name == "GloveGensim"):
#         syntactic_acc = syntactic_accuracy(model_name,syntactic_analogy_data)
#         semantic_acc = semantic_accuracy(model_name,semantic_analogy_data)
#     else:
            
#         if(model_name == "Glove"):
#             center_embeddings = model.center_embedding.weight.data
#             outside_embeddings = model.outside_embedding.weight.data
#         else:
#             center_embeddings = model.embedding_center.weight.data
#             outside_embeddings = model.embedding_outside.weight.data
        
#         word_to_idx = loaded_word2index[model_name]
#         idx_to_word = loaded_index2word[model_name]

#         syntactic_acc = syntactic_accuracy(model_name,syntactic_analogy_data, center_embeddings, word_to_idx, idx_to_word)
#         semantic_acc = semantic_accuracy(model_name,semantic_analogy_data, center_embeddings, word_to_idx, idx_to_word)
    
#     print(f"{model_name} Model")
#     print(f"Syntactic Accuracy: {syntactic_acc * 100:.2f}%")
#     print(f"Semantic Accuracy: {semantic_acc * 100:.2f}%")
#     print("\n")

In [18]:
with open('./models/glove_gensim_model.pkl', 'rb') as f:
    model= pickle.load(f)

In [19]:
def predict_analogy(word_a, word_b, word_c):
    print("Inside predict_analogy")
    result = model.most_similar(positive=[word_c, word_b], negative=[word_a])
    return result[0][0]

In [20]:
def evaluate_semantic_accuracy(analogy_data):
    correct = 0
    total = 0
    print("Inside evaluate_semantic_accuracy")
    for question in analogy_data:
        if len(question) != 4:
            continue
        print("Inside for loop")
        word_a, word_b, word_c, word_d = question
        try:
            print("Inside try")
            predicted_word = predict_analogy(word_a, word_b, word_c)
        except:
            predicted_word = None

        if predicted_word == word_d:
            correct += 1

        total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [21]:
def evaluate_syntactic_accuracy(analogy_data):
    syntactic_correct = 0
    syntactic_total = 0

    for question in analogy_data:
        if len(question) != 4:
            continue
        word_a, word_b, word_c, word_d = question
        # Process syntactic relationships directly from the dataset
        if word_a.endswith("ing") and word_b.endswith("ed") and word_c.endswith("ing") and word_d.endswith("ed"):
            try:
                predicted_word = predict_analogy(word_a, word_b, word_c)
            except:
                predicted_word = None

            if predicted_word == word_d:
                syntactic_correct += 1

            syntactic_total += 1

    syntactic_accuracy = syntactic_correct / syntactic_total if syntactic_total > 0 else 0
    return syntactic_accuracy

: 

In [None]:
semantic_accuracy = evaluate_semantic_accuracy(semantic_analogy_data)

Inside evaluate_semantic_accuracy
Inside for loop
Inside try
Inside predict_analogy
Inside for loop
Inside try
Inside predict_analogy
Inside for loop
Inside try
Inside predict_analogy
Inside for loop
Inside try
Inside predict_analogy


In [None]:
# syntactic_accuracy = evaluate_syntactic_accuracy(syntactic_analogy_data)


# print(f"Syntactic Accuracy: {syntactic_accuracy * 100:.2f}%")
# print(f"Semantic Accuracy: {semantic_accuracy * 100:.2f}%")

### 1.Compare Skip-gram, Skip-gram negative sampling, GloVe models on training loss, training time. 

- From the four notebooks i.e. 01 - Word2Vec (Skipgram).ipynb, 02 - Word2Vec (Neg Sampling).ipynb, 03 - GloVe from Scratch.ipynb and 04 - GloVe (Gensim).ipynb which are used to train their respective models, we observe the training loss and training time for each model



| Model            | Window Size | Training Loss | Training time | Syntactic Accuracy | Semantic Accuracy |
|-----------------|-------------|--------------|---------------|--------------------|-------------------|
| Skipgram       |             |              |               |                    |                   |
| Skipgram (NEG) |             |              |               |                    |                   |
| Glove          |             |              |               |                    |                   |
| Glove (Gensim) |             |      -       |       -       |                    |                   |
