In [None]:
import numpy as np
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec or GloVe embeddings
def load_embeddings(embedding_path, binary=True, no_header=False):
    """Load pre-trained word embeddings through gensim.

    Args:
        embedding_path: Path to an embedding file in word2vec format.
        binary: True for the word2vec binary format, False for the text format.
        no_header: Set to True for text files that lack the leading
            "<vocab_size> <vector_size>" line (e.g. raw GloVe downloads).
            Needs gensim >= 4.0 and ``binary=False``.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for the file.
    """
    # Forward `no_header` only when requested, so the default call keeps
    # working on gensim releases that do not know the argument.
    extra_kwargs = {"no_header": True} if no_header else {}
    return KeyedVectors.load_word2vec_format(embedding_path, binary=binary, **extra_kwargs)

# Parse the analogy dataset to extract specific sections
def parse_analogy_dataset(file_path, sections):
    """Extract the questions of selected sections from an analogy dataset file.

    A line starting with ':' is a section header. A section is selected when
    any entry of `sections` occurs as a substring of its header line, so
    'past-tense' also selects a header such as ': gram7-past-tense'.

    Args:
        file_path: Path to the analogy text file.
        sections: Iterable of section-name substrings to select.

    Returns:
        list[list[str]]: The whitespace-split tokens of every non-blank line
        inside a selected section, in file order.
    """
    wanted = list(sections)  # reused for every header, so materialize once
    extracted_data = []
    capture = False

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines would otherwise become empty token lists that
                # break the four-way unpacking done by the evaluator.
                continue
            if line.startswith(':'):
                capture = any(section in line for section in wanted)
            elif capture:
                extracted_data.append(line.split())
    return extracted_data

# Perform analogy evaluation
def evaluate_analogies(embeddings, analogy_data):
    """Score embeddings on "a is to b as c is to ?" analogy questions.

    Args:
        embeddings: Object supporting ``word in embeddings`` and
            ``most_similar(positive=..., negative=..., topn=...)`` (such as a
            gensim ``KeyedVectors``).
        analogy_data: Iterable of ``[a, b, c, expected]`` token sequences.

    Returns:
        tuple[int, int]: ``(correct, total)``. ``total`` counts every
        well-formed (four-token) question; questions containing a word that is
        missing from `embeddings` are counted in ``total`` but never as
        correct. Entries without exactly four tokens are ignored.
    """
    correct = 0
    total = 0

    for analogy in analogy_data:
        if len(analogy) != 4:
            # e.g. an empty token list produced by a blank line in the file
            continue
        total += 1
        a, b, c, expected = analogy
        if not all(word in embeddings for word in (a, b, c, expected)):
            continue
        # Query by word (b - a + c) rather than by a pre-computed vector:
        # with word keys gensim leaves a, b and c out of the candidates,
        # otherwise the nearest neighbour is usually one of the inputs.
        matches = embeddings.most_similar(positive=[b, c], negative=[a], topn=1)
        if matches and matches[0][0] == expected:
            correct += 1
    return correct, total

# Main function to run evaluation
def main():
    """Evaluate embeddings on one semantic and one syntactic analogy section.

    Loads the embeddings and the analogy file from the placeholder paths
    below, scores each section with `evaluate_analogies`, and prints the two
    accuracies as percentages. A section with no questions is reported as
    0.00% instead of raising ZeroDivisionError.
    """
    # Paths to files
    analogy_file_path = "word-test.v1.txt"  # Replace with the actual file path
    embedding_path = "path_to_pretrained_embeddings"  # Replace with your embeddings file

    # Sections for evaluation
    semantic_sections = ["capital-common-countries"]
    syntactic_sections = ["past-tense"]

    # Load embeddings. binary=True expects the word2vec binary format; text
    # files need binary=False, and raw GloVe files additionally lack the
    # word2vec header line, so they must be converted (or loaded header-less).
    embeddings = load_embeddings(embedding_path, binary=True)

    # Extract data
    semantic_data = parse_analogy_dataset(analogy_file_path, semantic_sections)
    syntactic_data = parse_analogy_dataset(analogy_file_path, syntactic_sections)

    # Evaluate on both tasks
    semantic_correct, semantic_total = evaluate_analogies(embeddings, semantic_data)
    syntactic_correct, syntactic_total = evaluate_analogies(embeddings, syntactic_data)

    # Calculate accuracies; an empty section (e.g. a section name that is not
    # in the file) yields 0.0 rather than a division by zero.
    semantic_accuracy = (semantic_correct / semantic_total) * 100 if semantic_total else 0.0
    syntactic_accuracy = (syntactic_correct / syntactic_total) * 100 if syntactic_total else 0.0

    print(f"Semantic Accuracy ({semantic_sections}): {semantic_accuracy:.2f}%")
    print(f"Syntactic Accuracy ({syntactic_sections}): {syntactic_accuracy:.2f}%")

# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [7]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_accuracy(questions, word_vectors, word2index, exclude_inputs=True):
    """
    Calculate accuracy for analogy questions of the form "a b c d".

    For each question the vector ``b - a + c`` is compared by cosine
    similarity with every row of `word_vectors`; the question is correct when
    the best-scoring row belongs to word ``d``.

    Args:
        questions (list): Analogy questions, each a string 'a b c d' (a
            sequence of four tokens is accepted as well).
        word_vectors (np.array): 2-D array with one embedding per row.
        word2index (dict): Mapping of words to their row index in
            `word_vectors`.
        exclude_inputs (bool): When True (default) the rows of a, b and c are
            removed from the candidates, as is usual for analogy evaluation;
            otherwise the nearest neighbour is often one of the inputs.

    Returns:
        float: Accuracy as a percentage over the evaluated questions.
        Malformed questions and questions with a word missing from
        `word2index` are skipped; 0.0 is returned when nothing was evaluated.
    """
    matrix = np.asarray(word_vectors)

    # Row norms do not change between questions, so compute them once.
    # Zero rows get norm 1 so they score 0 instead of producing NaN.
    row_norms = np.linalg.norm(matrix, axis=1)
    row_norms[row_norms == 0] = 1.0

    # Invert the vocabulary once. Relying on the dict's key order would only
    # be right if words had been inserted in index order.
    index2word = {index: word for word, index in word2index.items()}

    correct = 0
    total = 0

    for question in questions:
        tokens = question.split() if isinstance(question, str) else question
        try:
            a, b, c, d = tokens
        except (TypeError, ValueError):
            continue  # Skip malformed questions

        if not all(word in word2index for word in (a, b, c, d)):
            continue  # Skip questions with out-of-vocabulary words

        idx_a, idx_b, idx_c = word2index[a], word2index[b], word2index[c]
        try:
            # Compute vector: b - a + c
            target_vector = matrix[idx_b] - matrix[idx_a] + matrix[idx_c]
        except IndexError:
            continue  # Vocabulary index without a row in word_vectors

        # Cosine similarity up to the (constant, positive) norm of the target,
        # which does not affect the argmax.
        scores = (matrix @ target_vector) / row_norms
        if exclude_inputs:
            scores[[idx_a, idx_b, idx_c]] = -np.inf

        predicted_word = index2word.get(int(np.argmax(scores)))
        if predicted_word == d:
            correct += 1
        total += 1

    return (correct / total) * 100 if total > 0 else 0.0

# Load the dataset and split into semantic and syntactic subsets
def load_dataset(filepath):
    """Read an analogy file and return its semantic and syntactic questions.

    Lines starting with ':' are section headers. Questions under a header
    containing 'capital-common-countries' form the semantic set, questions
    under a header containing 'past-tense' form the syntactic set, and every
    other section is ignored. Headers are matched by substring so that
    prefixed names such as ': gram7-past-tense' are recognized too.

    Args:
        filepath: Path to the analogy text file.

    Returns:
        tuple[list[str], list[str]]: ``(semantic_questions,
        syntactic_questions)``, each a list of stripped, non-blank lines.
    """
    semantic_questions = []
    syntactic_questions = []
    current_section = None  # None while outside the two sections of interest

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue  # blank lines are not questions
            if line.startswith(":"):
                header = line[1:].strip()
                if "capital-common-countries" in header:
                    current_section = "semantic"
                elif "past-tense" in header:
                    current_section = "syntactic"
                else:
                    current_section = None  # Reset for unrelated sections
            elif current_section == "semantic":
                semantic_questions.append(line)
            elif current_section == "syntactic":
                syntactic_questions.append(line)

    return semantic_questions, syntactic_questions





In [8]:
# Example usage with placeholder data. The vocabulary below only contains the
# synthetic tokens "word0".."word999", and calculate_accuracy skips questions
# with out-of-vocabulary words, so both accuracies print as 0.00% unless the
# dataset happens to use those tokens. Swap in real embeddings to get
# meaningful numbers.
rng = np.random.default_rng(0)  # fixed seed so the placeholder vectors are reproducible
word_vectors = rng.random((1000, 300))  # Replace with actual embeddings
word2index = {f"word{i}": i for i in range(1000)}  # Replace with actual vocabulary

semantic_questions, syntactic_questions = load_dataset("word-test.v1.txt")

semantic_accuracy = calculate_accuracy(semantic_questions, word_vectors, word2index)
syntactic_accuracy = calculate_accuracy(syntactic_questions, word_vectors, word2index)

print(f"Semantic Accuracy: {semantic_accuracy:.2f}%")
print(f"Syntactic Accuracy: {syntactic_accuracy:.2f}%")

Semantic Accuracy: 0.00%
Syntactic Accuracy: 0.00%
