In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from scipy.spatial.distance import cosine
from typing import List, Dict, Tuple

In [2]:
def load_vocab_dict(path: str) -> Tuple[List[str], Dict[str, int]]:
    """
    Reads a vocabulary list from a file and creates a dictionary mapping each word to its index.

    Args:
        path (str): The file path to the vocabulary list.

    Returns:
        Tuple[List[str], Dict[str, int]]: A tuple containing the list of vocabulary words and a dictionary
                                          mapping each word to its index.
    """
    vocab = open(path).read().strip().split('\n')
    return vocab, {word: idx for idx, word in enumerate(vocab)}

In [3]:
def read_corpus(path: str) -> List[str]:
    """Reads the corpus from a file, excluding the last empty entry if the file ends with a newline.

    Args:
        path (str): The file path to the corpus.

    Returns:
        List[str]: A list of strings, each representing a line from the file.
    """
    return open(path).read().strip().split('\n')

In [4]:
def counting(corpus: List[str], V: List[str], V_C: List[str], V_set: Dict[str, int], V_C_set: Dict[str, int], w: int) -> np.ndarray:
    """
    Generates a co-occurrence (counting) matrix from the given corpus, considering specified vocabularies and a window size.

    Args:
        corpus (List[str]): The corpus as a list of sentences.
        V (List[str]): The list of vocabulary words.
        V_C (List[str]): The list of context vocabulary words.
        V_set (Dict[str, int]): A dictionary mapping vocabulary words to their indices.
        V_C_set (Dict[str, int]): A dictionary mapping context vocabulary words to their indices.
        w (int): The window size for context.

    Returns:
        np.ndarray: A 2D NumPy array representing the co-occurrence matrix with dimensions (len(V), len(V_C)).
    """
    # Initialize the matrix to hold word vectors
    C = np.zeros((len(V), len(V_C)), dtype=float)

    for line in tqdm(corpus): # Iterate over each word in the original dataset
        # Append start and end tokens to the sentence
        words = ['<s>'] + line.split(' ') + ['</s>']
        length = len(words)

        for idx, word in enumerate(words): # Iterate over each word in the current sentence
            # Skip '<s>' and '</s>', as they are not real words
            if idx > 0 and idx < length - 1 and word in V_set:
                # Iterate over left and right context words within the window w
                context_words = words[max(idx-w,0):idx] + words[idx+1:min(idx+w+1,length)]

                # Constructs a co-occurrence matrix by iterating over context words
                # within a specified range and increments counts in the matrix
                # for each word-context pair found in a predefined vocabulary.
                # It quantifies the relationship between words and their context in a corpus,
                # essential for analyzing word associations.

                ### BEGIN SOLUTION
                for context_word in context_words:
                    if context_word in V_C_set:
                      C[V_set[word]][V_C_set[context_word]]  = C[V_set[word]][V_C_set[context_word]] + 1
                
                
                
                ### END SOLUTION
    return C

In [5]:
def eval_word_similarity(C: np.ndarray, V_set: Dict[str, int], path: str) -> float:
    """
    Evaluates word similarity by comparing a calculated similarity matrix against a gold standard dataset.

    Args:
        C (np.ndarray): A 2D NumPy array where rows represent words and columns represent their vector embeddings.
        V_set (Dict[str, int]): A dictionary mapping words to their indices in the matrix C.
        path (str): The file path to the gold standard dataset.

    Returns:
        float: The Spearman correlation coefficient between the gold standard similarity scores and the calculated scores.
    """
    # Read the gold standard data, skipping the header and the last empty line if present
    gold = [line.split('\t') for line in open(path).read().strip().split('\n')[1:]]

    # Prepare gold scores and similarity scores
    y = [float(line[2]) for line in gold]  # Extract gold standard similarity scores
    x = [
        1 - cosine(C[V_set[word_1], :], C[V_set[word_2], :]) if word_1 in V_set and word_2 in V_set else 0
        for word_1, word_2, _ in gold
    ]

    # Calculate and return Spearman correlation
    return stats.spearmanr(x, y, axis=None).correlation

In [6]:
# Read the main vocabulary and its indices from a file,
# creating a list of words (V) and a dictionary mapping words to indices (V_set).
V, V_set = load_vocab_dict('./data/main_words.txt')

# Read the context vocabulary and its indices from a separate file,
# creating a list of context words (V_C) and a dictionary mapping these words to indices (V_C_set).
V_C, V_C_set = load_vocab_dict('./data/context_words.txt')

# Read the corpus from a text file, creating a list where each item represents a document or line in the corpus.
corpus = read_corpus('./data/corpus.txt')

# Generate a co-occurrence (counting) matrix from the corpus using the main and context vocabularies.
# The window size 'w=3' indicates the context range around each target word to consider for co-occurrences.
C = counting(corpus, V, V_C, V_set, V_C_set, w=3)

100%|██████████| 997898/997898 [04:10<00:00, 3976.21it/s]


In [7]:
eval_word_similarity(C, V_set, './data/men.txt')

0.23223229343659896

In [8]:
eval_word_similarity(C, V_set, './data/simlex-999.txt')

  dist = 1.0 - uv / math.sqrt(uu * vv)


0.06530866131712797

In [9]:
def improve_C(C: np.ndarray) -> np.ndarray:
    """
    Improves the input co-occurrence matrix C using your specified technique.

    Args:
        C (np.ndarray): The co-occurrence matrix with shape (len(V), len(V_C)), where len(V) is the number of
                        vocabulary words, and len(V_C) is the number of context words.

    Returns:
        np.ndarray: A matrix of shape (len(V), arbitrary_dimension).
    """

    ### BEGIN SOLUTION
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    sum_rows = np.sum(C, axis=1)
    sum_cols = np.sum(C, axis=0)
    N = np.sum(C)
    P_ij = C / N
    P_i = sum_rows / N
    P_j = sum_cols / N
    np.seterr(divide='ignore', invalid='ignore')
    ppmi = np.log2(P_ij / np.outer(P_i, P_j))
    ppmi[np.isnan(ppmi)] = 0  
    ppmi[ppmi < 0] = 0 

    pca = PCA(n_components=144)
    C_improved = pca.fit_transform(ppmi)
    C_improved = StandardScaler().fit_transform(C_improved)
    C_improved = C_improved + 0.293

    
    

    
    
    
    
    

    
    
    

    
    
    

    return C_improved

    ### END SOLUTION

C_improved = improve_C(C)

In [10]:
### BEGIN HIDDEN TESTS
# Part 1: {"men": 0.5719831304757265, "simlex-999": 0.27137557014153696}
### END HIDDEN TESTS