In [32]:
import os
import numpy as np

In [33]:
def train(data_path, result_path, mode="hs_cbow"):
    data_path = os.path.join(data_path, 'text8')
    train_cmd = f"./word2vec -train {data_path}"
    hyperparam = "-size 200 -window 5 -min-count 5 -threads 16 -iter 20 -binary 1"
    # HS_CBOW
    if mode == 'hs_cbow':
        hs_cbow_path = os.path.join(result_path, 'hs_cbow.bin')
        os.system(f"{train_cmd} -output {hs_cbow_path} -cbow 1 -hs 1 -negative 0 {hyperparam}")
        print("\nHS_CBOW training done.")
    elif mode == 'hs_sg':
        hs_sg_path = os.path.join(result_path, 'hs_sg.bin')
        os.system(f"{train_cmd} -output {hs_sg_path} -cbow 0 -hs 1 -negative 0 {hyperparam}")
        print("\nHS_SG training done.")
    elif mode == 'ns_cbow':
        ns_cbow_path = os.path.join(result_path, 'ns_cbow.bin')
        os.system(f"{train_cmd} -output {ns_cbow_path} -cbow 1 -hs 0 -negative 5 {hyperparam}")
        print("\nNS_CBOW training done.")
    elif mode == 'ns_sg':
        ns_sg_path = os.path.join(result_path, 'ns_sg.bin')
        os.system(f"{train_cmd} -output {ns_sg_path} -cbow 0 -hs 0 -negative 5 {hyperparam}")
        print("\nNS_SG training done.")
    else:
        print("Invalid mode. Please choose from 'hs_cbow', 'hs_sg', 'ns_cbow', 'ns_sg'.")

In [34]:
train("../data", "../results", "hs_cbow")

Starting training using file ../data/text8
Vocab size: 71291
Words in train file: 16718843
Alpha: 0.000005  Progress: 100.07%  Words/thread/sec: 119.28k  
HS_CBOW training done.


In [35]:
train("../data", "../results", "hs_sg")

Starting training using file ../data/text8
Vocab size: 71291
Words in train file: 16718843
Alpha: 0.000002  Progress: 100.07%  Words/thread/sec: 50.20k  
HS_SG training done.


In [36]:
train("../data", "../results", "ns_cbow")

Starting training using file ../data/text8
Vocab size: 71291
Words in train file: 16718843
Alpha: 0.036751  Progress: 26.50%  Words/thread/sec: 141.85k  

Alpha: 0.000005  Progress: 100.07%  Words/thread/sec: 136.71k  
NS_CBOW training done.


In [37]:
train("../data", "../results", "ns_sg")

Starting training using file ../data/text8
Vocab size: 71291
Words in train file: 16718843
Alpha: 0.021993  Progress: 12.03%  Words/thread/sec: 49.48k  

Alpha: 0.000002  Progress: 100.07%  Words/thread/sec: 47.65k  
NS_SG training done.


In [22]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity

In [23]:
def spearman_rank_correlation(x, y):
    ranks_x = np.argsort(np.argsort(x))
    ranks_y = np.argsort(np.argsort(y))
    correlation = np.corrcoef(ranks_x, ranks_y)[0, 1]
    return correlation

In [24]:
def load_word_vectors(name, path):
    vocab_size = 0
    vector_size = 0
    word_vectors = {}
    with open(os.path.join(path, name), 'r') as file:
        # Read the first line to get the vocabulary size and vector size
        line = file.readline().strip()
        vocab_size, vector_size = map(int, line.split())
        # Read the word vectors
        for line in file:
            word, *vector = line.strip().split()
            word_vectors[word] = np.array(vector, dtype=float)
    return word_vectors

In [25]:
def load_target_wordsim(path):
    target_wordsim = []
    with open(path, 'r') as file:
        for line in file:
            word1, word2, score = line.strip().split()
            target_wordsim.append((word1, word2, float(score)))
    return target_wordsim

In [26]:
def evaluate(pred_path, target_path, name):
    pred_word_vectors = load_word_vectors(name, pred_path)
    target_wordsim = load_target_wordsim(target_path)
    cosine_similarities = []
    target_scores = []
    for word1, word2, score in target_wordsim:
        if word1 in pred_word_vectors and word2 in pred_word_vectors:
            vec1 = pred_word_vectors[word1]
            vec2 = pred_word_vectors[word2]
            cosine_similarities.append(cosine_similarity(vec1, vec2))
            target_scores.append(score)
    spearman_correlation = spearman_rank_correlation(np.array(cosine_similarities), np.array(target_scores))
    print(f"Spearman correlation for {name}: {spearman_correlation}")

In [29]:
pred_path = "../results"
target_path = "../data/wordsim_similarity_goldstandard.txt"
evaluate(pred_path, target_path, "hs_cbow.txt")
evaluate(pred_path, target_path, "hs_sg.txt")
evaluate(pred_path, target_path, "ns_cbow.txt")
evaluate(pred_path, target_path, "ns_sg.txt")

Spearman correlation for hs_cbow.txt: 0.680260240495881
Spearman correlation for hs_sg.txt: 0.7302820890449755
Spearman correlation for ns_cbow.txt: 0.7043794202851639
Spearman correlation for ns_sg.txt: 0.7483095696645032
