Test the quality of the static (compass) word embeddings and the temporal (target) embeddings on the word-similarity and word-analogy tasks.

In [8]:
import csv

# Results file shared by this cell (which creates it) and static_eval
# (which appends one row per evaluated model).
csv_file = 'static_eval.csv'

# Column order: three model identifiers, the two word-similarity
# benchmarks, then mrr / p@1 / p@5 / p@10 for each analogy category and for
# the categories combined.
# NOTE(review): 'total_mmr' looks like a typo for 'total_mrr'; it is kept
# unchanged because anything reading the CSV may rely on this column name.
fieldnames = [
    'model', 'type', 'slice',
    '240', '297',
    'capital_mrr', 'capital_mp1', 'capital_mp5', 'capital_mp10',
    'city_mrr', 'city_mp1', 'city_mp5', 'city_mp10',
    'family_mrr', 'family_mp1', 'family_mp5', 'family_mp10',
    'total_mmr', 'total_mp1', 'total_mp5', 'total_mp10',
]

# Mode 'w' truncates any previous results, leaving a file that contains only
# the header row.
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(fieldnames)

In [12]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from gensim.models.word2vec import Word2Vec


def cosine_sim(x, y):
    """Return the cosine similarity between vectors `x` and `y`.

    Computed as the dot product divided by the product of the two Euclidean
    norms. There is no guard against a zero-norm vector.
    """
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))


def word_sim(model):
    """Score `model` on the word-similarity benchmarks.

    For each of ``word_sim/240.txt`` and ``word_sim/297.txt``, every line is
    split on whitespace into ``word1 word2 score``. Pairs with both words
    present in ``model.wv`` are scored with the cosine similarity of their
    vectors; pairs with a missing word are skipped. The benchmark result is
    the Spearman correlation between the file's scores and the model's
    cosine similarities.

    Args:
        model: Object whose ``wv`` attribute supports ``word in wv`` and
            ``wv[word]`` (the callers in this file pass a loaded Word2Vec).

    Returns:
        A list with the two Spearman coefficients, ordered ['240', '297'].

    Raises:
        ValueError: If a non-blank line has fewer than three fields or its
            third field is not a number.
    """
    sim_scores = []

    for sim in ['240', '297']:
        word_sim_std = []  # human-annotated scores from the benchmark file
        word_sim_pre = []  # cosine similarities predicted by the model

        # The context manager closes the benchmark file even if a line
        # fails to parse.
        with open('word_sim/' + sim + '.txt', 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                word1, word2, val_str = fields[0:3]
                if word1 in model.wv and word2 in model.wv:
                    word_sim_std.append(float(val_str))
                    word_sim_pre.append(
                        cosine_sim(model.wv[word1], model.wv[word2]))

        spear_coef, _ = stats.spearmanr(word_sim_std, word_sim_pre)
        sim_scores.append(spear_coef)

    return sim_scores


def reciprocal_rank(neighbours, target_word):
    """Return 1 / rank of the first occurrence of `target_word`.

    Ranks are 1-based positions in `neighbours`. The result is 0.0 when
    `target_word` does not occur at all.
    """
    hits = (
        1.0 / position
        for position, candidate in enumerate(neighbours, start=1)
        if candidate == target_word
    )
    # Only the first (best-ranked) match counts; fall back to 0.0 if none.
    return next(hits, 0.0)


def precision(neighbours, target_word, k):
    """Return 1.0 if `target_word` is among the first `k` neighbours, else 0.0."""
    found_in_top_k = target_word in neighbours[:k]
    return 1.0 if found_in_top_k else 0.0


def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when there is nothing to average."""
    return total / count if count else float('nan')


def word_analogy(model):
    """Score `model` on the word-analogy benchmarks.

    For each of ``word_analogy/capital.txt``, ``city.txt`` and ``family.txt``,
    every line is split on whitespace into ``word1 word2 word3 word4``.
    Questions whose first three words are all in ``model.wv`` are answered
    with ``most_similar(positive=[word2, word3], negative=[word1])`` and the
    returned candidates are compared with ``word4``; other questions are
    skipped and do not count towards the averages.

    Args:
        model: Object whose ``wv`` attribute supports ``word in wv`` and
            ``most_similar`` (the callers in this file pass a loaded
            Word2Vec).

    Returns:
        A flat list of 16 floats: mean reciprocal rank, precision@1,
        precision@5 and precision@10 for capital, city and family (in that
        order), followed by the same four metrics averaged over all covered
        questions of the three categories together. A group with no covered
        question yields NaN for its four metrics instead of raising
        ZeroDivisionError.

    Raises:
        ValueError: If a non-blank line has fewer than four fields.
    """
    cutoffs = (1, 5, 10)
    analogy_scores = []
    # Running sums, laid out as [reciprocal rank, p@1, p@5, p@10].
    sums_all = [0.0, 0.0, 0.0, 0.0]
    total_all = 0

    for ana in ['capital', 'city', 'family']:
        sums = [0.0, 0.0, 0.0, 0.0]
        test_pair_num = 0

        # The context manager closes the benchmark file even if a line
        # fails to parse or the model lookup raises.
        with open('word_analogy/' + ana + '.txt', 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                word1, word2, word3, word4 = fields[0:4]
                if not (word1 in model.wv and word2 in model.wv
                        and word3 in model.wv):
                    continue

                test_pair_num += 1
                # Only the 10 best candidates are retrieved, so reciprocal
                # rank is 0 for answers ranked below 10 and precision@10 is
                # the widest meaningful cutoff.
                predicted = model.wv.most_similar(
                    positive=[word2, word3], negative=[word1], topn=10)
                neighbours = [pair[0] for pair in predicted]

                sums[0] += reciprocal_rank(neighbours, word4)
                for slot, k in enumerate(cutoffs, start=1):
                    sums[slot] += precision(neighbours, word4, k)

        for slot, value in enumerate(sums):
            sums_all[slot] += value
        total_all += test_pair_num
        analogy_scores.extend(
            _mean_or_nan(value, test_pair_num) for value in sums)

    analogy_scores.extend(_mean_or_nan(value, total_all) for value in sums_all)

    return analogy_scores


def static_eval(model, m_name, m_type, m_slice):
    """Evaluate `model` and append its scores as one row of the results CSV.

    The row consists of the three identifiers (`m_name`, `m_type`,
    `m_slice`), followed by the word-similarity scores and then the
    word-analogy scores, and is appended to the file named by `csv_file`.
    """
    # Both evaluations run before the file is opened, similarity first.
    record = [m_name, m_type, m_slice, *word_sim(model), *word_analogy(model)]

    with open(csv_file, 'a', newline='', encoding='utf-8') as results:
        csv.writer(results).writerow(record)

In [13]:
# Evaluate the compass model and both time-slice models for each
# architecture, appending one CSV row per model under the name 'compass'.
for model_type in ('sgns', 'cbow'):
    for year_slice in ('compass', '1954-1978', '1979-2003'):
        model_path = f'../../compass/2-slices/model-{model_type}/{year_slice}.model'
        model = Word2Vec.load(model_path)
        static_eval(model, 'compass', model_type, year_slice)

Test the quality of HistWords-style train-aligned embeddings.

In [14]:
# Evaluate both aligned time-slice models for each architecture, appending
# one CSV row per model under the name 'alignment'.
for model_type in ('sgns', 'cbow'):
    for year_slice in ('1954-1978', '1979-2003'):
        model_path = f'../../alignment/2-slices/model-{model_type}/{year_slice}.model'
        model = Word2Vec.load(model_path)
        static_eval(model, 'alignment', model_type, year_slice)

Test the quality of the 1-year slices (alignment-based models, 1946–2023).

In [None]:
# Evaluate every aligned 1-year model (1946-2023) and write a plain-text
# report to '1-year/alignment.txt'.
#
# word_sim and word_analogy accept only the model and return their scores,
# so the scores are formatted and written here, one 'label: value' line per
# score, instead of passing the file handle to them.
sim_labels = ('240', '297')
analogy_labels = [
    f'{category}_{metric}'
    for category in ('capital', 'city', 'family', 'total')
    for metric in ('mrr', 'mp1', 'mp5', 'mp10')
]

# The context manager closes the report even if loading or evaluating a
# slice raises part-way through.
with open('1-year/alignment.txt', 'w', encoding='utf-8') as f:
    f.write('Testing alignment based Word2Vec\n')
    f.write('===========================\n')

    for i in range(1946, 2024):
        f.write('Slice ' + str(i) + '\n')
        f.write('---------------\n')
        model_path = '../../alignment/1-year/model/' + str(i) + '.model'
        model = Word2Vec.load(model_path)

        # Spearman correlation per similarity benchmark.
        for label, score in zip(sim_labels, word_sim(model)):
            f.write(f'{label}: {score}\n')
        f.write('---------------\n')

        # MRR and precision@1/5/10 per analogy category, then overall.
        for label, score in zip(analogy_labels, word_analogy(model)):
            f.write(f'{label}: {score}\n')
        f.write('...........................\n')