Testing the quality of the static (compass) word embeddings and of the temporal (target) embeddings on two tasks: word similarity and word analogy.

In [32]:
import numpy as np
import scipy.stats as stats
from gensim.models.word2vec import Word2Vec


def cosine_sim(x, y):
    """Return the cosine similarity between the vectors ``x`` and ``y``.

    The result is the dot product of the two inputs divided by the product
    of their norms, where each norm is ``np.linalg.norm`` called with its
    default arguments.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product


def word_sim(model):
    """Score ``model`` on the word-similarity test sets and print the results.

    For each test-set name in ``['240', '297']`` the UTF-8 text file
    ``word_sim/<name>.txt`` is read. The first three whitespace-separated
    fields of a line are taken as two words and a reference similarity
    value. A pair is evaluated only when both words are found in
    ``model.wv``; otherwise it is counted as skipped. For evaluated pairs
    the cosine similarity of the two word vectors is compared with the
    reference values through Spearman's rank correlation.

    Args:
        model: Object exposing ``wv`` such that ``word in model.wv`` and
            ``model.wv[word]`` work (the notebook passes models loaded with
            ``Word2Vec.load``).

    Returns:
        None. The Spearman score and the tested/skipped pair counts are
        printed for each test set.
    """
    for sim in ['240', '297']:
        test_pair_num = 0
        skip_pair_num = 0
        word_sim_std = []
        word_sim_pre = []
        # The context manager guarantees the file handle is released, even
        # when a line fails to parse part-way through the file.
        with open('word_sim/' + sim + '.txt', 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if len(fields) < 3:
                    # Blank or truncated line: there is no pair to evaluate.
                    continue
                word1, word2, val_str = fields[:3]
                if word1 in model.wv and word2 in model.wv:
                    test_pair_num += 1
                    word_sim_std.append(float(val_str))
                    cos_sim = cosine_sim(model.wv[word1], model.wv[word2])
                    word_sim_pre.append(cos_sim)
                else:
                    skip_pair_num += 1

        spear_coef, _ = stats.spearmanr(word_sim_std, word_sim_pre)
        print("{}: Spearman Score: {:.4f}".format(sim, spear_coef))
        print("  Test pair: {}, Skip pair: {}".format(test_pair_num, skip_pair_num))


def word_analogy(model):
    """Score ``model`` on the word-analogy test sets and print the accuracies.

    For each test-set name in ``['capital', 'city', 'family']`` the UTF-8
    text file ``word_analogy/<name>.txt`` is read. The first four
    whitespace-separated fields of a line are taken as ``word1 word2 word3
    word4``. A line is evaluated only when the first three words are found
    in ``model.wv``; otherwise it is counted as skipped. For an evaluated
    line the model is queried with
    ``most_similar(positive=[word2, word3], negative=[word1])``:

    * a top-one hit is counted when the first returned word equals ``word4``;
    * otherwise a top-five hit is counted when ``word4`` is among the first
      five returned words.

    The printed "Top five" figure is the sum of both kinds of hit divided by
    the number of evaluated lines. When no line of a test set (or of all
    test sets together) could be evaluated, the ratio is printed as ``nan``
    rather than raising ``ZeroDivisionError``.

    Args:
        model: Object exposing ``wv`` such that ``word in model.wv`` and
            ``model.wv.most_similar(positive=..., negative=...)`` work (the
            notebook passes models loaded with ``Word2Vec.load``).

    Returns:
        None. Per-test-set and overall accuracies are printed.
    """
    def ratio(hits, total):
        # No evaluated line means the accuracy is undefined; report NaN so
        # the remaining test sets are still scored and printed.
        return hits / total if total else float('nan')

    top_one_all = 0
    top_five_all = 0
    total_all = 0

    for ana in ['capital', 'city', 'family']:
        test_pair_num = 0
        skip_pair_num = 0
        top_one = 0
        top_five = 0
        # The context manager guarantees the file handle is released, even
        # when a line fails part-way through the file.
        with open('word_analogy/' + ana + '.txt', 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if len(fields) < 4:
                    # Blank or truncated line: there is no analogy to evaluate.
                    continue
                word1, word2, word3, word4 = fields[:4]
                if word1 in model.wv and word2 in model.wv and word3 in model.wv:
                    test_pair_num += 1
                    # Query once and reuse the ranking for both the top-one
                    # and the top-five check.
                    neighbours = model.wv.most_similar(positive=[word2, word3], negative=[word1])
                    ranked = [pair[0] for pair in neighbours[:5]]
                    if ranked and ranked[0] == word4:
                        top_one += 1
                    elif word4 in ranked:
                        top_five += 1
                else:
                    skip_pair_num += 1

        print("{}: Accuracy: {:.4f}".format(ana, ratio(top_one, test_pair_num)))
        print("  Top five: {:.4f}".format(ratio(top_one + top_five, test_pair_num)))
        # print("  TestPair: {}, SkipPair: {}".format(test_pair_num, skip_pair_num))
        top_one_all += top_one
        top_five_all += top_five
        total_all += test_pair_num

    print("total: Accuracy: {:.4f}".format(ratio(top_one_all, total_all)))
    print("  Top five: {:.4f}".format(ratio(top_one_all + top_five_all, total_all)))

In [15]:
# Load the compass model saved under model-sgns and print its
# word-similarity scores.
model_sgns = Word2Vec.load('../../compass/2-slices/model-sgns/compass.model')

word_sim(model_sgns)

240: Spearman Score: 0.5646
  Test pair: 231, Skip pair: 9
297: Spearman Score: 0.5852
  Test pair: 284, Skip pair: 13


In [14]:
# Load the compass model saved under model-cbow and print its
# word-similarity scores.
model_cbow = Word2Vec.load('../../compass/2-slices/model-cbow/compass.model')

word_sim(model_cbow)

240: Spearman Score: 0.4722
  Test pair: 231, Skip pair: 9
297: Spearman Score: 0.5775
  Test pair: 284, Skip pair: 13


In [33]:
# Analogy scores for model_sgns; this cell needs the earlier cell that
# loads model_sgns to have been run first.
word_analogy(model_sgns)

capital: Accuracy: 0.8227
  Top five: 0.9793
city: Accuracy: 0.9371
  Top five: 0.9943
family: Accuracy: 0.6985
  Top five: 0.8824
total: Accuracy: 0.8105
  Top five: 0.9582


In [34]:
# Analogy scores for model_cbow; this cell needs the earlier cell that
# loads model_cbow to have been run first.
word_analogy(model_cbow)

capital: Accuracy: 0.8109
  Top five: 0.9690
city: Accuracy: 0.8400
  Top five: 0.9600
family: Accuracy: 0.5515
  Top five: 0.7941
total: Accuracy: 0.7527
  Top five: 0.9253


In [40]:
# Run both benchmarks on every per-slice model, grouped by model type.
# Each model is read from
# ../../compass/2-slices/model-<model_type>/<year_slice>.model
for model_type in ('sgns', 'cbow'):
    print(f'Testing {model_type} based Word2Vec')
    print('===========================')
    for year_slice in ('1954-1978', '1979-2003'):
        print(f'Slice {year_slice}')
        print('---------------')
        model_path = f'../../compass/2-slices/model-{model_type}/{year_slice}.model'
        model = Word2Vec.load(model_path)
        # Similarity scores first, then analogy accuracies, separated by a rule.
        word_sim(model)
        print('---------------')
        word_analogy(model)
        print('...........................')

Testing sgns based Word2Vec
Slice 1954-1978
---------------
240: Spearman Score: 0.5178
  Test pair: 224, Skip pair: 16
297: Spearman Score: 0.5166
  Test pair: 274, Skip pair: 23
---------------
capital: Accuracy: 0.5820
  Top five: 0.7962
city: Accuracy: 0.6686
  Top five: 0.9086
family: Accuracy: 0.4154
  Top five: 0.6544
total: Accuracy: 0.5552
  Top five: 0.7794
...........................
Slice 1979-2003
---------------
240: Spearman Score: 0.5581
  Test pair: 231, Skip pair: 9
297: Spearman Score: 0.5967
  Test pair: 284, Skip pair: 13
---------------
capital: Accuracy: 0.8567
  Top five: 0.9749
city: Accuracy: 0.9657
  Top five: 1.0000
family: Accuracy: 0.6691
  Top five: 0.8199
total: Accuracy: 0.8283
  Top five: 0.9413
...........................
Testing cbow based Word2Vec
Slice 1954-1978
---------------
240: Spearman Score: 0.4352
  Test pair: 224, Skip pair: 16
297: Spearman Score: 0.5459
  Test pair: 274, Skip pair: 23
---------------
capital: Accuracy: 0.5643
  Top five: