Testing temporal word embeddings against the ChiWUG COMPARE score.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from gensim.models.word2vec import Word2Vec

# Load the tab-separated ChiWUG statistics table.
df = pd.read_csv('chi_wug/stats_groupings.csv', sep='\t')

# Build a list of (lemma, COMPARE) tuples, ordered by ascending COMPARE value.
lemma = df[['lemma', 'COMPARE']].to_records(index=False).tolist()
lemma.sort(key=lambda pair: pair[1])


def cosine_sim(x,y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product


def word_compare(model1, model2, lemma, f):
    """Correlate gold COMPARE scores with cross-model cosine similarity.

    For every ``(word, COMPARE)`` pair in ``lemma`` whose word is present in
    both models, the cosine similarity between the word's vector in
    ``model1`` and its vector in ``model2`` is computed. Words missing from
    either model are reported with a ``Skip:`` line and ignored. The Spearman
    rank correlation between the COMPARE values and the cosine similarities
    is then printed together with its p-value.

    Parameters
    ----------
    model1, model2 : objects exposing a ``wv`` attribute that supports
        ``word in wv`` and ``wv[word]`` (e.g. gensim ``Word2Vec`` models).
    lemma : iterable of ``(word, COMPARE)`` pairs.
    f : writable text file object or ``None``. When not ``None``, one line
        per compared word (word, COMPARE, cosine similarity) is written.

    Returns
    -------
    tuple
        ``(spear_coef, p_value)`` as returned by ``scipy.stats.spearmanr``.
    """
    word_compare_std = []
    word_compare_pre = []
    for word, COMPARE in lemma:
        # Membership is tested on the KeyedVectors object itself: this works
        # on both gensim 3.x and 4.x, whereas `wv.vocab` was removed in 4.0.
        if word in model1.wv and word in model2.wv:
            word_compare_std.append(float(COMPARE))
            word_vec1 = model1.wv[word]
            word_vec2 = model2.wv[word]
            cos_sim = cosine_sim(word_vec1, word_vec2)
            word_compare_pre.append(cos_sim)
            if f is not None:
                f.write(word.ljust(6, ' ') + f'{COMPARE:.5f}   {cos_sim:.5f}\n')
        else:
            print('Skip:', word)

    spear_coef, p_value = stats.spearmanr(word_compare_std, word_compare_pre)
    print("Spearman Score: " + str(spear_coef))
    print("P value: ", str(p_value))
    # Hand the statistics back so callers can collect them programmatically.
    return spear_coef, p_value

In [3]:
# Evaluate the models stored under ../../compass/2-slices on the full lemma
# list and write a per-word report for each model type.
for model_type in ['sgns', 'cbow']:
    print('Testing ' + model_type + ' based Word2Vec')
    print('===========================')
    model_path_before = '../../compass/2-slices/model-' + model_type + '/1954-1978.model'
    model_before = Word2Vec.load(model_path_before)
    model_path_after = '../../compass/2-slices/model-' + model_type + '/1979-2003.model'
    model_after = Word2Vec.load(model_path_after)
    # `with` guarantees the report file is closed even if word_compare raises.
    with open('chi_wug/compass-' + model_type + '.txt', 'w', encoding='utf-8') as f:
        word_compare(model_before, model_after, lemma, f)
    print('...........................')

Testing sgns based Word2Vec
Spearman Score: 0.40328408334518023
P value:  0.009870551741013287
...........................
Testing cbow based Word2Vec
Spearman Score: 0.31630308165579396
P value:  0.04676514067721062
...........................


In [2]:
# Evaluate the models stored under ../../alignment/2-slices on the full lemma
# list and write a per-word report for each model type.
for model_type in ['sgns', 'cbow']:
    print('Testing ' + model_type + ' based Word2Vec')
    print('===========================')
    model_path_before = '../../alignment/2-slices/model-' + model_type + '/1954-1978.model'
    model_before = Word2Vec.load(model_path_before)
    model_path_after = '../../alignment/2-slices/model-' + model_type + '/1979-2003.model'
    model_after = Word2Vec.load(model_path_after)
    # `with` guarantees the report file is closed even if word_compare raises.
    with open('chi_wug/alignment-' + model_type + '.txt', 'w', encoding='utf-8') as f:
        word_compare(model_before, model_after, lemma, f)
    print('...........................')

Testing sgns based Word2Vec
Spearman Score: 0.4182031548323565
P value:  0.007244593660021611
...........................
Testing cbow based Word2Vec
Spearman Score: 0.48257097269526794
P value:  0.001612647685297571
...........................


The models perform especially poorly on one-character "words"; this may be due to the nature of Chinese, in which the boundary between characters and words can be blurry. Therefore, we also remove the one-character test examples and evaluate on the rest.

In [4]:
# Keep only the entries whose word is longer than one character.
lemma_no_ch = [entry for entry in lemma if len(entry[0]) >= 2]

# Score the ../../compass/2-slices models on that subset (no report file).
for model_type in ('sgns', 'cbow'):
    print(f'Testing {model_type} based Word2Vec')
    print('===========================')
    model_path_before = f'../../compass/2-slices/model-{model_type}/1954-1978.model'
    model_before = Word2Vec.load(model_path_before)
    model_path_after = f'../../compass/2-slices/model-{model_type}/1979-2003.model'
    model_after = Word2Vec.load(model_path_after)
    word_compare(model_before, model_after, lemma_no_ch, f=None)
    print('...........................')

Testing sgns based Word2Vec
Spearman Score: 0.6780776613193285
P value:  7.338397139077492e-05
...........................
Testing cbow based Word2Vec
Spearman Score: 0.626044238197086
P value:  0.0003660212465805694
...........................


In [5]:
# Score the ../../alignment/2-slices models on the multi-character subset
# (no report file is written).
for model_type in ('sgns', 'cbow'):
    print(f'Testing {model_type} based Word2Vec')
    print('===========================')
    model_path_before = f'../../alignment/2-slices/model-{model_type}/1954-1978.model'
    model_before = Word2Vec.load(model_path_before)
    model_path_after = f'../../alignment/2-slices/model-{model_type}/1979-2003.model'
    model_after = Word2Vec.load(model_path_after)
    word_compare(model_before, model_after, lemma_no_ch, f=None)
    print('...........................')

Testing sgns based Word2Vec
Spearman Score: 0.7240861617642587
P value:  1.3263676117849011e-05
...........................
Testing cbow based Word2Vec
Spearman Score: 0.7101192955577621
P value:  2.3065946929965612e-05
...........................


Finally, we evaluate on the one-character examples only.

In [6]:
# Keep only the entries whose word is exactly one character long.
lemma_only_ch = [entry for entry in lemma if len(entry[0]) == 1]

# Score the ../../compass/2-slices models on that subset (no report file).
for model_type in ('sgns', 'cbow'):
    print(f'Testing {model_type} based Word2Vec')
    print('===========================')
    model_path_before = f'../../compass/2-slices/model-{model_type}/1954-1978.model'
    model_before = Word2Vec.load(model_path_before)
    model_path_after = f'../../compass/2-slices/model-{model_type}/1979-2003.model'
    model_after = Word2Vec.load(model_path_after)
    word_compare(model_before, model_after, lemma_only_ch, f=None)
    print('...........................')

Testing sgns based Word2Vec
Spearman Score: 0.027972027972027972
P value:  0.9312343512018808
...........................
Testing cbow based Word2Vec
Spearman Score: 0.21678321678321683
P value:  0.49855598552418856
...........................


In [7]:
# Keep only the entries whose word is exactly one character long.
lemma_only_ch = [entry for entry in lemma if len(entry[0]) == 1]

# Score the ../../alignment/2-slices models on that subset (no report file).
for model_type in ('sgns', 'cbow'):
    print(f'Testing {model_type} based Word2Vec')
    print('===========================')
    model_path_before = f'../../alignment/2-slices/model-{model_type}/1954-1978.model'
    model_before = Word2Vec.load(model_path_before)
    model_path_after = f'../../alignment/2-slices/model-{model_type}/1979-2003.model'
    model_after = Word2Vec.load(model_path_after)
    word_compare(model_before, model_after, lemma_only_ch, f=None)
    print('...........................')

Testing sgns based Word2Vec
Spearman Score: -0.06293706293706294
P value:  0.8459309212287789
...........................
Testing cbow based Word2Vec
Spearman Score: 0.18181818181818185
P value:  0.5717012385276553
...........................
