Test the quality of the static (compass) and temporal (target) word embeddings using a word-similarity task and a word-analogy task.

In [1]:
import csv

# Results file: the header row is written once below, and static_eval()
# appends one row of scores per evaluated model.
csv_file = 'static_eval.csv'

In [2]:
# Column layout of the results file: three identifying columns, the two
# word-similarity scores, then MRR / P@1 / P@5 / P@10 for every analogy
# category and for all categories pooled together ("total").
fieldnames = [
    'model', 'type', 'slice',
    '240', '297',
    'capital_mrr', 'capital_mp1', 'capital_mp5', 'capital_mp10',
    'city_mrr', 'city_mp1', 'city_mp5', 'city_mp10',
    'family_mrr', 'family_mp1', 'family_mp5', 'family_mp10',
    'total_mrr', 'total_mp1', 'total_mp5', 'total_mp10',
]

# Mode 'w' (re)creates the file, so it contains only the header row afterwards.
with open(csv_file, mode='w', newline='', encoding='utf-8') as header_out:
    csv.writer(header_out).writerow(fieldnames)

In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from gensim.models.word2vec import Word2Vec


def cosine_sim(x, y):
    """Return the cosine similarity of two vectors.

    Computed as dot(x, y) divided by the product of the Euclidean norms of
    x and y. No special handling is done for zero-length vectors.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product


def word_sim(model):
    """Score a model on the two word-similarity datasets.

    For each of 'word_sim/240.txt' and 'word_sim/297.txt', every non-blank
    line is read as two words followed by a reference similarity value
    (whitespace-separated; only the first three fields are used). Pairs with
    a word missing from ``model.wv`` are skipped. The score of a dataset is
    the Spearman correlation between the reference values and the cosine
    similarities of the model's vectors.

    Args:
        model: object exposing ``wv``, which must support ``word in wv`` and
            ``wv[word]`` (e.g. a gensim Word2Vec model).

    Returns:
        A two-element list: the Spearman coefficient for '240', then '297'.
    """
    sim_scores = []

    for sim in ['240', '297']:
        word_sim_std = []  # reference similarity values
        word_sim_pre = []  # similarities predicted by the model

        # The context manager closes the file even if a line fails to parse.
        with open('word_sim/' + sim + '.txt', 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. trailing ones) carry no pair.
                    continue
                word1, word2, val_str = fields[0:3]
                if word1 in model.wv and word2 in model.wv:
                    word_sim_std.append(float(val_str))
                    word_sim_pre.append(cosine_sim(model.wv[word1], model.wv[word2]))

        spear_coef, _ = stats.spearmanr(word_sim_std, word_sim_pre)
        sim_scores.append(spear_coef)

    return sim_scores


def reciprocal_rank(neighbours, target_word):
    """Return 1 / rank of ``target_word`` in ``neighbours`` (ranks start at 1).

    The first matching position is used; 0.0 is returned when the target
    word does not occur in ``neighbours``.
    """
    matches = (
        1.0 / position
        for position, candidate in enumerate(neighbours, start=1)
        if candidate == target_word
    )
    return next(matches, 0.0)


def precision(neighbours, target_word, k):
    """Return 1.0 if ``target_word`` is among the first ``k`` neighbours, else 0.0."""
    top_k = neighbours[:k]
    return 1.0 if target_word in top_k else 0.0


def _mean_or_nan(total, count):
    """Return total / count, or NaN when nothing was counted."""
    return total / count if count else float('nan')


def word_analogy(model):
    """Score a model on the three word-analogy datasets.

    For each of 'word_analogy/capital.txt', 'city.txt' and 'family.txt',
    every non-blank line is read as four whitespace-separated words
    ``word1 word2 word3 word4``. An analogy is tested only when the first
    three words are in ``model.wv``; the model's neighbours for
    ``word2 + word3 - word1`` are then compared with ``word4``.

    Args:
        model: object exposing ``wv``, which must support ``word in wv`` and
            ``wv.most_similar(positive=..., negative=...)`` returning
            (word, score) pairs (e.g. a gensim Word2Vec model).

    Returns:
        A flat list of 16 floats: mean reciprocal rank and mean precision
        at 1, 5 and 10 for 'capital', 'city' and 'family' (in that order),
        followed by the same four means over all tested analogies pooled.
        A group of four is NaN when no analogy could be tested for it,
        instead of raising ZeroDivisionError.
    """
    reciprocal_all = 0
    precision_1_all = 0
    precision_5_all = 0
    precision_10_all = 0
    total_all = 0
    analogy_scores = []

    for ana in ['capital', 'city', 'family']:
        test_pair_num = 0
        reciprocal = 0
        precision_1 = 0
        precision_5 = 0
        precision_10 = 0

        # The context manager closes the file even if a line fails to parse.
        with open('word_analogy/' + ana + '.txt', 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. trailing ones) carry no analogy.
                    continue
                word1, word2, word3, word4 = fields[0:4]
                if word1 in model.wv and word2 in model.wv and word3 in model.wv:
                    test_pair_num += 1
                    # most_similar is called with its default result size, so a
                    # target outside that list scores 0 on every metric.
                    predicted = model.wv.most_similar(positive=[word2, word3], negative=[word1])
                    neighbours = [pair[0] for pair in predicted]
                    reciprocal += reciprocal_rank(neighbours, word4)
                    precision_1 += precision(neighbours, word4, 1)
                    precision_5 += precision(neighbours, word4, 5)
                    precision_10 += precision(neighbours, word4, 10)

        reciprocal_all += reciprocal
        precision_1_all += precision_1
        precision_5_all += precision_5
        precision_10_all += precision_10
        total_all += test_pair_num
        analogy_scores.extend([
            _mean_or_nan(reciprocal, test_pair_num),
            _mean_or_nan(precision_1, test_pair_num),
            _mean_or_nan(precision_5, test_pair_num),
            _mean_or_nan(precision_10, test_pair_num),
        ])

    analogy_scores.extend([
        _mean_or_nan(reciprocal_all, total_all),
        _mean_or_nan(precision_1_all, total_all),
        _mean_or_nan(precision_5_all, total_all),
        _mean_or_nan(precision_10_all, total_all),
    ])

    return analogy_scores


def static_eval(model, m_name, m_type, m_slice):
    """Evaluate one model and append its scores to the results CSV.

    The appended row holds the model name, model type and slice, followed by
    the word_sim() scores and then the word_analogy() scores, in that order.
    """
    similarity_scores = word_sim(model)
    analogy_scores = word_analogy(model)
    record = [m_name, m_type, m_slice, *similarity_scores, *analogy_scores]

    with open(csv_file, mode='a', newline='', encoding='utf-8') as results_out:
        csv.writer(results_out).writerow(record)

In [12]:
import pandas as pd
import matplotlib.pyplot as plt


def _plot_metric(ax, series, column, title, ylabel, ticks, labels):
    """Draw one panel: `column` against slice, one line per (label, rows) entry."""
    for line_label, rows in series:
        ax.plot(rows['slice'], rows[column], label=line_label)
    ax.set_title(title)
    ax.set_xlabel('year slice')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=45)


def draw_graph(slices, folder, ticks, labels, csv_path='static_eval.csv'):
    """Plot the evaluation scores of every model over the given slices.

    Reads the results CSV, keeps the rows whose 'slice' is in `slices`, and
    draws one line per distinct (model, type) combination. Five PNG files
    are written to `folder`: 'word_sim.png' (two panels, columns '240' and
    '297') and one 2x2 figure each for 'capital', 'city', 'family' and
    'total' (MRR, MP@1, MP@5, MP@10).

    Args:
        slices: slice names to include (matched against the 'slice' column).
        folder: output path prefix; file names are appended to it verbatim,
            so a directory must end with a path separator.
        ticks: x tick positions applied to every panel.
        labels: x tick labels, one per tick.
        csv_path: results file to read; defaults to 'static_eval.csv'.
    """
    import os  # local import keeps this cell runnable on its own

    df = pd.read_csv(csv_path)

    # savefig does not create missing directories.
    out_dir = os.path.dirname(folder)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Select and sort the rows of each model/type combination once; the same
    # series feed every panel of every figure.
    in_slices = df['slice'].isin(slices)
    combinations = df[['model', 'type']].drop_duplicates()
    series = []
    for _, combo in combinations.iterrows():
        model_name = combo['model']
        model_type = combo['type']
        mask = (df['model'] == model_name) & (df['type'] == model_type) & in_slices
        series.append((f'{model_name}-{model_type}', df[mask].sort_values(by='slice')))

    # Word-similarity figure: one panel per dataset.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
    for ax, dataset in zip(axes, ['240', '297']):
        _plot_metric(ax, series, dataset, 'Word Similarity ' + dataset,
                     'Spearman Correlation', ticks, labels)
    fig.tight_layout()
    fig.savefig(folder + 'word_sim.png')
    plt.close(fig)

    # Analogy figures: one 2x2 figure per category, one panel per metric.
    # Panel order follows axes.flat: top-left, top-right, bottom-left, bottom-right.
    metric_panels = [
        ('_mrr', ' MRR', 'Mean Reciprocal Rank'),
        ('_mp1', ' MP@1', 'Mean Precision @ 1'),
        ('_mp5', ' MP@5', 'Mean Precision @ 5'),
        ('_mp10', ' MP@10', 'Mean Precision @ 10'),
    ]
    for analogy in ['capital', 'city', 'family', 'total']:
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
        for ax, (suffix, title_suffix, ylabel) in zip(axes.flat, metric_panels):
            _plot_metric(ax, series, analogy + suffix,
                         'Analogy ' + analogy.capitalize() + title_suffix,
                         ylabel, ticks, labels)
        fig.tight_layout()
        fig.savefig(folder + analogy + '.png')
        plt.close(fig)

Test the quality of the embeddings trained on the two time slices (1954-1978 and 1979-2003).

In [6]:
# Evaluate every model variant on both time slices; each call appends one
# row to the results CSV.
for year_slice in ('1954-1978', '1979-2003'):
    for model_name in ('compass', 'alignment'):
        for model_type in ('sgns', 'cbow'):
            model_path = f'../../{model_name}/2-slices/model-{model_type}/{year_slice}.model'
            model = Word2Vec.load(model_path)
            static_eval(model, model_name, model_type, year_slice)

In [13]:
# Two slices plotted at x positions 0 and 1, labelled 'before' and 'after'.
two_slices = ['1954-1978', '1979-2003']
ticks, labels = [0, 1], ['before', 'after']
draw_graph(slices=two_slices, folder='graphic/2-slices/', ticks=ticks, labels=labels)

Test the quality of the 5-year-slice and 1-year-slice embeddings.

In [8]:
# Evaluate every model variant on each 5-year slice (1945-1949 ... 2020-2024).
for i in range(1945, 2025, 5):
    year_slice = f'{i}-{i + 4}'
    for model_name in ('compass', 'alignment'):
        for model_type in ('sgns', 'cbow'):
            model_path = f'../../{model_name}/5-year/model-{model_type}/{year_slice}.model'
            model = Word2Vec.load(model_path)
            static_eval(model, model_name, model_type, year_slice)

In [14]:
# One tick per 5-year slice, labelled with the slice's first year.
start_years = range(1945, 2025, 5)
five_year = [f'{year}-{year + 4}' for year in start_years]
labels = [f'{year}' for year in start_years]
ticks = range(len(labels))
draw_graph(slices=five_year, folder='graphic/5-year/', ticks=ticks, labels=labels)

In [10]:
# Evaluate every model variant on each single-year slice (1946 ... 2023).
for i in range(1946, 2024):
    year_slice = f'{i}'
    for model_name in ('compass', 'alignment'):
        for model_type in ('sgns', 'cbow'):
            model_path = f'../../{model_name}/1-year/model-{model_type}/{year_slice}.model'
            model = Word2Vec.load(model_path)
            static_eval(model, model_name, model_type, year_slice)

In [15]:
one_year = [str(i) for i in range(1946, 2024)]
# draw_graph plots the slice names as categories in sorted order, so
# (assuming the 'slice' column is read as strings, which it is when the CSV
# also holds 'YYYY-YYYY' slices) year y sits at x position y - 1946.
# Ticks are placed on every fifth year so that each label matches the year
# actually plotted at that position.
label_years = range(1950, 2024, 5)
labels = [str(year) for year in label_years]
ticks = [year - 1946 for year in label_years]
draw_graph(one_year, 'graphic/1-year/', ticks, labels)

Compute the Spearman correlation between the word-similarity test scores, the year, and the total word count of each year.

In [14]:
import pandas as pd
import re
from scipy.stats import spearmanr

def is_numeric(value):
    """Return True when str(value) consists solely of one or more digits."""
    text = str(value)
    return re.fullmatch(r'\d+', text) is not None

# Per-year corpus statistics; the columns 'year' and 'total_length' are used below.
raw_stats = pd.read_csv('../../preprocess/raw_stats/raw_stats.csv')

# Keep only the single-year rows of the evaluation results (slice names made
# of digits only). The explicit copy makes the filtered frame independent, so
# adding the 'year' column does not trigger SettingWithCopyWarning.
stat_eval = pd.read_csv('static_eval.csv')
stat_eval = stat_eval[stat_eval['slice'].apply(is_numeric)].copy()
stat_eval['year'] = stat_eval['slice'].astype(int)

# Inner join: one row per evaluated model and year that also has corpus statistics.
merged_data = pd.merge(stat_eval, raw_stats, on='year')

In [17]:
# Rank correlation between the year and that year's total word count.
years = raw_stats['year']
word_counts = raw_stats['total_length']
corr_year_length, _ = spearmanr(years, word_counts)
print('The Spearman correlation between year and word count is', corr_year_length)

The Spearman correlation between year and word count is 0.7548274510299826


In [16]:
# (output column, score column, reference column) for each correlation reported.
score_reference_pairs = [
    ('240_vs_year', '240', 'year'),
    ('297_vs_year', '297', 'year'),
    ('240_vs_wordcount', '240', 'total_length'),
    ('297_vs_wordcount', '297', 'total_length'),
]

correlation_results = []

# One record per model/type combination, in order of first appearance.
for model in merged_data['model'].unique():
    model_rows = merged_data['model'] == model
    for type_ in merged_data['type'].unique():
        subset = merged_data[model_rows & (merged_data['type'] == type_)]

        record = {'model': model, 'type': type_}
        for result_key, score_col, reference_col in score_reference_pairs:
            record[result_key], _ = spearmanr(subset[score_col], subset[reference_col])
        correlation_results.append(record)

correlation_df = pd.DataFrame(correlation_results)
correlation_df.to_csv('correlation.csv', index=False)