In [2]:
import csv

# Results CSV: its header row is written by the header cell, and temp_ana_eval appends the score rows
csv_file = 'temp_analogy.csv'

In [3]:
# Names of the fact columns to evaluate; each one also prefixes four result columns
politicians = ['ch_chairman', 'ch_premier', 'us_president', 'us_secretary',
               'uk_minister', 'fr_president', 'de_premier', 'jp_premier', 'kr_president']

# Header layout: three identifying columns, then mrr/mp1/mp5/mp10 for every
# politician column, and finally the same four metrics under the 'total' prefix
fieldnames = ['model', 'type', 'depth']
for prefix in politicians + ['total']:
    fieldnames += [f'{prefix}_{metric}' for metric in ('mrr', 'mp1', 'mp5', 'mp10')]

# Mode 'w' truncates the file, so it starts out holding only the header row
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    csv.DictWriter(f, fieldnames=fieldnames).writeheader()

In [11]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from gensim.models.word2vec import Word2Vec


def reciprocal_rank(neighbours, target_words):
    """Return 1 / rank of the first neighbour that is one of the target words.

    Ranks are 1-based and follow the order of ``neighbours``. When no
    neighbour is contained in ``target_words`` the result is 0.0.
    """
    first_hit = next(
        (position
         for position, candidate in enumerate(neighbours, start=1)
         if candidate in target_words),
        None,
    )
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit


def precision(neighbours, target_words, k):
    """Return 1.0 if any target word is among the first ``k`` neighbours, else 0.0."""
    top_k = neighbours[:k]
    hit = any(candidate in top_k for candidate in target_words)
    return 1.0 if hit else 0.0


def temporal_analogy(models):
    """Score temporal analogies between yearly models at growing year distances.

    Facts are read from ``facts/politicians.csv``; the columns evaluated are
    those named in the module-level ``politicians`` list. For every depth in
    5, 10, ..., 80 and every column:

    * a row is used as a query only if its cell holds a single word (no ``/``)
      and that word is present in the model of the row's year;
    * the query vector is compared against the model of every *other* non-empty
      row of the same column whose year distance ``d`` satisfies
      ``depth - 5 < d <= depth``;
    * the neighbours returned by ``most_similar`` (called with its default
      settings, so the list length is gensim's default ``topn`` - presumably
      10, confirm) are scored against the ``/``-separated words of that other
      row with ``reciprocal_rank`` and ``precision`` at 1, 5 and 10.

    Args:
        models: sequence indexed by ``year - 1946`` whose items expose ``.wv``.

    Returns:
        A list with one entry per depth, each of the form
        ``[depth, <mrr, mp1, mp5, mp10 per column>..., <the same four pooled
        over all columns>]``. A group without any test pair holds four ``None``.
    """
    facts = pd.read_csv('facts/politicians.csv')
    results = []

    for depth in range(5, 85, 5):
        depth_row = [depth]
        pooled_sums = [0, 0, 0, 0]
        pooled_pairs = 0

        for column in politicians:
            column_sums = [0, 0, 0, 0]
            pair_count = 0
            known = facts[facts[column].notna()][['year', column]]

            for query_index, query in known.iterrows():
                query_year = query['year']
                variants = query[column].split('/')
                # Only unambiguous cells (a single spelling) are used as queries
                if len(variants) != 1:
                    continue
                query_word = variants[0]

                # NOTE(review): assumes every year in the facts file lies within
                # the range covered by `models`; a year before 1946 would wrap
                # around through negative list indexing - confirm against the data.
                query_vectors = models[query_year - 1946].wv
                if query_word not in query_vectors:
                    continue
                query_vector = query_vectors[query_word]

                # Other rows whose distance in years falls inside (depth - 5, depth]
                year_gap = abs(known['year'] - query_year)
                in_window = known[(known.index != query_index) &
                                  (year_gap <= depth) &
                                  (year_gap > depth - 5)]

                for _, candidate in in_window.iterrows():
                    candidate_year = candidate['year']
                    pair_count += 1
                    accepted = candidate[column].split('/')
                    hits = models[candidate_year - 1946].wv.most_similar([query_vector])
                    neighbours = [hit[0] for hit in hits]
                    scores = (
                        reciprocal_rank(neighbours, accepted),
                        precision(neighbours, accepted, 1),
                        precision(neighbours, accepted, 5),
                        precision(neighbours, accepted, 10),
                    )
                    column_sums = [running + score
                                   for running, score in zip(column_sums, scores)]

            if pair_count != 0:
                pooled_sums = [running + part
                               for running, part in zip(pooled_sums, column_sums)]
                pooled_pairs += pair_count
                depth_row.extend(total / pair_count for total in column_sums)
            else:
                depth_row.extend([None] * 4)

        if pooled_pairs != 0:
            depth_row.extend(total / pooled_pairs for total in pooled_sums)
        else:
            depth_row.extend([None] * 4)

        results.append(depth_row)

    return results


def temp_ana_eval(models, m_name, m_type):
    """Run the temporal analogy evaluation and append its rows to the results CSV.

    Every list returned by ``temporal_analogy(models)`` becomes one row of the
    form ``[m_name, m_type, *scores]`` appended to ``csv_file``.

    Args:
        models: passed unchanged to ``temporal_analogy``.
        m_name: value written to the first column of every appended row.
        m_type: value written to the second column of every appended row.
    """
    # Finish the whole evaluation before touching the file
    rows = [[m_name, m_type, *scores] for scores in temporal_analogy(models)]

    # Open the file once for the whole batch instead of re-opening it per row
    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)

In [16]:
import pandas as pd
import matplotlib.pyplot as plt


def draw_graph():
    """Plot each metric against depth and save one PNG per column group.

    Reads ``temp_analogy.csv``. For every entry of the module-level
    ``politicians`` list plus ``'total'`` a 2x2 figure is drawn (MRR, MP@1,
    MP@5, MP@10) with one line per distinct (model, type) pair found in the
    file. Each figure is saved as ``graphic/<column>.png``; the ``graphic``
    directory is created first if it does not exist.
    """
    import os  # local import: only needed to create the output directory

    df = pd.read_csv('temp_analogy.csv')

    combinations = df[['model', 'type']].drop_duplicates()
    columns = politicians + ['total']

    # (CSV column suffix, title suffix, y-axis label), listed in the row-major
    # order of the 2x2 grid: top-left, top-right, bottom-left, bottom-right
    panels = [
        ('_mrr', 'MRR', 'Mean Reciprocal Rank'),
        ('_mp1', 'MP@1', 'Mean Precision @ 1'),
        ('_mp5', 'MP@5', 'Mean Precision @ 5'),
        ('_mp10', 'MP@10', 'Mean Precision @ 10'),
    ]

    # The rows of each (model, type) pair do not depend on the plotted column,
    # so select and sort them once up front
    series = []
    for _, row in combinations.iterrows():
        model_name = row['model']
        model_type = row['type']
        subset = df[(df['model'] == model_name) & (df['type'] == model_type)]
        series.append((f'{model_name}-{model_type}', subset.sort_values(by='depth')))

    # savefig does not create missing directories
    os.makedirs('graphic', exist_ok=True)

    for column in columns:
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

        for ax, (suffix, title_suffix, y_label) in zip(axes.flat, panels):
            for label, subset in series:
                ax.plot(subset['depth'], subset[column + suffix], label=label)
            ax.set_title('Analogy ' + column + ' ' + title_suffix)
            ax.set_xlabel('depth')
            ax.set_ylabel(y_label)
            ax.legend()

        fig.tight_layout()
        fig.savefig('graphic/' + column + '.png')
        # Close this specific figure so figures do not pile up across iterations
        plt.close(fig)

In [8]:
# Load the SGNS model of every year from 1946 through 2023, in chronological
# order, so that the list position of a model is its year minus 1946
modeldir = '../../compass/1-year/model-sgns/'
models_sgns = [Word2Vec.load(f'{modeldir}{year}.model') for year in range(1946, 2024)]

In [12]:
# Evaluate the SGNS models; the appended rows are tagged model='compass', type='sgns'
temp_ana_eval(models_sgns, 'compass', 'sgns')

In [13]:
# Load the CBOW model of every year from 1946 through 2023, in chronological
# order, so that the list position of a model is its year minus 1946
modeldir = '../../compass/1-year/model-cbow/'
models_cbow = [Word2Vec.load(f'{modeldir}{year}.model') for year in range(1946, 2024)]

In [14]:
# Evaluate the CBOW models; the appended rows are tagged model='compass', type='cbow'
temp_ana_eval(models_cbow, 'compass', 'cbow')

In [17]:
# Plot the rows accumulated in temp_analogy.csv and save one PNG per column under graphic/
draw_graph()