In [8]:
import csv

# Results file for the temporal-analogy evaluation: a later cell writes the header row
# to this path and temp_ana_eval() appends one row per model/type/depth.
csv_file = 'temp_analogy.csv'

In [24]:
# Columns of 'facts/politicians.csv' that are evaluated, one per political office.
politicians = [
    'ch_leader', 'ch_premier', 'us_president', 'us_secretary', 'uk_minister',
    'fr_president', 'de_chancellor', 'jp_minister', 'kr_president',
]

# Header layout: three identifying columns, then four metric columns per office,
# then the same four metrics pooled over all offices ('total').
metric_suffixes = ('mrr', 'mp1', 'mp5', 'mp10')
fieldnames = ['model', 'type', 'depth'] + [
    f'{prefix}_{suffix}'
    for prefix in politicians + ['total']
    for suffix in metric_suffixes
]

# Mode 'w' truncates the results file, so running this cell discards earlier rows.
with open(csv_file, 'w', newline='', encoding='utf-8') as header_file:
    csv.DictWriter(header_file, fieldnames=fieldnames).writeheader()

In [23]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from gensim.models.word2vec import Word2Vec


def reciprocal_rank(neighbours, target_words):
    """Return the reciprocal rank of the first neighbour that is a target word.

    Ranks are 1-based and follow the order of `neighbours`. When no neighbour
    appears in `target_words` the result is 0.0.
    """
    hits = (
        1.0 / position
        for position, candidate in enumerate(neighbours, start=1)
        if candidate in target_words
    )
    return next(hits, 0.0)


def precision(neighbours, target_words, k):
    """Return 1.0 if any of the first `k` neighbours is a target word, else 0.0.

    A neighbour counts as a hit only when it is equal to one of the entries of
    `target_words`, which is the same membership rule `reciprocal_rank` uses.
    (A substring test would let a neighbour such as "ab" match target "abc".)

    Args:
        neighbours: ranked list of candidate words, best first.
        target_words: list of accepted answers.
        k: number of top-ranked neighbours to inspect.
    """
    if any(neighbour in target_words for neighbour in neighbours[:k]):
        return 1.0
    return 0.0


def temporal_analogy(models, base_year=1946, max_depth=80, step=5,
                     facts_file='facts/politicians.csv'):
    """Score temporal analogies between office holders of different years.

    For every column listed in the module-level `politicians`, each year whose
    cell holds a single name (no '/') is used as a query: the name's vector in
    that year's model is looked up and its nearest neighbours are retrieved
    from the model of every other year whose distance from the query year lies
    in the band `(depth - step, depth]`. The neighbours are scored against the
    '/'-separated names of the target year with `reciprocal_rank` and
    `precision` at 1, 5 and 10.

    Args:
        models: list of models exposing `.wv`; `models[i]` is the model for
            year `base_year + i`.
        base_year: year of `models[0]`.
        max_depth: largest year distance evaluated (inclusive).
        step: width of each distance band and increment between depths.
        facts_file: CSV with a 'year' column and one column per office.

    Returns:
        One list per depth: `[depth, mrr, mp1, mp5, mp10, ...]` with four values
        per column in `politicians` order, followed by the four values pooled
        over all columns. A group of four is `None` when it has no test pairs.

    Notes:
        Only the 10 nearest neighbours are retrieved, so a target ranked lower
        than 10 contributes 0 to the reciprocal rank.
        Years for which `models` holds no entry are skipped; indexing the list
        directly would wrap around to its end for years before `base_year`.
    """
    facts = pd.read_csv(facts_file)
    model_count = len(models)

    def model_for(year):
        # Return the model covering `year`, or None when it is out of range.
        offset = int(year) - base_year
        if 0 <= offset < model_count:
            return models[offset]
        return None

    # The non-empty rows of a column do not depend on the depth: build them once.
    rows_by_column = {
        column: facts.loc[facts[column].notna(), ['year', column]]
        for column in politicians
    }

    analogy_scores_list = []
    for depth in range(step, max_depth + 1, step):
        pooled_sums = [0.0, 0.0, 0.0, 0.0]  # mrr, p@1, p@5, p@10 over all columns
        pooled_pairs = 0
        analogy_scores = [depth]

        for column in politicians:
            rows = rows_by_column[column]
            sums = [0.0, 0.0, 0.0, 0.0]
            pairs = 0

            for index, row in rows.iterrows():
                search_year = row['year']
                candidates = row[column].split('/')
                # Cells with several names are never used as queries, only as targets.
                if len(candidates) != 1:
                    continue
                search_word = candidates[0]
                search_model = model_for(search_year)
                if search_model is None or search_word not in search_model.wv:
                    continue
                search_vector = search_model.wv[search_word]

                # Other years whose distance falls in the band (depth - step, depth].
                distance = (rows['year'] - search_year).abs()
                in_band = (
                    (rows.index != index)
                    & (distance <= depth)
                    & (distance > depth - step)
                )

                for _, other_row in rows[in_band].iterrows():
                    target_model = model_for(other_row['year'])
                    if target_model is None:
                        continue
                    pairs += 1
                    target_words = other_row[column].split('/')
                    predicted = target_model.wv.most_similar([search_vector], topn=10)
                    neighbours = [word for word, _ in predicted]
                    sums[0] += reciprocal_rank(neighbours, target_words)
                    sums[1] += precision(neighbours, target_words, 1)
                    sums[2] += precision(neighbours, target_words, 5)
                    sums[3] += precision(neighbours, target_words, 10)

            if pairs != 0:
                pooled_sums = [total + part for total, part in zip(pooled_sums, sums)]
                pooled_pairs += pairs
                analogy_scores.extend(part / pairs for part in sums)
            else:
                analogy_scores.extend([None, None, None, None])

        if pooled_pairs != 0:
            analogy_scores.extend(total / pooled_pairs for total in pooled_sums)
        else:
            analogy_scores.extend([None, None, None, None])

        analogy_scores_list.append(analogy_scores)

    return analogy_scores_list


def temp_ana_eval(models, m_name, m_type):
    """Run `temporal_analogy` on `models` and append its rows to `csv_file`.

    Each appended row is `[m_name, m_type]` followed by one score list returned
    by `temporal_analogy` (depth first, then the metric values), so the values
    line up positionally with the header written to `csv_file`.

    Args:
        models: list of models passed through to `temporal_analogy`.
        m_name: value for the 'model' column.
        m_type: value for the 'type' column.
    """
    rows = [[m_name, m_type, *scores] for scores in temporal_analogy(models)]

    # Open the results file once instead of once per row.
    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)

In [25]:
import pandas as pd
import matplotlib.pyplot as plt


def draw_graph():
    """Plot every metric against depth, one 2x2 figure per politician column.

    Reads 'temp_analogy.csv' and, for each column in the module-level
    `politicians` plus 'total', draws MRR, MP@1, MP@5 and MP@10 with one line
    per distinct (model, type) pair. Each figure is saved as
    'graphic/<column>.png'; the 'graphic' directory is created if missing.
    """
    import os

    df = pd.read_csv('temp_analogy.csv')

    combinations = df[['model', 'type']].drop_duplicates()
    columns = politicians + ['total']

    # (column suffix, title suffix, y-axis label), in row-major panel order.
    panels = [
        ('mrr', 'MRR', 'Mean Reciprocal Rank'),
        ('mp1', 'MP@1', 'Mean Precision @ 1'),
        ('mp5', 'MP@5', 'Mean Precision @ 5'),
        ('mp10', 'MP@10', 'Mean Precision @ 10'),
    ]

    # The per-(model, type) subsets are the same for every figure: build them once.
    series = []
    for _, combo in combinations.iterrows():
        model_name = combo['model']
        model_type = combo['type']
        subset = df[(df['model'] == model_name) & (df['type'] == model_type)]
        series.append((f'{model_name}-{model_type}', subset.sort_values(by='depth')))

    os.makedirs('graphic', exist_ok=True)

    for column in columns:
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

        for ax, (suffix, short_name, long_name) in zip(axes.flat, panels):
            for label, subset in series:
                ax.plot(subset['depth'], subset[column + '_' + suffix], label=label)
            ax.set_title('Analogy ' + column + ' ' + short_name)
            ax.set_xlabel('depth')
            ax.set_ylabel(long_name)
            ax.legend()

        fig.tight_layout()
        fig.savefig('graphic/' + column + '.png')
        # Close this specific figure so repeated iterations do not accumulate memory.
        plt.close(fig)

In [12]:
modeldir = '../../compass/1-year/model-sgns/'

# One SGNS model per year from 1946 through 2023, kept in chronological order
# so that index i corresponds to year 1946 + i.
models_sgns = [
    Word2Vec.load(f'{modeldir}{model_year}.model')
    for model_year in range(1946, 2024)
]

In [26]:
# Evaluate the SGNS models. Rows are appended to the results CSV, so re-running this
# cell without rewriting the header first adds duplicate rows.
temp_ana_eval(models_sgns, 'compass', 'sgns')

In [14]:
modeldir = '../../compass/1-year/model-cbow/'

# One CBOW model per year from 1946 through 2023, kept in chronological order
# so that index i corresponds to year 1946 + i.
models_cbow = [
    Word2Vec.load(f'{modeldir}{model_year}.model')
    for model_year in range(1946, 2024)
]

In [27]:
# Evaluate the CBOW models. Rows are appended to the results CSV, so re-running this
# cell without rewriting the header first adds duplicate rows.
temp_ana_eval(models_cbow, 'compass', 'cbow')

In [28]:
# Render one figure per politician column (plus 'total') from 'temp_analogy.csv'.
draw_graph()

Draw all politicians on the same graph for comparison.

In [29]:
import pandas as pd
import matplotlib.pyplot as plt

def draw_compare(model_name='compass', model_type='cbow', output_file='compare.png'):
    """Plot all politician columns together for one model, one panel per metric.

    Reads 'temp_analogy.csv', keeps the rows whose 'model' and 'type' match the
    arguments, and draws MRR, MP@1, MP@5 and MP@10 against depth on a 2x2 grid
    with one line per politician column.

    Args:
        model_name: value of the 'model' column to plot.
        model_type: value of the 'type' column to plot.
        output_file: path the figure is saved to.
    """
    df = pd.read_csv('temp_analogy.csv')

    # Rows of the requested model, ordered by depth (the same for every panel).
    subset = df[(df['model'] == model_name) & (df['type'] == model_type)]
    subset = subset.sort_values(by='depth')

    # Define the columns to plot (politicians)
    columns = ['ch_leader', 'ch_premier', 'us_president', 'us_secretary', 'uk_minister',
               'fr_president', 'de_chancellor', 'jp_minister', 'kr_president']

    # (column suffix, panel title, y-axis label) in row-major panel order. The
    # labels are spelled out: taking the last character of 'mp10' would give '0'.
    panels = [
        ('mrr', 'Analogy MRR', 'Mean Reciprocal Rank'),
        ('mp1', 'Analogy MP@1', 'Mean Precision @ 1'),
        ('mp5', 'Analogy MP@5', 'Mean Precision @ 5'),
        ('mp10', 'Analogy MP@10', 'Mean Precision @ 10'),
    ]

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

    for ax, (metric, title, ylabel) in zip(axes.flat, panels):
        for col in columns:
            ax.plot(subset['depth'], subset[f'{col}_{metric}'], label=col)

        ax.set_title(title)
        ax.set_xlabel('depth')
        ax.set_ylabel(ylabel)
        ax.legend()

    fig.tight_layout()
    fig.savefig(output_file)
    plt.close(fig)

# Run the function: plots the 'compass' / 'cbow' rows of 'temp_analogy.csv' and saves 'compare.png'
draw_compare()

Count the frequency of politician names each year.

In [22]:
import pandas as pd
import os

# Paths to the files and folders
politicians_file = 'facts/politicians.csv'
frequency_folder = '../../corpus/frequency'
output_file = 'politician_freq.csv'

# Read the politicians CSV file
politicians_df = pd.read_csv(politicians_file, encoding='utf-8')

# All years listed in the politicians CSV file, in file order
years = politicians_df['year'].tolist()

columns = ['ch_leader', 'ch_premier', 'us_president', 'us_secretary', 'uk_minister',
           'fr_president', 'de_chancellor', 'jp_minister', 'kr_president']

# One output list per column. 'year' gets its own copy of the list so that the
# 'average' label appended at the end does not also end up in `years`.
result = {'year': list(years)}
for col in columns:
    result[col] = []

for year in years:
    # Frequency file for the current year: one "<word> <count>" pair per line
    frequency_file = os.path.join(frequency_folder, f'{year}.txt')
    # NOTE: when the frequency file is missing, every column keeps a count of 0,
    # including columns that list no name for this year.
    year_counts = {col: 0 for col in columns}

    if os.path.exists(frequency_file):
        frequency_data = {}
        with open(frequency_file, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines (e.g. a trailing newline) carry no data; skip them
                # instead of failing to unpack.
                if not line.strip():
                    continue
                word, count = line.split()
                frequency_data[word] = int(count)

        # Look the year's row up once rather than once per column
        year_row = politicians_df.loc[politicians_df['year'] == year].iloc[0]

        for col in columns:
            names = year_row[col]

            if pd.isna(names):
                # No name listed for this office and year: record a missing value
                year_counts[col] = None
            else:
                # Several names may share a cell, separated by '/'; sum their counts
                year_counts[col] = sum(
                    frequency_data.get(name, 0) for name in names.split('/')
                )

    # Append the results for the current year to the result dictionary
    for col in columns:
        result[col].append(year_counts[col])

# Per-column average over the years that have a value (missing values excluded)
average_row = []
for col in columns:
    values = [val for val in result[col] if pd.notna(val)]
    average_row.append(sum(values) / len(values) if values else 0)

# Add the average row to the result, labelled 'average' in the year column
result['year'].append('average')
for col, average in zip(columns, average_row):
    result[col].append(average)

# Convert the result dictionary to a DataFrame and save it as a CSV file
result_df = pd.DataFrame(result)
result_df.to_csv(output_file, index=False, encoding='utf-8')

Count the yearly frequency of "韩国" and "南朝鲜" (two names used for South Korea).

In [19]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Presumably selected so the Chinese legend labels render in the plot
plt.rcParams['font.sans-serif'] = ['SimHei']

# Define the words to search for
words_to_search = ["韩国", "南朝鲜"]

# One list of yearly counts per word, plus the matching list of years
frequency_data = {word: [] for word in words_to_search}
frequency_data['year'] = []

# Path to the frequency folder
frequency_folder = '../../corpus/frequency'

# Iterate over each year file in the folder, in sorted file-name order
for year_file in sorted(os.listdir(frequency_folder)):
    if not year_file.endswith('.txt'):
        continue

    year = int(year_file.split('.')[0])  # Extract the year from the file name

    # Initialize the counts for the current year
    year_counts = {word: 0 for word in words_to_search}

    with open(os.path.join(frequency_folder, year_file), 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no data; skip them
            # instead of failing to unpack.
            if not line.strip():
                continue
            word, count = line.split()

            # Update the counts if the word is one of the target words
            if word in year_counts:
                year_counts[word] += int(count)

    # Record the year only after its file was read, so the year list and the
    # count lists always have the same length.
    frequency_data['year'].append(year)
    for word in words_to_search:
        frequency_data[word].append(year_counts[word])

# Convert the result dictionary to a DataFrame for easier manipulation
df = pd.DataFrame(frequency_data)

In [20]:
# Draw one frequency curve per search word and save the chart to disk
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
for word in words_to_search:
    ax.plot(df['year'], df[word], label=word)

ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
ax.legend()
fig.savefig('south_korea.png')
plt.close(fig)

Visualize the neighbourhood/path of South Korean presidents on 5-year slices.

In [14]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.manifold import TSNE
import csv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from adjustText import adjust_text


# Colours used by neighbour_path(): entry i colours the i-th model's name and its
# neighbours. There are 15 entries.
color_list = [
    'limegreen', 'forestgreen', 'mediumseagreen', 'darkcyan',
    'turquoise', 'deepskyblue', 'dodgerblue', 'royalblue', 'darkblue', 'darkslateblue',
    'rebeccapurple', 'blueviolet', 'darkmagenta', 'orchid', 'plum'
]


def adjust_figsize(x, y, max_pixels=20, min_size=1):
    """Derive a (width, height) figure size from the extent of the plotted points.

    The size starts as the coordinate ranges of `x` and `y`. If the larger range
    exceeds `max_pixels`, both are scaled down by the same ratio and truncated
    to integers. Each dimension is then raised to at least `min_size`, because
    truncation (or identical coordinates) can otherwise produce a zero-sized
    dimension.

    Args:
        x: non-empty sequence of x coordinates.
        y: non-empty sequence of y coordinates.
        max_pixels: upper bound for the larger dimension (the result is passed
            as `figsize` by the caller).
        min_size: lower bound applied to each dimension.

    Returns:
        Tuple `(width, height)`.
    """
    width = max(x) - min(x)
    height = max(y) - min(y)
    longest = max(width, height)
    if longest > max_pixels:
        ratio = max_pixels / longest
        width = int(width * ratio)
        height = int(height * ratio)
    return max(width, min_size), max(height, min_size)


def neighbour_path(models, names, rs=49, start_year=1950, step=5, n_shown=5):
    """
    Search for the nearest neighbours for target names at different times
    Plot the path of name vectors across time in their neighbourhood

    For the i-th model, `names[i]` is looked up and its 10 nearest neighbours
    are written to 'kr_neighbour.csv' (columns: year, name, neighbour1..10).
    The name vector and its first `n_shown` neighbours from every model are then
    projected to 2D with t-SNE and drawn in 'kr_path.png': name points are
    labelled "<name><year>" and connected in model order, and each model's
    points share one colour from `color_list`.

    Args:
        models: list of models exposing `.wv`, in chronological order.
        names: one target word per model; `names[i]` must exist in `models[i]`.
        rs: random state passed to t-SNE.
        start_year: year label of the first model.
        step: number of years between consecutive models.
        n_shown: how many of the nearest neighbours are plotted per model.
    """
    import inspect

    csv_file = 'kr_neighbour.csv'
    fieldnames = ['year', 'name'] + ['neighbour' + str(i + 1) for i in range(10)]

    labels = []
    vectors = []
    anchors = []  # position in `labels` of each model's target name

    # Keep the CSV open for the whole loop instead of reopening it for every row.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)

        for i, model in enumerate(models):
            year = start_year + i * step
            name = names[i]
            anchors.append(len(labels))
            labels.append(name + str(year))
            vectors.append(model.wv[name])
            neighbours = model.wv.most_similar(name, topn=10)

            writer.writerow([year, name] + [pair[0] for pair in neighbours])

            for word, _ in neighbours[:n_shown]:
                labels.append(word)
                vectors.append(model.wv[word])

    # Depending on the installed scikit-learn, the iteration-count argument is
    # called `max_iter` or `n_iter`; use whichever this TSNE accepts.
    tsne_params = inspect.signature(TSNE.__init__).parameters
    iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
    # Stays at 30 unless there are too few points for that perplexity.
    perplexity = min(30, max(1, len(vectors) - 1))
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca',
                      random_state=rs, **{iter_kwarg: 1000})
    flattened_2d = tsne_model.fit_transform(vectors)

    x = [point[0] for point in flattened_2d]
    y = [point[1] for point in flattened_2d]

    # The path connects the target-name points in model order.
    x_line = [x[anchor] for anchor in anchors]
    y_line = [y[anchor] for anchor in anchors]

    plt.figure(figsize=adjust_figsize(x, y))
    plt.axis('off')
    plt.plot(x_line, y_line, color='steelblue')

    bold_font = FontProperties('Noto Sans SC', weight='bold')
    regular_font = FontProperties('Noto Sans SC')

    texts = []
    for group, anchor in enumerate(anchors):
        # A group runs from its target name up to the next group's target name.
        group_end = anchors[group + 1] if group + 1 < len(anchors) else len(labels)
        # Reuse colours from the start if there are more models than colours.
        color = color_list[group % len(color_list)]

        plt.scatter(x[anchor], y[anchor], s=40, c=color)
        texts.append(plt.annotate(labels[anchor], xy=(x[anchor], y[anchor]),
            fontproperties=bold_font, fontsize=25, color=color))

        for index in range(anchor + 1, group_end):
            plt.scatter(x[index], y[index], s=10, c=color)
            texts.append(plt.annotate(labels[index], xy=(x[index], y[index]),
                fontproperties=regular_font, fontsize=15, color=color))

    adjust_text(texts, iter_lim=1000)
    plt.tight_layout()
    plt.savefig('kr_path.png', bbox_inches='tight')
    plt.close()

In [4]:
from gensim.models.word2vec import Word2Vec

modeldir = '../../compass/5-year/model-cbow/'

# One CBOW model per 5-year slice, named '<start>-<start + 4>.model', for slices
# starting at 1950, 1955, ..., 2020, kept in chronological order.
models_5 = [
    Word2Vec.load(f'{modeldir}{slice_start}-{slice_start + 4}.model')
    for slice_start in range(1950, 2025, 5)
]

In [15]:
# Target name for each 5-year model in `models_5`, in the same chronological order:
# neighbour_path() pairs names[i] with the i-th model, labelling slices from 1950 in
# steps of 5 years.
names = ['李承晚', '李承晚', '朴正熙', '朴正熙', '朴正熙', '朴正熙', '全斗焕',
         '全斗焕', '卢泰愚', '金泳三', '金大中', '卢武铉', '李明博', '朴槿惠', '文在寅']

# Writes 'kr_neighbour.csv' and 'kr_path.png'
neighbour_path(models_5, names)