In [2]:
from gensim.models.word2vec import Word2Vec

modeldir = '../../compass/1-year/model-cbow/'

# Load one saved Word2Vec model per year, 1946 through 2023, in chronological
# order, so that list index 0 corresponds to 1946.
models_cbow = [
    Word2Vec.load(f"{modeldir}{model_year}.model")
    for model_year in range(1946, 2024)
]

In [6]:
import os

# Folder that holds the per-word CSV files produced by analogy()
csv_folder = 'analogy_words'

# Create the folder up front; exist_ok=True makes re-running this cell harmless
os.makedirs(csv_folder, exist_ok=True)

In [14]:
import pandas as pd

def analogy(models, search_word, search_year, csv_folder='analogy_words',
            start_year=1946, topn=10):
    """
    Save, for every yearly model, the words closest to one reference vector.

    The vector of ``search_word`` is taken from the model of ``search_year``.
    Each model in ``models`` is then asked for the ``topn`` words most similar
    to that fixed vector, and the results are written to
    ``<csv_folder>/<search_word>_<search_year>.csv`` with one row per year.
    NOTE(review): comparing one vector against every model presumably relies
    on the models sharing an aligned embedding space -- confirm.

    Args:
        models (list): Word2Vec models sorted by year, one per consecutive
            year, the first one belonging to ``start_year``.
        search_word (str): Word whose vector is used as the query.
        search_year (int): Year of the model the query vector is read from.
        csv_folder (str): Folder the CSV file is written to (created if
            missing).
        start_year (int): Year of ``models[0]``.
        topn (int): Number of neighbours stored per year.

    Raises:
        ValueError: If ``search_year`` is not covered by ``models``.
    """
    # Validate explicitly: a negative offset would otherwise silently wrap
    # around and pick a model from the end of the list.
    offset = search_year - start_year
    if not 0 <= offset < len(models):
        raise ValueError(
            f"search_year {search_year} is outside the covered range "
            f"{start_year}-{start_year + len(models) - 1}"
        )

    # Read the query vector from the `models` argument, not from a global.
    query_vector = models[offset].wv[search_word]

    headers = ['year'] + [f"top {rank}" for rank in range(1, topn + 1)]
    rows = []
    for year, model in enumerate(models, start=start_year):
        neighbours = [
            word for word, _ in model.wv.most_similar([query_vector], topn=topn)
        ]
        # A small vocabulary can yield fewer than `topn` hits; pad so every
        # row matches the header width.
        neighbours += [None] * (topn - len(neighbours))
        rows.append([year] + neighbours)

    if csv_folder:
        os.makedirs(csv_folder, exist_ok=True)
    csv_path = os.path.join(csv_folder, f"{search_word}_{search_year}.csv")
    pd.DataFrame(rows, columns=headers).to_csv(csv_path, index=False)

In [15]:
# Query every yearly model with the 2023 vector of '电脑' (computer)
analogy(models=models_cbow, search_word='电脑', search_year=2023)

In [16]:
# Query every yearly model with the 2023 vector of '手机' (mobile phone)
analogy(models=models_cbow, search_word='手机', search_year=2023)

In [22]:
# Query every yearly model with the 2020 vector of 'CD'
analogy(models=models_cbow, search_word='CD', search_year=2020)

In [23]:
# Query every yearly model with the 2023 vector of '电动车' (electric vehicle)
analogy(models=models_cbow, search_word='电动车', search_year=2023)

In [24]:
# Query every yearly model with the 2023 vector of '微信' (WeChat)
analogy(models=models_cbow, search_word='微信', search_year=2023)

In [25]:
# Query every yearly model with the 2023 vector of '微博' (Weibo)
analogy(models=models_cbow, search_word='微博', search_year=2023)

In [26]:
# Query every yearly model with the 2022 vector of '俄乌' (Russia-Ukraine)
analogy(models=models_cbow, search_word='俄乌', search_year=2022)

In [42]:
# Query every yearly model with the 2022 vector of '新冠' (COVID-19)
analogy(models=models_cbow, search_word='新冠', search_year=2022)

Note: In the generated word occurrence .txt file, we manually removed some lines and saved it separately.

In [43]:
import pandas as pd
import os

def word_occurrence(csv_file, min_occurrence=1):
    """
    Collect the years in which each word occurs and save them to a .txt file.

    The output file is named after ``csv_file`` (same base name, ``.txt``
    extension) and is written to the current working directory. Each line has
    the form ``word: year, year, ...``; words appear in first-seen order.

    Args:
        csv_file (str): Path to a CSV file with a ``year`` column followed by
            the word columns (top 1 to top 10).
        min_occurrence (int): Keep only words recorded in at least this many
            rows. The default of 1 keeps every word.
    """
    df = pd.read_csv(csv_file)

    # Map each word to the list of years in which it was seen. The year is
    # read from its own column so it keeps the column's dtype; every column
    # from index 1 onward is treated as a word column.
    word_years = {}
    word_rows = df.iloc[:, 1:].itertuples(index=False, name=None)
    for year, words in zip(df['year'], word_rows):
        for word in words:
            if pd.notna(word):  # empty cells are read back as NaN
                word_years.setdefault(word, []).append(year)

    # Apply the minimum-occurrence threshold
    kept = {
        word: years
        for word, years in word_years.items()
        if len(years) >= min_occurrence
    }

    # Derive the output name from the input name
    file_name, _ = os.path.splitext(os.path.basename(csv_file))
    output_file = f'{file_name}.txt'

    with open(output_file, 'w', encoding='utf-8') as f:
        for word, years in kept.items():
            f.write(f'{word}: {", ".join(map(str, years))}\n')

In [95]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

def _read_occurrences(path):
    """Parse a ``word: year, year, ...`` file into ``{word: [year, ...]}``.

    Blank lines and empty year tokens are skipped. A non-blank line without
    the ``": "`` separator raises ValueError. If a word appears on several
    lines, the last line wins.
    """
    occurrences = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for line_no, line in enumerate(handle, 1):
            if not line.strip():
                continue  # hand-edited files may contain empty lines
            word, sep, years_text = line.rpartition(": ")
            if not sep:
                raise ValueError(
                    f"{path}:{line_no}: expected 'word: year, year, ...', "
                    f"got {line!r}"
                )
            tokens = years_text.strip().strip('[]').split(',')
            occurrences[word] = [int(token) for token in tokens if token.strip()]
    return occurrences


def _read_synonym_map(path):
    """Parse ``target/synonym/synonym...`` lines into ``{synonym: target}``."""
    synonym_to_target = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            target, *synonyms = line.strip().split('/')
            for synonym in synonyms:
                synonym_to_target[synonym] = target
    return synonym_to_target


def plot_occurrence(search_words, occurrence, disambiguation):
    """
    Plot, per search word, the years in which it (or a synonym) occurs.

    The figure is saved to the current working directory under the base name
    of ``occurrence`` with a ``.png`` extension. The x-axis is fixed to
    1940-2030. Words are placed on the y-axis in the iteration order of
    ``search_words``, the first word getting the lowest position.

    Args:
        search_words (dict): Maps each word to plot to the text appended to
            its y-axis label (here: an English translation).
        occurrence (str): Path to a text file with ``word: year, year, ...``
            lines.
        disambiguation (str): Path to a text file whose lines have the form
            ``target/synonym/...``; occurrences of a synonym are counted
            towards its target word.

    Raises:
        ValueError: If the occurrence file contains a malformed line.
    """
    word_occurrences = _read_occurrences(occurrence)
    synonym_to_target = _read_synonym_map(disambiguation)

    # Merge synonym occurrences into their target words; words that are
    # neither a search word nor a synonym of one are ignored.
    merged_years = {word: set() for word in search_words}
    for word, years in word_occurrences.items():
        target_word = synonym_to_target.get(word, word)
        if target_word in merged_years:
            merged_years[target_word].update(years)
    combined_occurrences = {
        word: sorted(years) for word, years in merged_years.items()
    }

    color_list = [
        'lightgreen', 'limegreen', 'forestgreen', 'mediumseagreen', 'darkcyan',
        'turquoise', 'deepskyblue', 'dodgerblue', 'royalblue', 'darkblue',
        'darkslateblue', 'rebeccapurple', 'blueviolet', 'darkmagenta', 'orchid', 'plum'
    ]

    # One tick per word plus an empty padding tick below and above
    tick_positions = list(range(0, len(combined_occurrences) + 2))
    y_labels = (
        ['']
        + [f"{key} {''.join(search_words[key])}" for key in combined_occurrences]
        + ['']
    )

    # One point per (year, word); the colour cycles with the word's row
    x, y, colors = [], [], []
    for row, word in enumerate(combined_occurrences, 1):
        row_color = color_list[(row - 1) % len(color_list)]
        for year in combined_occurrences[word]:
            x.append(year)
            y.append(row)
            colors.append(row_color)

    # Prepare the output file name
    file_name, _ = os.path.splitext(os.path.basename(occurrence))
    output_file = f'{file_name}.png'

    fig = plt.figure(figsize=(6, 4), dpi=200)
    try:
        # Font for the Chinese labels; assumes 'Noto Sans SC' is installed
        this_font = FontProperties(family='Noto Sans SC')

        plt.yticks(tick_positions, y_labels, fontproperties=this_font, fontsize=8)
        plt.xlim(1940, 2030)
        xticks = list(range(1940, 2035, 10))
        plt.xticks(ticks=xticks, labels=[str(year) for year in xticks], fontsize=8)
        plt.grid(axis='x', linestyle=':')

        # A single scatter call draws all points at once
        if x:
            plt.scatter(x, y, s=10, c=colors)
        plt.tight_layout()

        plt.savefig(output_file)
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)

In [97]:
# Build the word -> years listing for the '新冠' (COVID-19) neighbours
word_occurrence(csv_file='analogy_words/新冠_2022.csv')

In [107]:
# Disease terms to plot, paired with their English label. The order of the
# pairs is kept by the dict and determines the row order in the plot.
search_disease = dict([
    ("伤寒", "Typhoid Fever"),
    ("天花", "Smallpox"),
    ("白喉", "Diphtheria"),
    ("鼠疫", "Plague"),
    ("脑炎", "Encephalitis"),
    ("痢疾", "Dysentery"),
    ("百日咳", "Whooping Cough"),
    ("疟疾", "Malaria"),
    ("结核", "Tuberculosis"),
    ("肺炎", "Pneumonia"),
    ("流感", "Influenza"),
    ("肝炎", "Hepatitis"),
    ("艾滋病", "AIDS"),
    ("禽流感", "Avian Influenza"),
    ("非典", "SARS"),
    ("新冠", "COVID-19"),
])

In [108]:
# Plot the disease terms from the (hand-edited) occurrence file
plot_occurrence(
    search_words=search_disease,
    occurrence='新冠_2022.txt',
    disambiguation='新冠_merge.txt',
)

In [44]:
# Build the word -> years listing for the '俄乌' (Russia-Ukraine) neighbours
word_occurrence(csv_file='analogy_words/俄乌_2022.csv')

In [109]:
# Conflict-related terms to plot, paired with their English label. The order
# of the pairs is kept by the dict and determines the row order in the plot.
search_conflict = dict([
    ('美苏', 'US-Soviet/Russia'),
    ('朝鲜', 'North Korea'),
    ('越南', 'Vietnam'),
    ('台湾', 'Taiwan'),
    ('中东', 'Middle East'),
    ('印巴', 'India-Pakistan'),
    ('黎巴嫩', 'Lebanon'),
    ('美伊', 'US-Iraq'),
    ('两伊', 'Iran-Iraq'),
    ('阿以', 'Arab-Israel'),
    ('中美洲', 'Central America'),
    ('阿富汗', 'Afghanistan'),
    ('巴以', 'Palestine-Israel'),
    ('乌克兰', 'Ukraine'),
    ('叙利亚', 'Syria'),
])

In [110]:
# Plot the conflict terms from the (hand-edited) occurrence file
plot_occurrence(
    search_words=search_conflict,
    occurrence='俄乌_2022.txt',
    disambiguation='俄乌_merge.txt',
)