In [2]:
from gensim.models.word2vec import Word2Vec

# Folder holding one saved CBOW model per year, named "<year>.model".
modeldir = '../../compass/1-year/model-cbow/'
# Filled in chronological order: models_cbow[k] is the model for year 1946 + k.
models_cbow = []

# range() excludes its end value, so this loads the years 1946 through 2023.
for i in range(1946, 2024):
    # NOTE: despite its name, `year` holds the path of that year's model file.
    year = f'{modeldir}{i}.model'
    model = Word2Vec.load(year)
    models_cbow.append(model)

In [6]:
import os

# Folder (relative to the working directory) that collects the CSV exports.
csv_folder = 'analogy_words'

# Create the folder when it is missing; an existing one is left untouched.
os.makedirs(name=csv_folder, exist_ok=True)

In [14]:
import pandas as pd

def analogy(models, search_word, search_year, *, output_dir=None, start_year=1946):
    """
    Save each yearly model's 10 nearest words to one word's vector as a CSV.

    The vector of ``search_word`` is read from the model belonging to
    ``search_year`` and is then used as the query against every model in
    ``models``. One row is written per model: the model's year followed by
    the columns ``top 1`` .. ``top 10``. The file is named
    ``<search_word>_<search_year>.csv``.

    Args:
        models (list): Word2vec models in consecutive yearly order; the first
            entry belongs to ``start_year``.
        search_word (str): The word whose vector is used as the query.
        search_year (int): Year of the model the query vector is taken from.
        output_dir (str, optional): Folder the CSV file is written to; it is
            created when missing. When omitted, the module-level
            ``csv_folder`` is used.
        start_year (int): Year corresponding to ``models[0]``. Defaults to
            1946.

    Raises:
        IndexError: If ``search_year`` has no corresponding entry in
            ``models``.
    """
    top_n = 10

    # Reject out-of-range years explicitly: a negative index would otherwise
    # wrap around and silently take the vector from the wrong year's model.
    index = search_year - start_year
    if not 0 <= index < len(models):
        raise IndexError(
            f"search_year {search_year} is outside the years covered by "
            f"models ({start_year}-{start_year + len(models) - 1})"
        )

    # Only fall back to the module-level folder when no folder was passed in.
    folder = csv_folder if output_dir is None else output_dir
    csv_file = os.path.join(folder, f"{search_word}_{search_year}.csv")

    # Look the word up in the models that were passed in, not in a global
    # list. NOTE(review): gensim is expected to raise KeyError when the word
    # is missing from that model's vocabulary - confirm against gensim docs.
    query_vector = models[index].wv[search_word]

    headers = ['year'] + [f"top {rank}" for rank in range(1, top_n + 1)]
    data = []
    for year, model in enumerate(models, start=start_year):
        neighbours = [
            word for word, _ in model.wv.most_similar([query_vector], topn=top_n)
        ]
        # Pad short results so every row matches the fixed header width.
        neighbours += [None] * (top_n - len(neighbours))
        data.append([year] + neighbours)

    df = pd.DataFrame(data, columns=headers)

    # Make sure the target folder exists before writing.
    os.makedirs(folder, exist_ok=True)
    df.to_csv(csv_file, index=False)

In [15]:
# Export the yearly neighbours of the 2023 vector of '电脑' (computer).
analogy(models=models_cbow, search_word='电脑', search_year=2023)

In [16]:
# Export the yearly neighbours of the 2023 vector of '手机' (mobile phone).
analogy(models=models_cbow, search_word='手机', search_year=2023)

In [22]:
# Export the yearly neighbours of the 2020 vector of 'CD'.
analogy(models=models_cbow, search_word='CD', search_year=2020)

In [23]:
# Export the yearly neighbours of the 2023 vector of '电动车' (electric vehicle).
analogy(models=models_cbow, search_word='电动车', search_year=2023)

In [24]:
# Export the yearly neighbours of the 2023 vector of '微信' (WeChat).
analogy(models=models_cbow, search_word='微信', search_year=2023)

In [25]:
# Export the yearly neighbours of the 2023 vector of '微博' (Weibo).
analogy(models=models_cbow, search_word='微博', search_year=2023)