In [2]:
from gensim.models.word2vec import Word2Vec

# Directory with the saved Word2Vec models, one file per period,
# named '<start>-<start + 4>.model'.
modeldir = '../../compass/5-year/model-cbow/'

# Loaded in ascending period order: 1945-1949, 1950-1954, ..., 2020-2024.
models = []
for start_year in range(1945, 2025, 5):
    fiveyear = f'{modeldir}{start_year}-{start_year + 4}.model'
    model = Word2Vec.load(fiveyear)
    models.append(model)

In [4]:
import numpy as np
import csv

def cosine_dist(x, y):
    """Return the cosine distance between two vectors.

    The result is ``1 - (x . y) / (||x|| * ||y||)``: the dot product of the
    inputs divided by the product of their norms, subtracted from one.

    Args:
        x: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        y: Second vector, same length as ``x``.

    Returns:
        The cosine distance as a NumPy scalar.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    similarity = np.dot(x, y) / norm_product
    return 1 - similarity

# Every whitespace-separated token of the vocabulary file, in file order.
with open('../../corpus/vocab_filter.txt', encoding='utf-8') as f:
    word_list = [token for line in f for token in line.split()]

# One row per word: [word, max dist, sum dist, first-last dist].
results = []

for word in word_list:
    # Vectors are collected in the order of `models`, skipping every model
    # whose vocabulary lacks the word. Neighbouring entries of this list are
    # therefore consecutive *available* vectors, not necessarily consecutive
    # models.
    word_vectors = [model.wv[word] for model in models if word in model.wv]

    # A distance needs two vectors. Words found in fewer than two models are
    # left out of `results`; without this guard max() below raises ValueError
    # on an empty list.
    if len(word_vectors) < 2:
        continue

    # Cosine distance between each vector and the next one in the list
    distances = [
        cosine_dist(earlier, later)
        for earlier, later in zip(word_vectors, word_vectors[1:])
    ]

    # Largest single step and total path length
    max_dist = max(distances)
    sum_dist = sum(distances)

    # Distance between the first and the last available vector. The indices
    # are 0 and -1 (not 1 and -2) so the value matches the 'first-last dist'
    # column it is written to.
    first_last_dist = cosine_dist(word_vectors[0], word_vectors[-1])

    results.append([word, max_dist, sum_dist, first_last_dist])

csv_file = 'word_distances.csv'
header = ['word', 'max dist', 'sum dist', 'first-last dist']

# newline='' leaves line-ending handling to the csv module.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row first, then one row per entry of `results`.
    writer.writerows([header, *results])

Automatically append one code cell per filtered word to judgement.ipynb and draw.ipynb

In [1]:
import nbformat

# Notebook that receives the generated cells, and the word list driving them
notebook_path = 'judgement.ipynb'
words_file_path = 'filtered_words.txt'

# Parse the existing notebook as nbformat version 4
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

# One word per line, surrounding whitespace removed
with open(words_file_path, 'r', encoding='utf-8') as f:
    words = [raw_line.strip() for raw_line in f]

# Append one `print_neighbourhood("<word>")` code cell per word, in file order
notebook.cells.extend(
    nbformat.v4.new_code_cell(f'print_neighbourhood("{word}")')
    for word in words
)

# Overwrite the notebook file with the extended cell list
with open(notebook_path, 'w', encoding='utf-8') as f:
    nbformat.write(notebook, f)

In [1]:
import nbformat

# Notebook that receives the generated cells, and the word list driving them
notebook_path = 'draw.ipynb'
words_file_path = 'filtered_words.txt'

# Parse the existing notebook as nbformat version 4
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

# One word per line, surrounding whitespace removed
with open(words_file_path, 'r', encoding='utf-8') as f:
    words = [raw_line.strip() for raw_line in f]

# Append one `neighbour_path("<word>")` code cell per word, in file order
notebook.cells.extend(
    nbformat.v4.new_code_cell(f'neighbour_path("{word}")')
    for word in words
)

# Overwrite the notebook file with the extended cell list
with open(notebook_path, 'w', encoding='utf-8') as f:
    nbformat.write(notebook, f)

Extract entries from dictionary .mdx file

In [55]:
from readmdict import MDX, MDD
from bs4 import BeautifulSoup
from collections import defaultdict

filename = '../../corpus/dict.mdx'

# Maps each key yielded by the MDX reader to the list of its decoded values;
# a key that is yielded more than once keeps all of its values, in order.
items_dict = defaultdict(list)
for headword, raw_entry in MDX(filename).items():
    items_dict[headword].append(raw_entry.decode())

# Plain-dict copy of the grouped entries; this is the mapping get_entry reads.
items = dict(items_dict)

def get_entry(word):
    """Return the plain-text dictionary entry for ``word``.

    The word is encoded to bytes and looked up in the module-level ``items``
    mapping. Each stored value is parsed as HTML and reduced to text.

    Args:
        word: Headword to look up, as a ``str``.

    Returns:
        ``'-'`` if ``word`` has no entry. Otherwise the cleaned text of every
        stored value for the word, concatenated without a separator. For each
        value the text of the ``lya`` span (if any) comes first, followed by a
        space and then the remaining text with the title span and metadata div
        removed.
    """
    key = word.encode()
    if key not in items:
        return '-'

    cleaned_text = []

    for text in items[key]:
        # Parse the HTML content
        soup = BeautifulSoup(text, 'html.parser')

        # Work on the <main> element. find() returns None when the value has
        # no <main> tag; fall back to the whole parsed document in that case
        # instead of failing on None.find(...).
        main_content = soup.find('main')
        if main_content is None:
            main_content = soup

        # Remove the first title span and the first metadata div, if present
        title_element = main_content.find('span', class_='title')
        if title_element is not None:
            title_element.decompose()
        metadata_element = main_content.find('div', class_='metadata')
        if metadata_element is not None:
            metadata_element.decompose()

        # Pull the 'lya' span out so its text can be placed in front of the
        # rest, then remove it so it is not repeated in the remaining content
        lya_element = main_content.find('span', class_='lya')
        if lya_element is not None:
            lya_text = lya_element.get_text(strip=True) + ' '
            lya_element.decompose()
        else:
            lya_text = ''

        # Join the remaining stripped text fragments into a single string
        cleaned_text.append(lya_text + ''.join(main_content.stripped_strings))

    return ''.join(cleaned_text)

In [56]:
# Input word list and output file for the extracted dictionary entries
words_file_path = 'filtered_words.txt'
dict_entry_path = 'dict_entry.txt'

# One word per line, surrounding whitespace removed
with open(words_file_path, 'r', encoding='utf-8') as f:
    words = [raw_line.strip() for raw_line in f]

# One entry per word, each followed by a newline, in the same order as `words`
with open(dict_entry_path, 'w', encoding='utf-8') as file:
    file.writelines(get_entry(word) + '\n' for word in words)