Merge 1-year frequency files to 5-year.

In [2]:
import os
import pandas as pd

# Folder paths
input_folder = '../corpus/frequency'
output_folder = '../corpus/freq-5'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Dictionary to store data frames for each year
data_frames = {}

# Read each year's file
for year in range(1946, 2024):
    file_path = os.path.join(input_folder, f'{year}.txt')
    if os.path.isfile(file_path):
        # Read the file and convert data to DataFrame
        df = pd.read_csv(file_path, sep=' ', header=None, \
            names=['word', 'count'], dtype={'word': str, 'count': int})
        data_frames[year] = df

# Combine data by five-year periods
for start_year in range(1945, 2025, 5):
    end_year = start_year + 4
    combined_df = pd.DataFrame()

    # Merge data for all years in the specified period
    for year in range(start_year, end_year + 1):
        if year in data_frames:
            combined_df = combined_df.append(data_frames[year])

    # Group by 'word' and sum up the counts
    combined_df = combined_df.groupby('word', as_index=False).sum()
    # Sort the combined data by count in descending order
    combined_df = combined_df.sort_values(by='count', ascending=False)

    # Save the combined data to a file
    output_file = os.path.join(output_folder, f'{start_year}-{end_year}.txt')
    combined_df.to_csv(output_file, sep=' ', index=False, header=False)

In [4]:
word_all = []
word_list = []
countfile = '../corpus/freq-5/1945-1949.txt'

with open(countfile, encoding='utf-8') as cf:
    for line in cf:
        split_line = line.split()
        word_all.append(split_line[0])
        if int(split_line[1]) > 100:
            word_list.append(split_line[0])
        else:
            break

for i in range(1950, 2025, 5):
    temp_all = []
    temp_list = []
    countfile = '../corpus/freq-5/' + str(i) + '-' + str(i+4) +'.txt'
    with open(countfile, encoding='utf-8') as cf:
        for line in cf:
            split_line = line.split()
            temp_all.append(split_line[0])
            if int(split_line[1]) > 100:
                temp_list.append(split_line[0])
            else:
                break
    
    word_all = list(set(word_all) | set(temp_all))
    word_list = list(set(word_list) & set(temp_list))

f = open('../corpus/vocab_filter.txt','w',encoding='utf-8')
for word in word_list:
    f.write(word + " ")
f.close()

af = open('../corpus/vocab_all.txt','w',encoding='utf-8')
for word in word_all:
    af.write(word + " ")
af.close()

In [5]:
print("Total word count (every word that ever occurred):", len(word_all))
print("Filtered word count (occurring more than 100 times every 5 years):", len(word_list))

Total word count (every word that ever occurred): 70641
Filtered word count (occurring more than 100 times every 5 years): 5983


In [4]:
import re

non_chinese = [word for word in word_all if re.search('[^\u4e00-\u9fa5]', word)]

nf = open('../corpus/vocab_nonch.txt','w',encoding='utf-8')
for word in non_chinese:
    nf.write(word + " ")
nf.close()

In [5]:
print("Non-Chinese word count:", len(non_chinese))

Non-Chinese word count: 39235


We also look at the vocabulary size for trained Word2Vec models.

In [6]:
from gensim.models.word2vec import Word2Vec

model_path = '../alignment/1-year/model-sgns/2023.model'
model = Word2Vec.load(model_path)
print('Common vocabulary 1946-2023: ' + str(len(model.wv.vocab)))

vf = open('../corpus/vocab_common.txt','w',encoding='utf-8')
for word in model.wv.vocab:
    vf.write(word + " ")
vf.close()

Common vocabulary 1946-2023: 7865
