In [11]:
import os
import re
import jieba

# Paths read by this notebook.
stopfile = '../../corpus/stopwords.txt'
sentdir = '../../corpus/sent/'
# Paths written by this notebook.
yearlydir = '../../corpus/1-year/'
frequencydir = '../../corpus/frequency/'
wordcountdir = '../../corpus/wordcount.txt'  # a file path, despite the "dir" suffix

# One stopword per line. Strip all surrounding whitespace (not just '\n') so an
# entry with trailing spaces still matches a token, and drop blank lines so
# they do not become '' entries.
with open(stopfile, 'r', encoding='utf-8') as sf:
    stopword_list = [word for word in (line.strip() for line in sf) if word]

# exist_ok=True avoids the check-then-create race of exists() + makedirs().
for newdir in [yearlydir, frequencydir]:
    os.makedirs(newdir, exist_ok=True)

In [14]:
def segment_and_write(inputdir, outputdir, countdir, stopwords=None, tokenizer=None):
    """Tokenize a text file line by line and write the kept tokens and their counts.

    Every non-blank line of the input file is tokenized. A token is kept when it
    consists only of ``.``, ``-``, ASCII letters/digits and characters in the
    range U+4E00..U+9FA5, does not start with ``.``, ``-`` or a digit, and is
    not a stopword. Kept tokens are written space-separated, one output line per
    input line that still has at least one token. A ``word count`` table, sorted
    by descending count (ties keep first-seen order), is written to the count file.

    Note: despite their names, ``inputdir``, ``outputdir`` and ``countdir`` are
    file paths, not directories.

    Args:
        inputdir: Path of the UTF-8 text file to read.
        outputdir: Path of the UTF-8 file receiving the filtered tokens.
        countdir: Path of the UTF-8 file receiving the frequency table.
        stopwords: Optional iterable of tokens to drop. Defaults to the
            module-level ``stopword_list``.
        tokenizer: Optional callable mapping a string to an iterable of tokens.
            Defaults to ``jieba.lcut``.

    Returns:
        int: Total number of tokens kept across the whole file.
    """
    if stopwords is None:
        stopwords = stopword_list
    # Build a set once: membership is then O(1) per token instead of a linear
    # scan over the stopword list.
    stopwords = frozenset(stopwords)
    if tokenizer is None:
        tokenizer = jieba.lcut

    # Raw strings so that "\." / "\-" / "\d" reach the regex engine without
    # relying on Python keeping invalid string escapes; compiled once, not per token.
    disallowed_char = re.compile(r'[^\.\-0-9a-zA-Z\u4e00-\u9fa5]+')
    numeric_prefix = re.compile(r'[\.\-\d]+')

    counts = {}
    total_word_count = 0
    with open(inputdir, 'r', encoding='utf-8') as infile, \
            open(outputdir, 'w', encoding='utf-8') as outfile:
        for line in infile:
            sentence = line.strip()
            if not sentence:
                continue
            kept = []
            for word in tokenizer(sentence):
                # Drop tokens containing any disallowed character, and tokens
                # that begin with '.', '-' or a digit (numbers, dates, ...).
                if disallowed_char.search(word) or numeric_prefix.match(word):
                    continue
                if word in stopwords:
                    continue
                kept.append(word)
                counts[word] = counts.get(word, 0) + 1
            if kept:
                total_word_count += len(kept)
                outfile.write(' '.join(kept) + '\n')

    # sorted() is stable, so equally frequent words stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    with open(countdir, 'w', encoding='utf-8') as countfile:
        for word, freq in ranked:
            countfile.write(f'{word} {freq}\n')

    return total_word_count


def process_all_years(input_folder, output_folder, count_folder):
    """Run ``segment_and_write`` on every file in a folder and record the totals.

    For each regular, non-hidden file in ``input_folder`` the text before the
    first ``.`` of its name is used as the label (named ``year`` here). The
    segmented text goes to ``<output_folder>/<label>.txt`` and the frequency
    table to ``<count_folder>/<label>.txt``. Finally one ``label total`` line
    per processed file is written to the module-level ``wordcountdir`` path.

    Args:
        input_folder: Folder containing the input text files.
        output_folder: Folder receiving the segmented files (created if missing).
        count_folder: Folder receiving the frequency tables (created if missing).

    Returns:
        None
    """
    for folder in (output_folder, count_folder):
        if folder:  # '' means the current directory; nothing to create
            os.makedirs(folder, exist_ok=True)

    year_count = {}
    # os.listdir() returns entries in arbitrary order; sort so that the
    # word-count file is written in a reproducible order.
    for filename in sorted(os.listdir(input_folder)):
        input_file = os.path.join(input_folder, filename)
        # Skip hidden entries and sub-directories (e.g. .DS_Store,
        # .ipynb_checkpoints): they are not corpus files, and would either
        # yield an empty label or make open() fail.
        if filename.startswith('.') or not os.path.isfile(input_file):
            continue
        year = filename.split('.')[0]
        output_file = os.path.join(output_folder, f'{year}.txt')
        count_file = os.path.join(count_folder, f'{year}.txt')
        year_count[year] = segment_and_write(input_file, output_file, count_file)

    with open(wordcountdir, 'w', encoding='utf-8') as wordcountfile:
        for year, count in year_count.items():
            wordcountfile.write(f'{year} {count}\n')

In [15]:
# Run the pipeline: read from sentdir, write segmented text to yearlydir and
# frequency tables to frequencydir.
process_all_years(
    input_folder=sentdir,
    output_folder=yearlydir,
    count_folder=frequencydir,
)