In [1]:
import os
import re
import jieba

# Corpus locations, relative to the directory the notebook is run from.
stopfile = '../../corpus/stopwords.txt'
sentdir = '../../corpus/sent/'
yearlydir = '../../corpus/1-year/'
frequencydir = '../../corpus/frequency/'

# One stopword per line of the file; only the newline is removed, so any other
# surrounding whitespace on a line stays part of the entry.
with open(stopfile, 'r', encoding='utf-8') as sf:
    stopword_list = [entry.strip('\n') for entry in sf]

# Make sure both output directories exist before any cell writes into them.
for newdir in (yearlydir, frequencydir):
    if os.path.exists(newdir):
        continue
    os.makedirs(newdir)

In [7]:
def segment_and_write(inputdir, outputdir, countdir):
    """Segment a text file with jieba and write the kept tokens and their counts.

    Every non-blank input line is cut with ``jieba.lcut``. A token is kept when,
    after stripping surrounding whitespace, it

    * is not empty,
    * consists only of '.', '-', ASCII letters/digits and characters in the
      range U+4E00..U+9FA5,
    * does not start with a digit, '.' or '-', and
    * is not in the module-level ``stopword_list``.

    Kept tokens are written space-separated, one output line per input line;
    input lines that yield no kept token produce no output line.

    Args:
        inputdir: Path of the UTF-8 text file to read (a file path, despite
            the parameter name).
        outputdir: Path of the file that receives the segmented lines.
        countdir: Path of the file that receives one "word count" line per
            distinct kept token, ordered by descending count.
    """
    # Raw strings avoid the invalid '\.' / '\-' escapes of a plain literal;
    # the re module itself understands the \uXXXX escapes.
    disallowed_chars = re.compile(r'[^\.\-0-9a-zA-Z\u4e00-\u9fa5]+')
    numeric_prefix = re.compile(r'[\.\-\d]+')
    # Hash lookup instead of scanning the stopword list once per token.
    stopwords = set(stopword_list)
    counts = {}

    with open(inputdir, 'r', encoding='utf-8') as infile, \
            open(outputdir, 'w', encoding='utf-8') as outfile:
        for line in infile:
            sentence = line.strip()
            if not sentence:
                continue

            kept = []
            for token in jieba.lcut(sentence):
                token = token.strip()
                # A whitespace-only token strips to '', which slips past both
                # regex filters; without this guard it would be counted as a
                # word and leave stray spaces in the output line.
                if not token:
                    continue
                if disallowed_chars.search(token) or numeric_prefix.match(token):
                    continue
                if token in stopwords:
                    continue
                kept.append(token)
                counts[token] = counts.get(token, 0) + 1

            if kept:
                outfile.write(' '.join(kept) + '\n')

    # Stable sort: tokens with equal counts stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    with open(countdir, 'w', encoding='utf-8') as countfile:
        for word, freq in ranked:
            countfile.write(word + ' ' + str(freq) + '\n')


def process_all_years(input_folder, output_folder, count_folder):
    """Run ``segment_and_write`` on every regular, non-hidden file in a folder.

    For each input file, the part of its name before the first '.' is used as
    the output stem: the segmented text goes to ``<output_folder>/<stem>.txt``
    and the frequency table to ``<count_folder>/<stem>.txt``.

    Args:
        input_folder: Directory containing the files to segment.
        output_folder: Directory for the segmented files; created if missing.
        count_folder: Directory for the frequency files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(count_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(input_folder)):
        input_file = os.path.join(input_folder, filename)
        # Skip hidden entries and subdirectories (e.g. .ipynb_checkpoints,
        # .DS_Store): they are not sentence files and cannot be segmented.
        if filename.startswith('.') or not os.path.isfile(input_file):
            continue

        year = filename.split('.')[0]
        output_file = os.path.join(output_folder, f'{year}.txt')
        count_file = os.path.join(count_folder, f'{year}.txt')
        segment_and_write(input_file, output_file, count_file)

In [8]:
# Segment every file in sentdir: the segmented text is written to yearlydir and
# the matching word-frequency tables to frequencydir.
process_all_years(sentdir, yearlydir, frequencydir)

In [7]:
# Legacy per-year pipeline: for every year, segment each file found under
# rawdir + '<year>年' and write one segmented file plus one frequency file.
# NOTE(review): `rawdir` is not assigned in any cell visible in this notebook;
# it has to be defined before this cell can run on a fresh kernel - confirm.

# Raw strings avoid the invalid '\.' / '\-' escapes of a plain literal.
disallowed_chars = re.compile(r'[^\.\-0-9a-zA-Z\u4e00-\u9fa5]+')
numeric_prefix = re.compile(r'[\.\-\d]+')
column_header = re.compile(r"第.*版\(.*\)专栏：")
# Hash lookup instead of scanning the stopword list once per token.
stopword_set = set(stopword_list)

for year in range(1946, 2024):

    # Named year_str rather than `repr` so the builtin is not rebound for the
    # rest of the kernel session.
    year_str = str(year)
    # print("Processing: " + year_str + "...")

    rootdir = rawdir + year_str + '年'
    newfile = yearlydir + year_str + '.txt'
    countfile = frequencydir + year_str + '.txt'

    paths = []
    counts = {}

    for root, dirs, files in os.walk(rootdir):
        for file in files:
            paths.append(os.path.join(root, file))

    # newline='\n' keeps the written bytes identical on every platform; the
    # context manager closes both files even if a source file fails to decode.
    with open(newfile, 'w', encoding='utf-8', newline='\n') as f, \
            open(countfile, 'w', encoding='utf-8', newline='\n') as cf:

        for path in paths:
            with open(path, encoding='utf-8') as fileTrainRaw:
                # Each line a piece of news
                for line in fileTrainRaw:
                    news = column_header.sub("", line)

                    kept = []
                    for word in jieba.lcut(news):
                        if disallowed_chars.search(word) or numeric_prefix.match(word):
                            continue
                        if word in stopword_set:
                            continue
                        kept.append(word)
                        counts[word] = counts.get(word, 0) + 1

                    segmented = ' '.join(kept).rstrip()
                    if segmented:
                        f.write(segmented + '\n')

        # Stable sort: words with equal counts stay in first-seen order.
        counts_list = sorted(counts.items(), key=lambda item: item[1], reverse=True)

        for word, freq in counts_list:
            cf.write(word + ' ' + str(freq) + '\n')

Processing: 1946...
Processing: 1947...
Processing: 1948...
Processing: 1949...
Processing: 1950...
Processing: 1951...
Processing: 1952...
Processing: 1953...
Processing: 1954...
Processing: 1955...
Processing: 1956...
Processing: 1957...
Processing: 1958...
Processing: 1959...
Processing: 1960...
Processing: 1961...
Processing: 1962...
Processing: 1963...
Processing: 1964...
Processing: 1965...
Processing: 1966...
Processing: 1967...
Processing: 1968...
Processing: 1969...
Processing: 1970...
Processing: 1971...
Processing: 1972...
Processing: 1973...
Processing: 1974...
Processing: 1975...
Processing: 1976...
Processing: 1977...
Processing: 1978...
Processing: 1979...
Processing: 1980...
Processing: 1981...
Processing: 1982...
Processing: 1983...
Processing: 1984...
Processing: 1985...
Processing: 1986...
Processing: 1987...
Processing: 1988...
Processing: 1989...
Processing: 1990...
Processing: 1991...
Processing: 1992...
Processing: 1993...
Processing: 1994...
Processing: 1995...
