In [1]:
'''
get word list from wordlist/COCA/*
the corpus is from https://www.corpusdata.org/formats.asp

wordlist['states']=='state'   (keys are lower-cased surface forms, values are lemmas)
'''

import os
import pickle as pkl
import re
from collections import Counter

from chardet.universaldetector import UniversalDetector
from tqdm import tqdm

# Characters that disqualify a word: check_word_validate rejects any entry
# containing one of these (digits and the listed punctuation).
check_string="0123456789,!@#$%^*()-=+_[]{}|:;<>?"
# Lookup table filled by the loading loop below: lower-cased surface form -> lemma,
# plus lemma -> lemma for every accepted row.
wordlist={}

# One detector instance is shared by every call; get_encoding resets it first.
detector = UniversalDetector()
def get_encoding(filename: str):
    """Guess the text encoding of ``filename`` with chardet.

    The file is fed to the shared ``detector`` line by line in binary mode,
    stopping as soon as the detector reports it is done.

    Args:
        filename: path of the file to inspect.

    Returns:
        The value chardet stores in ``detector.result['encoding']``.
    """
    detector.reset()
    # Use a context manager so the handle is closed even when we break early;
    # the original left one open file behind per call.
    with open(filename, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']

def get_wordlist_filepath():
    """Collect the paths of all word-list files below ``wordlist/``.

    Each sub-directory of ``wordlist`` whose name contains no '.' is scanned
    one level deep, and every entry found is returned as
    ``wordlist/<dir>/<entry>``.

    Returns:
        list[str]: the collected paths. Directory listings are sorted so the
        order is the same on every run and every file system.
    """
    file_path=[]
    for dname in sorted(os.listdir('wordlist')):
        # Names with a dot (stray files, hidden folders) are skipped, as before.
        if '.' in dname: continue
        dpath=os.path.join('wordlist', dname)
        # An extension-less regular file (e.g. README) used to reach
        # os.listdir below and raise NotADirectoryError; skip it instead.
        if not os.path.isdir(dpath): continue
        for fname in sorted(os.listdir(dpath)):
            file_path.append(os.path.join(dpath, fname))
    return file_path

def check_word_validate(word: str) -> bool:
    """Tell whether ``word`` is free of every character listed in ``check_string``.

    Returns:
        True if none of the characters of ``check_string`` occurs in ``word``,
        False as soon as one of them does.
    """
    return not any(banned in word for banned in check_string)
        
# Fill `wordlist` from every lexicon file found by get_wordlist_filepath().
for fpath in tqdm(get_wordlist_filepath()):
    # Read through a context manager so the handle is closed right away
    # (the original left every file open until garbage collection).
    with open(fpath, 'r', encoding=get_encoding(fpath)) as lexicon_file:
        words=lexicon_file.read()
    # Drop everything up to the first newline (presumably a column header).
    words=words[words.find('\n')+1:]
    # One row per line, tab-separated columns.
    words=[x.split('\t') for x in words.split('\n')]

    # data sample:
    # 'States' 'state' 'PoS'
    for w in words:
        # Need at least the surface form and the lemma columns.
        if len(w)<2: continue
        # NOTE(review): the original condition was
        # `len(w[0])<3 or len(w[0])<3`; the duplicate operand is dropped here
        # without changing behaviour. The second test was probably meant to be
        # `len(w[1])<3`, but tightening it would stop the '' lemma from being
        # recorded, and a later cell unconditionally does `del word_freq['']`,
        # so both places must be changed together.
        if len(w[0])<3: continue
        if not check_word_validate(w[0]): continue

        # Keys are lower-cased surface forms; the lemma is stored as given.
        w[0]=w[0].lower()
        wordlist[w[0]]=w[1]
        # The lemma also maps to itself so it can be looked up directly.
        wordlist[w[1]]=w[1]

100%|██████████| 115/115 [00:06<00:00, 17.90it/s]


In [2]:
'''
get word frequency
'''

# Accumulator filled by the counting loop below: lemma -> occurrence count.
word_freq = dict()
# Snapshot of every key currently in `wordlist` (surface forms and lemmas).
wordset = set(wordlist)

def get_sample_filepath():
    """List the entries of the ``sample`` directory as ``sample/<name>`` paths.

    Returns:
        list[str]: one joined path per directory entry, in ``os.listdir`` order.
    """
    return [os.path.join('sample', entry) for entry in os.listdir('sample')]

# A token is a maximal run of word characters and apostrophes, so forms such
# as "n't" stay in one piece.
# NOTE(review): assumes the sample texts separate words with whitespace or
# punctuation; word-list entries containing other characters (e.g. '.') will
# never match a token - confirm against the sample format.
TOKEN_PATTERN = re.compile(r"[\w']+")

for fpath in tqdm(get_sample_filepath()):
    # Close the sample file as soon as it has been read.
    with open(fpath, 'r', encoding=get_encoding(fpath)) as sample_file:
        text=sample_file.read().lower()

    # Count whole tokens once per file. The original used text.count(w), which
    # counts substrings: 'state' was also counted inside 'statement', and every
    # 'states' was credited twice (once via the key 'states', once via 'state').
    # It also rescanned the full text for every known word.
    token_counts=Counter(TOKEN_PATTERN.findall(text))

    for w in wordset:
        ow=wordlist[w]
        # Every lemma still gets a key, even with a zero count, exactly as the
        # original initialisation did.
        word_freq[ow]=word_freq.get(ow, 0)+token_counts[w]

In [None]:
# Rows with an empty lemma column create a '' entry whose count is
# meaningless. pop() with a default removes it when present and, unlike
# `del`, does not raise KeyError when no such row existed.
word_freq.pop('', None)

# Make sure the output folder exists before writing into it.
os.makedirs('data', exist_ok=True)

# Context managers guarantee the pickles are flushed and closed.
with open('data/wordlist.pkl', 'wb') as out_file:
    pkl.dump(wordlist, out_file)
with open('data/word_freq.pkl', 'wb') as out_file:
    pkl.dump(word_freq, out_file)

In [None]:
from functools import reduce

# Summary statistics over the frequency table.
info={
    # sum() returns 0 for an empty table, whereas reduce() without an initial
    # value raises TypeError on an empty sequence.
    'total_word_count': sum(word_freq.values()),
    # Number of distinct lemmas that were counted.
    'word_kind_count': len(word_freq),
}

# Write through a context manager so the file is flushed and closed.
with open('data/info.pkl', 'wb') as out_file:
    pkl.dump(info, out_file)

In [None]:
'''
get Chinese middle-school vocabulary, used to filter out easy words
'''

import requests as req
from bs4 import BeautifulSoup as BS
from tqdm import tqdm
import pickle as pkl

# URL template; the two %d placeholders are filled with (page, detail) by the
# crawl loop below.
url='https://www.koolearn.com/dict/tag_%d_%d.html'

def getWordsByUrl(url: str, timeout: float = 10) -> list:
    """Download one listing page and return the words shown on it.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before requests raises a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        list: text of every ``<a class="word">`` inside the first
        ``<div class="left-content">``. An empty list is returned when the
        page has no such div, so callers that stop paging on an empty result
        still terminate instead of hitting an IndexError.
    """
    html=req.get(url, timeout=timeout)
    soup=BS(html.content, 'lxml')

    # find() yields None when the container is missing, whereas the original
    # find_all(...)[0] raised IndexError.
    container=soup.find('div', class_='left-content')
    if container is None:
        return []

    return [x.text for x in container.find_all('a', class_='word')]

# Walk every tag page in the range and, within each, every numbered detail
# page until one comes back empty.
words=[]
for page in tqdm(range(423, 448+1)):
    detail=0
    while True:
        detail+=1
        new_words=getWordsByUrl(url%(page, detail))
        # An empty page marks the end of this tag's listing.
        if len(new_words)==0:
            break
        words+=new_words

# Reported count is taken before de-duplication.
print("Done, total", len(words), "words")
# De-duplicate; sorting makes the saved list identical from run to run
# (plain list(set(...)) has an arbitrary order).
words=sorted(set(words))
# Context manager so the pickle is flushed and the handle closed.
with open('data/middle_school_words.pkl', 'wb') as out_file:
    pkl.dump(words, out_file)
