# test-free-word.py

import re
from pprint import pprint
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
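# Porter stemming reduces words to a shared stem so inflected forms match
# (e.g. "hatching" and "hatch" both stem to "hatch"); nltk.word_tokenize in
# main() may also need the Punkt tokenizer data: nltk.download("punkt")
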
new_mnemonic = """
Stop for a second or the egg will hatching in eights
"""
# the list of words we are allowed to use
#allwords_path = "google-10000-english.txt"
# lists of words that are currently reserved
# words used in anki deck (keywords and mnemonics)
ankiwords_path = "artem-kanji-deck-test-export.txt"
# separately load the list of onyomi
onyomiwords_path = "onyomi-mnemonics.txt"
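
# assumed input formats (inferred from the parsing in main()):
#   anki export: one card per line, tab-separated as kanji<TAB>keyword<TAB>mnemonic
#   onyomi file: the onyomi reading is the first whitespace-separated token on each line
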
# either we are checking if keyword is free
# or we are checking if mnemonic is free
word_regex = r'[A-Za-z0-9_]{2,}'
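
# note: word_regex (and the re import) are not used below yet; the collision
# check in main() relies on stemming rather than regex matching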


class TakenWord(object):
    def __init__(self, stem, metainfo):
        self.stem = stem
        self.context = metainfo


def main():
    with open(ankiwords_path, "r") as anki_file, open(onyomiwords_path, "r") as onyomi_file:
        anki_cards = anki_file.readlines()
        onyomi = onyomi_file.readlines()

    # learn words that are bad to use in mnemonics
    bad_mnemonics = []
    for line in anki_cards:
        kanji, keyword, mnemonic = line.split("\t")
        key_stem = stemmer.stem(keyword)
        taken = TakenWord(key_stem, "AnkiCard {}".format(line))
        bad_mnemonics.append(taken)

    # onyomi readings are reserved as well
    for line in onyomi:
        on = line.split()[0]
        stem = stemmer.stem(on)
        taken = TakenWord(stem, "Onyomi {}".format(line))
        bad_mnemonics.append(taken)

    # find if new mnemonic is bad
    nltk_tokens = nltk.word_tokenize(new_mnemonic)
    for word in nltk_tokens:
        stem = stemmer.stem(word)
        for taken in bad_mnemonics:
            if taken.stem == stem:
                print("{:15} is matched with {}".format(word, taken.context))


if __name__ == "__main__":
    main()
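

# Usage sketch (assumes both input files sit next to this script):
#   python test-free-word.py
# Any word in new_mnemonic whose stem collides with a reserved stem is printed
# together with the anki card or onyomi line that reserved it.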