In [6]:
'''
Relies on the API provided by the anki-connect project: https://github.com/FooSoft/anki-connect
'''
import json
import urllib.request


def request(action, **params):
    """Build the request payload for one AnkiConnect action.

    Args:
        action: Name of the action to call.
        **params: Keyword arguments forwarded as the action's parameters.

    Returns:
        A dict with the keys ``action``, ``params`` and ``version``
        (the version is always 6).
    """
    payload = dict(action=action, params=params, version=6)
    return payload


def invoke(action, **params):
    """Call an AnkiConnect action over HTTP and return its ``result`` value.

    The payload built by ``request(action, **params)`` is JSON-encoded and
    sent to ``http://localhost:8765``.

    Args:
        action: Name of the action to call.
        **params: Keyword arguments forwarded as the action's parameters.

    Returns:
        The value stored under the ``result`` key of the JSON response.

    Raises:
        Exception: If the response does not consist of exactly the ``error``
            and ``result`` fields, or if ``error`` is not ``None`` (the
            reported error is used as the exception message).
    """
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
    http_request = urllib.request.Request('http://localhost:8765', requestJson)
    # Close the HTTP response deterministically instead of leaving the
    # connection open until the object happens to be garbage collected.
    with urllib.request.urlopen(http_request) as http_response:
        response = json.load(http_response)
    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')
    if 'error' not in response:
        raise Exception('response is missing required error field')
    if 'result' not in response:
        raise Exception('response is missing required result field')
    if response['error'] is not None:
        raise Exception(response['error'])
    return response['result']

# Print the deck-name -> deck-id mapping returned by the 'deckNamesAndIds' action.
print(invoke('deckNamesAndIds'))

{'120个常见英语词根': 1639269089730, 'COCA-高频词汇(OTF)': 1636003078736, 'COCA-高频词汇(OTF)::coca ~ 2000': 1636003078737, 'COCA-高频词汇(OTF)::coca ~ 4000': 1636003078828, 'COCA-高频词汇(OTF)::coca ~ 6000': 1636003078938, 'TOEFL词汇词根+联想记忆法：乱序版': 1639269105629, 'video_words': 1638602251911, '增量阅读': 1639270047389, '默认': 1}


In [7]:
# Find the ids of all cards matched by the deck query, then fetch the
# info record for each of those cards.
card_id_list = invoke('findCards', query=r'deck:"COCA-高频词汇(OTF)"')
card_info_list = invoke('cardsInfo', cards=card_id_list)

In [8]:
# Pull the "word" field value out of every card, then build a lower-cased
# set so later membership checks ignore case.
word_list = [info["fields"]["word"]["value"] for info in card_info_list]
word_set = {entry.lower() for entry in word_list}

In [9]:
# Load the COCA 60000 spreadsheet.
import pandas as pd
coca_60000_df = pd.read_excel('miscs/COCA60000.xlsx')  

In [10]:
# Show the column labels of the loaded sheet.
coca_60000_df.columns

Index(['RANK #', 'PoS', '#WORD', 'TOTAL', 'SPOKEN', 'FICTION', 'MAGAZINE',
       'NEWSPAPER', 'ACADEMIC'],
      dtype='object')

In [11]:
# Columns for which a per-column rank is derived.
columns = ['TOTAL', 'SPOKEN', 'FICTION', 'MAGAZINE', 'NEWSPAPER', 'ACADEMIC']
# Names of every rank column, starting with the 'RANK #' column that is
# already present in the sheet.
rank_columns = ['RANK #']
# Iterate over `columns` itself rather than a second copy of the same
# literal, so the two lists cannot drift apart.
for column in columns:
    rank_column = f'{column}_RANK'
    rank_columns.append(rank_column)
    # Descending rank: the largest value in the column gets rank 1.
    # Ties presumably share an averaged rank (fractional values such as
    # 7280.5 appear in the saved output) -- TODO confirm.
    coca_60000_df[rank_column] = coca_60000_df[column].rank(ascending=False)

In [14]:
# NOTE(review): `cnt` is not referenced by any visible cell; kept in case
# another cell reads it.
cnt = 0


def is_not_old_words_filter(row) -> bool:
    """Return True when the row's lower-cased "#WORD" is not in ``word_set``."""
    return row["#WORD"].lower() not in word_set


# Force the column to str so the string operations below work on every row.
coca_60000_df["#WORD"] = coca_60000_df["#WORD"].astype('str')
# Vectorised equivalent of applying is_not_old_words_filter to each row:
# one pass over the column instead of a Python-level call per row.
is_unmemorized = ~coca_60000_df["#WORD"].str.lower().isin(word_set)
coca_60000_df_unmemorized = coca_60000_df[is_unmemorized]
# Keep the first 2000 remaining rows whose "RANK #" is greater than 6000.
# NOTE(review): "first" follows the sheet's row order -- presumably ascending
# by "RANK #"; confirm.
new_2000_words_df = coca_60000_df_unmemorized[coca_60000_df_unmemorized["RANK #"] > 6000].iloc[:2000, :]


In [None]:
# Depends on the ECDICT project: https://github.com/skywind3000/ecdict
# It is expected to be placed under miscs/ECDICT.
from miscs.ECDICT.stardict import LemmaDB, DictCsv

# NOTE(review): `lemma` is loaded here but not used by any visible cell.
lemma = LemmaDB()
lemma.load('miscs/ECDICT/lemma.en.txt')
# Dictionary object that is later queried with ecdict.query(word).
ecdict = DictCsv('miscs/ECDICT/ecdict.csv')

In [48]:
# Collected (word, rank summary, ECDICT entry) tuples for every word that
# has a dictionary entry.
word_info_tuple_list = []

for _, row in new_2000_words_df.iterrows():
    # One "label: value" item per rank column; the label is the lower-cased
    # column name with surrounding '#' and spaces removed ("RANK #" -> "rank").
    # NOTE: the PoS column is not part of the summary (an earlier version
    # prepended it).
    info = ' | '.join(
        f"{column.lower().strip('# ')}: {row[column]}" for column in rank_columns
    )
    word = row["#WORD"].lower()
    result = ecdict.query(word)
    if result is not None:
        word_info_tuple_list.append((word, info, result))
        continue
    # Report words that have no dictionary entry and leave them out.
    print(f"没有查到 {word}")

没有查到 three-year
没有查到 nineteenth-century
没有查到 sauté


In [49]:
# Number of words for which an ECDICT entry was found.
len(word_info_tuple_list)

1997

In [50]:
# Preview the first five (word, rank summary, ECDICT entry) tuples.
word_info_tuple_list[:5]

[('postwar',
  'rank: 6006 | total_rank: 6223.0 | spoken_rank: 7280.5 | fiction_rank: 15495.0 | magazine_rank: 6326.0 | newspaper_rank: 6396.5 | academic_rank: 3702.0',
  {'id': 546268,
   'sw': 'postwar',
   'word': 'postwar',
   'phonetic': "'pәust'wɒ:",
   'definition': 'a. belonging to the period after a war',
   'translation': 'a. 战后的',
   'pos': '',
   'collins': 2,
   'oxford': 0,
   'tag': 'toefl',
   'bnc': 9300,
   'frq': 5928,
   'exchange': '',
   'detail': None,
   'audio': ''}),
 ('good-bye',
  'rank: 6037 | total_rank: 3062.0 | spoken_rank: 2365.0 | fiction_rank: 1291.0 | magazine_rank: 3683.0 | newspaper_rank: 3892.0 | academic_rank: 10146.0',
  {'id': 300188,
   'sw': 'goodbye',
   'word': 'good-bye',
   'phonetic': "'^ud'bai",
   'definition': 'n a farewell remark',
   'translation': 'interj. 再见, 再会\nn. 告别',
   'pos': '',
   'collins': 0,
   'oxford': 0,
   'tag': '',
   'bnc': 0,
   'frq': 0,
   'exchange': 's:good-byes',
   'detail': None,
   'audio': ''}),
 ('socio

In [46]:
# Create a new deck; the value returned by the call (shown in the saved
# output) is presumably the new deck's id.
# NOTE(review): the deck name literal itself contains double-quote
# characters -- confirm that this is intended.
invoke('createDeck', deck=r'"coca ~ 8000"')

1641056078057

In [51]:
# Export one tab-separated line per word to miscs/words.txt.
# The file is opened explicitly as UTF-8: the fields contain Chinese
# translations and IPA phonetics, which a platform's default encoding may be
# unable to encode.
with open("miscs/words.txt", 'w', encoding='utf-8') as f:
    for word, rank_info, entry in word_info_tuple_list:
        audio_url = "https://dict.youdao.com/dictvoice?audio=" + word
        info_list = [word,
                     entry['translation'],
                     entry['phonetic'],
                     audio_url,
                     entry['definition'],
                     entry['tag'],
                     entry['frq'],
                     entry['bnc'],
                     entry['exchange'],
                     rank_info,
                    ]
        # Newlines inside a field become spaces so each word stays on one line.
        line = '\t'.join(str(i).replace('\n', ' ') for i in info_list) + '\n'
        # write(), not writelines(): `line` is one string, not a sequence of lines.
        f.write(line)