# 讀取字典

In [1]:
import pandas as pd
import os

In [2]:
# Root folder that holds every input and output file of this notebook.
filepath = '/Volumes/backup_128G/z_repository/Yumin_data/玉敏_俄羅斯課本的研究'

# File locations relative to `filepath`.
file_dic = '華語八千詞(內含注音字型檔)/Chinese_8000W_20190515_v1.xlsx'
book_file = '實用漢語教科書2010_生詞表.xlsx'
to_file = 'processed/chinese_8000Words.xlsx'

# Full paths: the dictionary workbook, the textbook word list, and the result workbook.
read_dic, read_book, write_file = (
    '/'.join((filepath, relative))
    for relative in (file_dic, book_file, to_file)
)

In [3]:
# Read every sheet of the dictionary workbook and stack them into one frame.
# The sheets are read without a header row, so the columns are labelled 0, 1, ...
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so the per-sheet frames are collected and concatenated once.
sheetFrames = []

with pd.ExcelFile(read_dic) as reader:
    for sheet in reader.sheet_names:
        sheetDf = pd.read_excel(reader, sheet_name=sheet, header=None)
        # Empty cells are replaced with 0 so no NaN reaches the later lookups.
        sheetDf = sheetDf.fillna(0)
        sheetFrames.append(sheetDf)

# pd.concat raises on an empty list, so fall back to an empty frame.
dicDf = pd.concat(sheetFrames, ignore_index=True) if sheetFrames else pd.DataFrame()

# Total number of dictionary rows across all sheets.
len(dicDf.index)

8099

In [4]:
dicDf.head()

Unnamed: 0,0,1
0,我,1
1,你,1
2,妳,1
3,他,1
4,她,1


In [5]:
# Map each dictionary word (column 0) to the list of levels (column 1) it is
# listed under, in row order. A word that appears on several rows collects
# one level per row.
dicList = {}
for entryWord, entryLevel in zip(dicDf[0], dicDf[1]):
    dicList.setdefault(entryWord, []).append(entryLevel)

# 讀取待分析檔

In [6]:
# Load the textbook vocabulary list; the first sheet's first row supplies the column names.
bookDf = pd.read_excel(read_book)

In [7]:
bookDf.head()

Unnamed: 0,課別,A:主要詞彙B:補充詞彙,每課生詞排序,生詞,Unnamed: 4,解釋,Russian Explain,wordtc,wordsc,engexample,rusexample
0,1,A,2,好,,"good, well, easy to, very",хороший; хорошо,,,,
1,1,A,1,你,,you,ты,,,,
2,2,A,3,很,,very,очень,,,,
3,2,A,1,嗎,,"(question particle for ""yes-no"" question",вопросительная частица,,,,
4,2,A,4,呢,,particle indicating that a previously asked qu...,модальная частица,,,,


In [8]:
# Not referenced by any other visible cell; kept so the notebook namespace is unchanged.
wordDifferentLevel = []

def wordLevel(word, dictionary=None, default=9):
    """Return the level of ``word`` according to a word -> levels mapping.

    Parameters
    ----------
    word : hashable
        The word to look up.
    dictionary : dict, optional
        Mapping from a word to a list of levels. When omitted, the
        notebook-level ``dicList`` is used, which is the original behaviour.
    default : optional
        Value returned when the word is missing or has no levels. The
        default of 9 is the marker the later cells use for words that are
        not in the dictionary.

    Returns
    -------
    The first level recorded for ``word``, or ``default``.
    """
    if dictionary is None:
        dictionary = dicList

    levels = dictionary.get(word)
    # A missing word and an empty level list both count as "not found".
    if not levels:
        return default

    return levels[0]

In [9]:
# Build one [word, level, word type, lesson] record per textbook row.
# Columns are addressed by position with .iloc: 0 = lesson, 1 = word type
# (A/B), 3 = word. Integer keys on a row Series with string labels (row[0])
# rely on a positional fallback that pandas has deprecated.
levelList = []

bookColumns = zip(bookDf.iloc[:, 0], bookDf.iloc[:, 1], bookDf.iloc[:, 3])
for chapter, wtype, word in bookColumns:
    level = wordLevel(word)
    levelList.append([word, level, wtype, chapter])

In [10]:
# Column layout of levelDf: 0 = word, 1 = level, 2 = word type, 3 = lesson.
# Rows are ordered by level, then lesson, then word type, then word.
sortColumns = [1, 3, 2, 0]
levelDf = pd.DataFrame(levelList).sort_values(by=sortColumns)
levelDf.head()

Unnamed: 0,0,1,2,3
1,你,1,A,1
0,好,1,A,1
6,也,1,A,2
4,呢,1,A,2
3,嗎,1,A,2


In [11]:
# levelDf.loc[levelDf[2] == 'A']

In [12]:
# levelDf.loc[levelDf[2] == 'B']

In [13]:
# Show any row whose word type (column 2) is neither 'A' nor 'B'.
hasKnownType = levelDf[2].isin(['A', 'B'])
levelDf[~hasKnownType]

Unnamed: 0,0,1,2,3


In [14]:
def statsLevel(INdf):
    """Count how many rows of ``INdf`` fall on each level from 1 through 9.

    Parameters
    ----------
    INdf : pandas.DataFrame
        Frame whose column labelled ``1`` holds the level of each row.

    Returns
    -------
    pandas.DataFrame
        A single-column frame with nine rows; row ``i`` holds the number of
        rows of ``INdf`` whose level equals ``i + 1``.
    """
    levelColumn = INdf[1]
    countsPerLevel = [
        levelColumn[levelColumn == level].count()
        for level in range(1, 10)
    ]
    return pd.DataFrame(countsPerLevel)

headers = ['Word', 'Level', 'A/B', 'Lesson']

# Create the output folder if it is missing so opening the workbook cannot
# fail on a non-existent directory.
outputDir = os.path.dirname(write_file)
if outputDir:
    os.makedirs(outputDir, exist_ok=True)

with pd.ExcelWriter(write_file) as writer:
    # 1. List every word together with its level.
    levelDf.to_excel(writer, sheet_name='All', index=False, header=headers)

    # 2. Count how many words fall on each level.
    levels = list(range(1, 10))
    levelCountDf = pd.DataFrame(levels)
    ##   A. Counts for the main vocabulary (type 'A')
    major = levelDf.loc[levelDf[2] == 'A']
    levelCountDf['A'] = statsLevel(major)
    ##   B. Counts for the supplementary vocabulary (type 'B')
    minor = levelDf.loc[levelDf[2] == 'B']
    levelCountDf['B'] = statsLevel(minor)
    ##   C. Counts for main + supplementary vocabulary together
    levelCountDf['A/B'] = statsLevel(levelDf)

    levelCountDf.to_excel(writer, sheet_name='Stats', index=False, header=['Level', 'A', 'B', 'A/B'])

    # 3. List the words that are not in the 8000-word dictionary (level 9).
    wordsNotIn = levelDf.loc[levelDf[1] == 9]
    wordsNotInDf = pd.DataFrame(wordsNotIn)

    wordsNotInDf.to_excel(writer, sheet_name='WordsNotIn', index=False, header=headers)

    # No explicit writer.save(): leaving the `with` block saves and closes the
    # workbook, and ExcelWriter.save() no longer exists in pandas 2.0+.