# 讀取字典

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder that holds the dictionaries and the textbook word list.
filepath = '/Volumes/backup_128G/z_repository/Yumin_data/玉敏_俄羅斯課本的研究'

# File names, relative to the root folder.
chinese8k_dic_file = '華語八千詞(內含注音字型檔)/Chinese_8000W_20190515_v1.xlsx'
hsk_dic_file = 'HSK-2012_5000W_20190517_v1.xlsx'
book_file = '實用漢語教科書2010_生詞表.xlsx'

# Full paths: root folder and file name joined with a single '/'.
read_chinese8k_dic = '/'.join([filepath, chinese8k_dic_file])
read_hsk_dic = '/'.join([filepath, hsk_dic_file])
read_book_file = '/'.join([filepath, book_file])

# Chinese_8000W

In [3]:
# Load every sheet of the Chinese-8000 workbook and stack them into one frame.
# Sheets are read without a header row, so the columns are labelled 0, 1, ...
# Empty cells are replaced with 0.
with pd.ExcelFile(read_chinese8k_dic) as reader:
    # Collect the per-sheet frames and concatenate once afterwards:
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copies all previously accumulated rows on every iteration.
    sheetDfs = [
        pd.read_excel(reader, sheet_name=sheet, header=None).fillna(0)
        for sheet in reader.sheet_names
    ]

# ignore_index=True renumbers the rows 0..n-1 across all sheets.
dicC8kDf = pd.concat(sheetDfs, ignore_index=True)

# Total number of rows across all sheets.
len(dicC8kDf.index)

8099

In [4]:
# Preview the first rows of the combined Chinese-8000 frame.
dicC8kDf.head()

Unnamed: 0,0,1
0,我,1
1,你,1
2,妳,1
3,他,1
4,她,1


# HSK-2012_5000W

In [5]:
# Read the 'HSK5000' sheet of the HSK workbook; the workbook is closed
# when the with-block exits.
with pd.ExcelFile(read_hsk_dic) as reader:
    dicH5kDf = pd.read_excel(reader, sheet_name='HSK5000')

# Number of rows loaded.
len(dicH5kDf)

5000

In [6]:
# Preview the first rows of the HSK frame.
dicH5kDf.head()

Unnamed: 0,Level,WordCn,WordTw
0,1,爱,愛
1,1,八,八
2,1,爸爸,爸爸
3,1,杯子,杯子
4,1,北京,北京


# 讀取待分析檔

In [7]:
# Load the textbook word list; no sheet is named, so pandas reads the first sheet.
bookDf = pd.read_excel(read_book_file)

In [8]:
# Preview the first rows of the textbook word list.
bookDf.head()

Unnamed: 0,課別,A:主要詞彙B:補充詞彙,每課生詞排序,生詞,Unnamed: 4,解釋,Russian Explain,wordtc,wordsc,engexample,rusexample
0,1,A,2,好,,"good, well, easy to, very",хороший; хорошо,,,,
1,1,A,1,你,,you,ты,,,,
2,2,A,3,很,,very,очень,,,,
3,2,A,1,嗎,,"(question particle for ""yes-no"" question",вопросительная частица,,,,
4,2,A,4,呢,,particle indicating that a previously asked qu...,модальная частица,,,,


In [9]:
# Keep only the '生詞' column. Despite the "Df" suffix this is a
# one-dimensional Series, not a DataFrame.
bookWordsDf = bookDf['生詞']

In [10]:
# Preview the first textbook words.
bookWordsDf.head()

0    好
1    你
2    很
3    嗎
4    呢
Name: 生詞, dtype: object

In [11]:
# Sanity check: count the textbook entries that are exactly '色粉'.
print((bookWordsDf == '色粉').sum())

0


# Start analysis

In [12]:
# Chinese 8000: keep only the rows whose level (column 1) is 3 or lower.
# The boolean filter keeps the original row labels, so reset the index to
# 0..n-1; otherwise looking rows up by a 0..n-1 counter with .loc raises
# KeyError or skips rows whenever the kept rows are not the leading ones.
lowerC8kDf = dicC8kDf[dicC8kDf[1] <= 3].reset_index(drop=True)
print(len(lowerC8kDf), lowerC8kDf.head())

532    0  1
0  我  1
1  你  1
2  妳  1
3  他  1
4  她  1


In [13]:
# HSK 5000-word list: keep only the rows whose 'Level' is 3 or lower.
# The boolean filter keeps the original row labels, so reset the index to
# 0..n-1; otherwise looking rows up by a 0..n-1 counter with .loc raises
# KeyError or skips rows whenever the kept rows are not the leading ones.
lowerH5kDf = dicH5kDf[dicH5kDf['Level'] <= 3].reset_index(drop=True)
print(len(lowerH5kDf), lowerH5kDf.head())

600    Level WordCn WordTw
0      1      爱      愛
1      1      八      八
2      1     爸爸     爸爸
3      1     杯子     杯子
4      1     北京     北京


In [14]:
# Split the level 1-3 Chinese-8000 words by whether the textbook word
# column contains them. Each list holds [word, level] pairs.

# A set gives constant-time membership tests instead of scanning the whole
# textbook column for every word. Empty cells (NaN) are dropped so they can
# never count as a match.
bookWordSet = set(bookWordsDf.dropna())

c8kNotIn = []
c8kIn = []
seenC8kWords = set()

# Iterate over the column values directly rather than range(len(...)) with
# .loc: the filtered frame may not carry row labels 0..n-1.
for word, level in zip(lowerC8kDf[0], lowerC8kDf[1]):
    # Keep only the first occurrence of each word. The earlier check compared
    # the word against a list of [word, level] pairs, so it never matched and
    # duplicates were never filtered out.
    if word in seenC8kWords:
        continue
    seenC8kWords.add(word)

    if word in bookWordSet:
        c8kIn.append([word, level])
    else:
        c8kNotIn.append([word, level])

print(len(lowerC8kDf), len(c8kNotIn), len(c8kIn))

532 122 410


In [15]:
# Split the level 1-3 HSK words (taken from the 'WordTw' column) by whether
# the textbook word column contains them. Each list holds [word, level] pairs.

# A set gives constant-time membership tests instead of scanning the whole
# textbook column for every word. Empty cells (NaN) are dropped so they can
# never count as a match.
bookWordSet = set(bookWordsDf.dropna())

h5kNotIn = []
h5kIn = []
seenH5kWords = set()

# Iterate over the column values directly rather than range(len(...)) with
# .loc: the filtered frame may not carry row labels 0..n-1.
for word, level in zip(lowerH5kDf['WordTw'], lowerH5kDf['Level']):
    # Keep only the first occurrence of each word. The earlier check compared
    # the word against a list of [word, level] pairs, so it never matched and
    # duplicates were never filtered out.
    if word in seenH5kWords:
        continue
    seenH5kWords.add(word)

    if word in bookWordSet:
        h5kIn.append([word, level])
    else:
        h5kNotIn.append([word, level])

print(len(lowerH5kDf), len(h5kNotIn), len(h5kIn))

600 137 463


In [16]:
# Tabulate the Chinese-8000 words not found in the textbook.
# Each entry is a [word, level] pair, so column 0 is the word and column 1 the level.
c8kNotInDf = pd.DataFrame(c8kNotIn)
# Left disabled: optional sort by level (column 1).
# c8kNotInDf = c8kNotInDf.sort_values(by=[1])
c8kNotInDf.head()

Unnamed: 0,0,1
0,妳,1
1,妳們,1
2,中國,1
3,美國,1
4,日本,1


In [17]:
# Tabulate the Chinese-8000 words that do appear in the textbook.
# Each entry is a [word, level] pair, so column 0 is the word and column 1 the level.
c8kInDf = pd.DataFrame(c8kIn)
# Left disabled: optional sort by level (column 1).
# c8kInDf = c8kInDf.sort_values(by=[1])
c8kInDf.head()

Unnamed: 0,0,1
0,我,1
1,你,1
2,他,1
3,她,1
4,我們,1


In [18]:
# Tabulate the HSK words not found in the textbook.
# Each entry is a [word, level] pair, so column 0 is the word and column 1 the level.
h5kNotInDf = pd.DataFrame(h5kNotIn)
h5kNotInDf.head()

Unnamed: 0,0,1
0,愛,1
1,杯子,1
2,北京,1
3,不客氣,1
4,出租車,1


In [19]:
# Tabulate the HSK words that do appear in the textbook.
# Each entry is a [word, level] pair, so column 0 is the word and column 1 the level.
h5kInDf = pd.DataFrame(h5kIn)
h5kInDf.head()

Unnamed: 0,0,1
0,八,1
1,爸爸,1
2,本,1
3,不,1
4,菜,1


In [20]:
# Export the four word lists to one workbook, one sheet per list.
# `headers` replaces the numeric column labels (0, 1) in the written file.
headers = ['word', 'level']
write_to_file = '{0}/{1}'.format(filepath, "processed/C8k_H5K_notIn_20190518_v2.xlsx")
with pd.ExcelWriter(write_to_file) as writer:
    # sheet_name is passed by keyword: positional arguments after the
    # writer are deprecated in recent pandas releases.
    c8kNotInDf.to_excel(writer, sheet_name='c8kNotIn', index=False, header=headers)
    c8kInDf.to_excel(writer, sheet_name='c8kIn', index=False, header=headers)
    h5kNotInDf.to_excel(writer, sheet_name='h5kNotIn', index=False, header=headers)
    h5kInDf.to_excel(writer, sheet_name='h5kIn', index=False, header=headers)
# No explicit writer.save(): leaving the with-block saves and closes the
# workbook, and ExcelWriter.save() was removed in pandas 2.0.