In [1]:
import pprint
import pandas
import sqlite3
processed_cedict = []
with open('cedict_1_0_ts_utf-8_mdbg.txt', 'r', encoding='utf-8') as f:
    for line in f:
        if line.startswith('#'):
            continue
        line = line.strip()
        if not line:
            continue
        # space, space, bracket, slash
        pinyin = line.split('[', 1)[1].split(']', 1)[0]
        assert all(
            c in '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZüÜ:,· ' for c in pinyin)
        english = line.split('/', 1)[1]
        trad, simp = line.split(' ', 2)[:2]
        pinyin = pinyin.replace('u:', 'ü')
        pinyin = pinyin.replace('U:', 'Ü')
        english = english.strip('/')
        # english = english.replace('/', ' * ')
        processed_cedict.append((trad, simp, pinyin, english))
# pprint.pprint(processed_cedict)
df = pandas.DataFrame(processed_cedict, columns=[
                      'Traditional', 'Simplified', 'Pinyin', 'English'])


In [6]:
def is_chinese(word):
    return all(u'\u4e00' <= char <= u'\u9fff' for char in word)


def is_english(word):
    return all(u'\u0041' <= char <= u'\u005a' or u'\u0061' <= char <= u'\u007a' for char in word)


def has_punctuation(word):
    return any(char in '.,?!:;-/' for char in word)


def any_chinese(word):
    return any(u'\u4e00' <= char <= u'\u9fff' for char in word)


non_chinese_trad = df[~df['Traditional'].apply(is_chinese)]
non_chinese_simp = df[~df['Simplified'].apply(is_chinese)]
non_english = df[~df['English'].apply(is_english)]
pinyin_chinese = df[df['Pinyin'].apply(any_chinese)]

only_english_punctuation = df[~df['Traditional'].apply(has_punctuation) & ~df['Simplified'].apply(
    has_punctuation) & ~df['Pinyin'].apply(has_punctuation)]


In [7]:
print('Total:', len(df))
print('Non-Chinese Traditional:', len(non_chinese_trad))
print('Non-Chinese Simplified:', len(non_chinese_simp))
print('Non-English:', len(non_english))
print('Pinyin Chinese:', len(pinyin_chinese))
print('Only English Punctuation:', len(only_english_punctuation))


Total: 121606
Non-Chinese Traditional: 1068
Non-Chinese Simplified: 1271
Non-English: 111848
Pinyin Chinese: 0
Only English Punctuation: 121143


In [8]:
print(non_chinese_trad)
non_chinese_trad.to_csv('non_chinese_trad.csv', index=False)


       Traditional Simplified  \
0                %          %   
1        2019冠狀病毒病  2019冠状病毒病   
2          21三體綜合症    21三体综合症   
3               3C         3C   
4               3P         3P   
...            ...        ...   
121601           𨭆          𬭶   
121602           𨭎          𬭳   
121603           𩧢          𱅒   
121604           𰻞          𰻝   
121605         𰻞𰻞麵        𰻝𰻝面   

                                                  Pinyin  \
0                                                    pa1   
1       er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4   
2                 er4 shi2 yi1 san1 ti3 zong1 he2 zheng4   
3                                                 san1 C   
4                                                 san1 P   
...                                                  ...   
121601                                              hei1   
121602                                               xi3   
121603                                            cheng3   
121

In [40]:
df.to_json('cedict.json', orient='records', force_ascii=False)


In [2]:
# create sqlite database
conn = sqlite3.connect('cedict.db')
c = conn.cursor()


In [49]:
# drop all tables
c.execute('DROP TABLE IF EXISTS cedict;')
c.execute('DROP TABLE IF EXISTS cedict_lookup;')
conn.commit()

In [50]:
c.execute('''CREATE TABLE cedict (
    ID INTEGER PRIMARY KEY AUTOINCREMENT, 
    Traditional TEXT, 
    Simplified TEXT, 
    Pinyin TEXT, 
    English TEXT)''')
conn.commit()
df.to_sql('cedict', conn, if_exists='append', index=False)


In [51]:
c.execute('''CREATE TABLE cedict_lookup (
    id INTEGER PRIMARY KEY AUTOINCREMENT, 
    Lookup TEXT, 
    cedict_id INTEGER,
    UNIQUE(Lookup, cedict_id),
    FOREIGN KEY(cedict_id) REFERENCES cedict(ID))''')
c.execute('CREATE INDEX lookup_index ON cedict_lookup(Lookup)')
conn.commit()


In [52]:
cedict_query = 'INSERT INTO cedict_lookup (Lookup, cedict_id) VALUES (?, ?)'
cedict_query += ' ON CONFLICT (Lookup, cedict_id) DO NOTHING'
print(cedict_query)
for row in c.execute('SELECT ID, Traditional, Simplified FROM cedict').fetchall():
    c.execute(cedict_query, (row[1], row[0]))
    c.execute(cedict_query, (row[2], row[0]))
conn.commit()


INSERT INTO cedict_lookup (Lookup, cedict_id) VALUES (?, ?) ON CONFLICT (Lookup, cedict_id) DO NOTHING


In [5]:
SQL_query = pandas.read_sql_query(
    'SELECT * FROM cedict WHERE Simplified = "的"', conn)


In [None]:
def convert_windows_path_to_unix(path):
    return path.replace('\\', '/')


path = r"""

"""
path = convert_windows_path_to_unix(path)
print(path)


In [53]:
import sqlite3


def check_db(path):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute('SELECT * FROM cedict WHERE Simplified = "的"')
    print(c.fetchall())
    c.execute('SELECT * FROM cedict_lookup WHERE Lookup = "的"')
    print(c.fetchall())


check_db('cedict.db')


[(75089, '的', '的', 'de5', "of; ~'s (possessive particle)/(used after an attribute)/(used to form a nominal expression)/(used at the end of a declarative sentence for emphasis)/also pr. [di4] or [di5] in poetry and songs"), (75090, '的', '的', 'di1', 'see 的士[di1 shi4]'), (75091, '的', '的', 'di2', 'really and truly'), (75092, '的', '的', 'di4', "(bound form) bull's-eye; target")]
[(150177, '的', 75089), (150179, '的', 75090), (150181, '的', 75091), (150183, '的', 75092)]
