In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Data import & cleanse

In [2]:
raw_df = pd.read_csv('dataset/wikihan-romanization.tsv', sep='\t')

In [3]:
raw_df

Unnamed: 0,Character,Middle Chinese (Baxter and Sagart 2014),Cantonese,Gan,Hakka,Jin,Mandarin,Hokkien,Wu,Xiang
0,犬,khwen²,hyun2,qyon3,khién,qye1,quǎn,khián,2qyoe,qye3
1,馬,mæ²,maa5,ma3,mâ,ma2,mǎ,bé/bée/má,3mo/3ma,ma3
2,西,sej¹,sai1,xi1,sî,xi1,xī,sai/se/si,1xi,xi1
3,車,tshyæ¹,ce1/geoi1,ca1,chhâ,ce1,chē,chhia/cha/ki,1tsho,che1
4,車,kjo¹,geoi1,-,kî,-,jū,ku/kir/ki,1jy,-
...,...,...,...,...,...,...,...,...,...,...
21222,𪍑,tshyew²,ciu2,-,-,-,chǎo,-,-,-
21223,𪗋,tsij¹,zi1,-,-,-,zī,-,-,-
21224,𥺊,-,-,-,-,-,gé,-,-,-
21225,𠳀,-,-,-,-,-,yǒng,-,-,-


In [4]:
df2 = raw_df[['Character', 'Middle Chinese (Baxter and Sagart 2014)', 'Cantonese', 
          'Mandarin']]
df2 = df2.rename(columns={'Middle Chinese (Baxter and Sagart 2014)': 'Middle Chinese'})
df2

Unnamed: 0,Character,Middle Chinese,Cantonese,Mandarin
0,犬,khwen²,hyun2,quǎn
1,馬,mæ²,maa5,mǎ
2,西,sej¹,sai1,xī
3,車,tshyæ¹,ce1/geoi1,chē
4,車,kjo¹,geoi1,jū
...,...,...,...,...
21222,𪍑,tshyew²,ciu2,chǎo
21223,𪗋,tsij¹,zi1,zī
21224,𥺊,-,-,gé
21225,𠳀,-,-,yǒng


In [5]:
# Delete rows that are not fully recorded
def drop_missing_raws(df):
    df = df.replace('-', pd.NA)
    row_missing_values = df.isnull().sum(axis=1)
    fully_filled_rows = row_missing_values[row_missing_values == 0]
    df = df.iloc[fully_filled_rows.index]
    df = df.reset_index(drop=True)
    return df

df2 = drop_missing_raws(df2)
df2

Unnamed: 0,Character,Middle Chinese,Cantonese,Mandarin
0,犬,khwen²,hyun2,quǎn
1,馬,mæ²,maa5,mǎ
2,西,sej¹,sai1,xī
3,車,tshyæ¹,ce1/geoi1,chē
4,車,kjo¹,geoi1,jū
...,...,...,...,...
11760,𧟌,la³,lo6,luò
11761,𧥺,hjin¹,kwan1,jùn/yùn
11762,𩥉,kij³,gei3,jì
11763,𪍑,tshyew²,ciu2,chǎo


In [6]:
# Normalize the table representation
df2['Middle Chinese'][1]

'mæ²'

In [7]:
def get_unique_characters(df):
    unique_characters = {}

    for column in df.columns:
        combined_values = ''.join(df[column].astype(str))
        unique_chars = set(combined_values)
        unique_characters[column] = unique_chars

    return unique_characters

def print_all_chars(df):
    for column, unique_chars in get_unique_characters(df).items():
        print(f'{column}: {", ".join(unique_chars)}')

print(print_all_chars(df2))

Character: 酊, 燃, 仞, 狨, 斛, 聱, 鰇, 抔, 抏, 苨, 羔, 䴳, 唾, 場, 軺, 鰥, 㼗, 㝅, 輹, 蓏, 傃, 烏, 濬, 熊, 鴻, 㔌, 洴, 顑, 叫, 甶, 柄, 稑, 頲, 薌, 統, 茷, 鞎, 刢, 䠯, 襁, 虍, 㶟, 孩, 榍, 分, 澭, 淃, 睥, 䬾, 詵, 抨, 㑗, 桄, 嗗, 䢼, 拸, 玷, 攽, 蝴, 畛, 糗, 旋, 燎, 瓣, 八, 致, 蔤, 欓, 瓬, 䯬, 紊, 寫, 嚌, 諑, 乖, 付, 剝, 陶, 嬈, 㔂, 㒝, 邗, 䬬, 琨, 夠, 加, 篌, 蠌, 虜, 髼, 涒, 獬, 葎, 䴑, 鳿, 翁, 嫻, 奇, 䟙, 洛, 池, 炕, 楀, 戈, 莝, 胯, 喙, 淩, 觡, 鉞, 悆, 韭, 㒔, 佊, 帛, 躋, 㐆, 遻, 褵, 弛, 灝, 䲯, 廇, 厹, 散, 複, 齾, 檎, 渶, 煠, 巘, 勘, 鯪, 䯊, 䱜, 面, 𦜖, 紵, 㣈, 酡, 冰, 褋, 瑣, 瞭, 屐, 訾, 虩, 孈, 腰, 庨, 彽, 狖, 興, 洊, 衹, 崦, 敔, 庡, 棌, 岈, 鞏, 笅, 嗼, 侶, 婕, 閥, 膙, 杸, 三, 醪, 沁, 戇, 藟, 偽, 堪, 磋, 㒸, 貐, 贈, 赴, 㟍, 癭, 摘, 佩, 拾, 葟, 鸑, 娖, 勰, 濞, 枉, 恚, 霧, 槥, 萎, 女, 恄, 夬, 蠐, 劇, 䴹, 徥, 典, 耇, 點, 緒, 撘, 介, 邠, 㠊, 澈, 膍, 向, 疕, 稟, 禂, 箑, 銛, 蠩, 緇, 凰, 答, 挽, 滈, 晏, 匜, 恞, 駒, 愩, 鼤, 稗, 瞿, 樞, 盩, 暾, 扮, 皆, 竟, 徠, 鬹, 晶, 踆, 軨, 暕, 橖, 掉, 涅, 惋, 秝, 㩇, 掬, 蘦, 䪴, 憢, 鄅, 慄, 檼, 斣, 筮, 麕, 炓, 病, 誨, 倌, 輬, 廚, 伸, 微, 瘁, 羴, 鯧, 橢, 欯, 咫, 俅, 垔, 硜, 訏, 蘇, 䴭, 緝, 璀, 鏖, 軷, 輟, 肯, 湨, 漘, 疸, 齗, 栽, 籠, 鷗, 腬, 愺, 岟, 霍, 刪, 釦, 珦, 㶁, 櫻, 斒, 侙, 䡩, 眺, 璽, 桷, 箅, 僖, 枼, 鐫, 歋, 訟, 猈, 棘, 繄, 玈, 鶟, 齵, 䭢, 鯉, 螋, 房, 閠, 䵒, 蚖, 嬥, 劊, 鶡,

In [8]:
# 1. Chapital letter to small letter
df2['Mandarin'] = df2['Mandarin'].str.lower()

In [9]:
# 2. Delete after slash '/'
def delete_after_slash(df):
    def delete_after_slash_util(row):
        return row.split('/')[0]
    return df.applymap(delete_after_slash_util)

df2 = delete_after_slash(df2)

In [10]:
cc = list(df2['Character'])

In [11]:
# 3. Normalize the tone representation
# a. Cantonese
tone_map = dict()
tone_map['Middle Chinese'] = {u"\u00b9": '1', u"\u00b2": '2', u"\u00b3": '3', u"\u2074": '4'}
def tone_converter(df, column, tone_map):
    tone_map_col = tone_map[column]
    for i in range(len(df[column])):
        row = df[column][i]
        for key in list(tone_map_col.keys()):
            if key in row:
                print(df[column][i], end=' -> ')
                row_list = list(df[column][i])
                row_list[row.index(key)] = tone_map_col[key]
                temp = "".join(row_list)
                temp = ''.join([i for i in temp if not i.isdigit()]) + ''.join([i for i in temp if i.isdigit()])
                df[column][i] = temp
                print(df[column][i])
    return df

df2 = tone_converter(df2, 'Middle Chinese', tone_map)

khwen² -> khwen2
mæ² -> mæ2
sej¹ -> sej1
tshyæ¹ -> tshyæ1
kjo¹ -> kjo1
yuw² -> yuw2
ngjwot⁴ -> ngjwot4
ʔoj³ -> ʔoj3
kɛn¹ -> kɛn1
kɛn³ -> kɛn3
tsheng¹ -> tsheng1
tshyek⁴ -> tshyek4
tshjuw¹ -> tshjuw1
trjew¹ -> trjew1
drjew¹ -> drjew1
ngje² -> ngje2
kuw² -> kuw2
nyin¹ -> nyin1
sywij² -> sywij2
zyæ¹ -> zyæ1
ye¹ -> ye1
mej² -> mej2
leng¹ -> leng1
len¹ -> len1
xan³ -> xan3
dzrjang¹ -> dzrjang1
mwa¹ -> mwa1
dzyieng¹ -> dzyieng1
buwk⁴ -> buwk4
kjun¹ -> kjun1
kim¹ -> kim1
ʔit⁴ -> ʔit4
swon² -> swon2
khuwng¹ -> khuwng1
khuwng³ -> khuwng3
tej² -> tej2
zjang² -> zjang2
dzong¹ -> dzong1
dzri² -> dzri2
tshen¹ -> tshen1
pæk⁴ -> pæk4
dzyip⁴ -> dzyip4
ʔik⁴ -> ʔik4
kwon² -> kwon2
trju² -> trju2
phet⁴ -> phet4
ʔit⁴ -> ʔit4
gjwot⁴ -> gjwot4
nyij³ -> nyij3
nyip⁴ -> nyip4
pɛt⁴ -> pɛt4
mek⁴ -> mek4
ping¹ -> ping1
teng¹ -> teng1
trɛng¹ -> trɛng1
tshwon¹ -> tshwon1
mjun¹ -> mjun1
kek⁴ -> kek4
kɛj³ -> kɛj3
tsyang¹ -> tsyang1
pen¹ -> pen1
zyin¹ -> zyin1
sam¹ -> sam1
sam³ -> sam3
sij³ -> sij3
ngu² -> ngu2
ljuwk⁴

In [12]:
# b. Mandarin
tone_map['Mandarin'] = {
    '\u0101': 'a1',
    '\u00e1': 'a2',
    '\u01ce': 'a3',
    '\u00e0': 'a4',
    '\u014d': 'o1',
    '\u00f3': 'o2',
    '\u01d2': 'o3',
    '\u00f2': 'o4',
    '\u0113': 'e1',
    '\u00e9': 'e2',
    '\u011b': 'e3',
    '\u00e8': 'e4',
    '\u012b': 'i1',
    '\u00ed': 'i2',
    '\u01d0': 'i3',
    '\u00ec': 'i4',
    '\u016b': 'u1',
    '\u00fa': 'u2',
    '\u01d4': 'u3',
    '\u00f9': 'u4',
    '\u01d6': 'ü1',
    '\u01d8': 'ü2',
    '\u01da': 'ü3',
    '\u01dc': 'ü4'
}
print(tone_map['Mandarin'].keys())

dict_keys(['ā', 'á', 'ǎ', 'à', 'ō', 'ó', 'ǒ', 'ò', 'ē', 'é', 'ě', 'è', 'ī', 'í', 'ǐ', 'ì', 'ū', 'ú', 'ǔ', 'ù', 'ǖ', 'ǘ', 'ǚ', 'ǜ'])


In [13]:
df2 = tone_converter(df2, 'Mandarin', tone_map)

quǎn -> quan3
mǎ -> ma3
xī -> xi1
chē -> che1
jū -> ju1
yǒu -> you3
yuè -> yue4
ài -> ai4
jiān -> jian1
jiàn -> jian4
qīng -> qing1
chì -> chi4
qiū -> qiu1
zhāo -> zhao1
cháo -> chao2
yǐ -> yi3
gǒu -> gou3
rén -> ren2
shuǐ -> shui3
shé -> she2
yí -> yi2
mǐ -> mi3
líng -> ling2
lián -> lian2
hàn -> han4
chuáng -> chuang2
mó -> mo2
chéng -> cheng2
pú -> pu2
jūn -> jun1
jīn -> jin1
yī -> yi1
sǔn -> sun3
kōng -> kong1
kòng -> kong4
dǐ -> di3
xiàng -> xiang4
céng -> ceng2
shì -> shi4
qiān -> qian1
bǎi -> bai3
shí -> shi2
yì -> yi4
gǔn -> gun3
zhǔ -> zhu3
piě -> pie3
yǐ -> yi3
jué -> jue2
èr -> er4
rù -> ru4
bā -> ba1
jiōng -> jiong1
bīng -> bing1
dīng -> ding1
zhēng -> zheng1
cūn -> cun1
wén -> wen2
jī -> ji1
jiè -> jie4
zhāng -> zhang1
biān -> bian1
shén -> shen2
sān -> san1
sān -> san1
sì -> si4
wǔ -> wu3
liù -> liu4
qī -> qi1
jiǔ -> jiu3
mò -> mo4
zuì -> zui4
jìn -> jin4
liè -> lie4
zào -> zao4
zào -> zao4
dāo -> dao1
wàn -> wan4
hé -> he2
hè -> he4
xīn -> xin1
rì -> ri4
bái -> bai2
nián

In [14]:
print_all_chars(df2)

Character: 酊, 燃, 仞, 狨, 斛, 聱, 鰇, 抔, 抏, 苨, 羔, 䴳, 唾, 場, 軺, 鰥, 㼗, 㝅, 輹, 蓏, 傃, 烏, 濬, 熊, 鴻, 㔌, 洴, 顑, 叫, 甶, 柄, 稑, 頲, 薌, 統, 茷, 鞎, 刢, 䠯, 襁, 虍, 㶟, 孩, 榍, 分, 澭, 淃, 睥, 䬾, 詵, 抨, 㑗, 桄, 嗗, 䢼, 拸, 玷, 攽, 蝴, 畛, 糗, 旋, 燎, 瓣, 八, 致, 蔤, 欓, 瓬, 䯬, 紊, 寫, 嚌, 諑, 乖, 付, 剝, 陶, 嬈, 㔂, 㒝, 邗, 䬬, 琨, 夠, 加, 篌, 蠌, 虜, 髼, 涒, 獬, 葎, 䴑, 鳿, 翁, 嫻, 奇, 䟙, 洛, 池, 炕, 楀, 戈, 莝, 胯, 喙, 淩, 觡, 鉞, 悆, 韭, 㒔, 佊, 帛, 躋, 㐆, 遻, 褵, 弛, 灝, 䲯, 廇, 厹, 散, 複, 齾, 檎, 渶, 煠, 巘, 勘, 鯪, 䯊, 䱜, 面, 𦜖, 紵, 㣈, 酡, 冰, 褋, 瑣, 瞭, 屐, 訾, 虩, 孈, 腰, 庨, 彽, 狖, 興, 洊, 衹, 崦, 敔, 庡, 棌, 岈, 鞏, 笅, 嗼, 侶, 婕, 閥, 膙, 杸, 三, 醪, 沁, 戇, 藟, 偽, 堪, 磋, 㒸, 貐, 贈, 赴, 㟍, 癭, 摘, 佩, 拾, 葟, 鸑, 娖, 勰, 濞, 枉, 恚, 霧, 槥, 萎, 女, 恄, 夬, 蠐, 劇, 䴹, 徥, 典, 耇, 點, 緒, 撘, 介, 邠, 㠊, 澈, 膍, 向, 疕, 稟, 禂, 箑, 銛, 蠩, 緇, 凰, 答, 挽, 滈, 晏, 匜, 恞, 駒, 愩, 鼤, 稗, 瞿, 樞, 盩, 暾, 扮, 皆, 竟, 徠, 鬹, 晶, 踆, 軨, 暕, 橖, 掉, 涅, 惋, 秝, 㩇, 掬, 蘦, 䪴, 憢, 鄅, 慄, 檼, 斣, 筮, 麕, 炓, 病, 誨, 倌, 輬, 廚, 伸, 微, 瘁, 羴, 鯧, 橢, 欯, 咫, 俅, 垔, 硜, 訏, 蘇, 䴭, 緝, 璀, 鏖, 軷, 輟, 肯, 湨, 漘, 疸, 齗, 栽, 籠, 鷗, 腬, 愺, 岟, 霍, 刪, 釦, 珦, 㶁, 櫻, 斒, 侙, 䡩, 眺, 璽, 桷, 箅, 僖, 枼, 鐫, 歋, 訟, 猈, 棘, 繄, 玈, 鶟, 齵, 䭢, 鯉, 螋, 房, 閠, 䵒, 蚖, 嬥, 劊, 鶡,

In [15]:
def locate_rows_with_character(df, column, character):
    matching_rows = df[df[column].str.contains(character, na=False)]
    
    if not matching_rows.empty:
        print(f"Rows in column '{column}' containing '{character}':")
        print(matching_rows)
        print()

In [16]:
locate_rows_with_character(df2, 'Mandarin', '㣇')

Rows in column 'Mandarin' containing '㣇':
      Character Middle Chinese Cantonese Mandarin
11233         㣇           yij3      dai6        㣇



In [17]:
df2['Mandarin'][11233] = 'yi4'
df2.iloc[11233]

Character            㣇
Middle Chinese    yij3
Cantonese         dai6
Mandarin           yi4
Name: 11233, dtype: object

In [18]:
print_all_chars(df2)

Character: 酊, 燃, 仞, 狨, 斛, 聱, 鰇, 抔, 抏, 苨, 羔, 䴳, 唾, 場, 軺, 鰥, 㼗, 㝅, 輹, 蓏, 傃, 烏, 濬, 熊, 鴻, 㔌, 洴, 顑, 叫, 甶, 柄, 稑, 頲, 薌, 統, 茷, 鞎, 刢, 䠯, 襁, 虍, 㶟, 孩, 榍, 分, 澭, 淃, 睥, 䬾, 詵, 抨, 㑗, 桄, 嗗, 䢼, 拸, 玷, 攽, 蝴, 畛, 糗, 旋, 燎, 瓣, 八, 致, 蔤, 欓, 瓬, 䯬, 紊, 寫, 嚌, 諑, 乖, 付, 剝, 陶, 嬈, 㔂, 㒝, 邗, 䬬, 琨, 夠, 加, 篌, 蠌, 虜, 髼, 涒, 獬, 葎, 䴑, 鳿, 翁, 嫻, 奇, 䟙, 洛, 池, 炕, 楀, 戈, 莝, 胯, 喙, 淩, 觡, 鉞, 悆, 韭, 㒔, 佊, 帛, 躋, 㐆, 遻, 褵, 弛, 灝, 䲯, 廇, 厹, 散, 複, 齾, 檎, 渶, 煠, 巘, 勘, 鯪, 䯊, 䱜, 面, 𦜖, 紵, 㣈, 酡, 冰, 褋, 瑣, 瞭, 屐, 訾, 虩, 孈, 腰, 庨, 彽, 狖, 興, 洊, 衹, 崦, 敔, 庡, 棌, 岈, 鞏, 笅, 嗼, 侶, 婕, 閥, 膙, 杸, 三, 醪, 沁, 戇, 藟, 偽, 堪, 磋, 㒸, 貐, 贈, 赴, 㟍, 癭, 摘, 佩, 拾, 葟, 鸑, 娖, 勰, 濞, 枉, 恚, 霧, 槥, 萎, 女, 恄, 夬, 蠐, 劇, 䴹, 徥, 典, 耇, 點, 緒, 撘, 介, 邠, 㠊, 澈, 膍, 向, 疕, 稟, 禂, 箑, 銛, 蠩, 緇, 凰, 答, 挽, 滈, 晏, 匜, 恞, 駒, 愩, 鼤, 稗, 瞿, 樞, 盩, 暾, 扮, 皆, 竟, 徠, 鬹, 晶, 踆, 軨, 暕, 橖, 掉, 涅, 惋, 秝, 㩇, 掬, 蘦, 䪴, 憢, 鄅, 慄, 檼, 斣, 筮, 麕, 炓, 病, 誨, 倌, 輬, 廚, 伸, 微, 瘁, 羴, 鯧, 橢, 欯, 咫, 俅, 垔, 硜, 訏, 蘇, 䴭, 緝, 璀, 鏖, 軷, 輟, 肯, 湨, 漘, 疸, 齗, 栽, 籠, 鷗, 腬, 愺, 岟, 霍, 刪, 釦, 珦, 㶁, 櫻, 斒, 侙, 䡩, 眺, 璽, 桷, 箅, 僖, 枼, 鐫, 歋, 訟, 猈, 棘, 繄, 玈, 鶟, 齵, 䭢, 鯉, 螋, 房, 閠, 䵒, 蚖, 嬥, 劊, 鶡,

In [19]:
df2.to_csv('./dataset/ltc_yue_cmn.csv', index=False)

### If we have only Cantonese input

In [20]:
df_yue = raw_df[['Character', 'Cantonese', 
          'Mandarin']]
df_yue = drop_missing_raws(df_yue)
df_yue

Unnamed: 0,Character,Cantonese,Mandarin
0,犬,hyun2,quǎn
1,馬,maa5,mǎ
2,西,sai1,xī
3,車,ce1/geoi1,chē
4,車,geoi1,jū
...,...,...,...
16234,𩥅,tou1,tāo
16235,𩥉,gei3,jì
16236,𪆴,kwai4,kuí
16237,𪍑,ciu2,chǎo


In [21]:
# Cleanse Mandarin
df_yue['Mandarin'] = df_yue['Mandarin'].str.lower()
df_yue = delete_after_slash(df_yue)

df_yue = tone_converter(df_yue, 'Mandarin', tone_map)
df_yue

quǎn -> quan3
mǎ -> ma3
xī -> xi1
chē -> che1
jū -> ju1
yǒu -> you3
yuè -> yue4
ài -> ai4
jiān -> jian1
jiàn -> jian4
qīng -> qing1
jīng -> jing1
chì -> chi4
qiū -> qiu1
zhāo -> zhao1
cháo -> chao2
yǐ -> yi3
gǒu -> gou3
rén -> ren2
shuǐ -> shui3
shé -> she2
yí -> yi2
mǐ -> mi3
mǐ -> mi3
líng -> ling2
lián -> lian2
hàn -> han4
chuáng -> chuang2
shuǐ -> shui3
mó -> mo2
chéng -> cheng2
pú -> pu2
jūn -> jun1
jīn -> jin1
yī -> yi1
sǔn -> sun3
kōng -> kong1
kòng -> kong4
dǐ -> di3
xiàng -> xiang4
céng -> ceng2
shì -> shi4
qiān -> qian1
bǎi -> bai3
shí -> shi2
yì -> yi4
gǔn -> gun3
zhǔ -> zhu3
piě -> pie3
yǐ -> yi3
jué -> jue2
èr -> er4
tóu -> tou2
rù -> ru4
bā -> ba1
jiōng -> jiong1
yún -> yun2
bīng -> bing1
dīng -> ding1
zhēng -> zheng1
cūn -> cun1
wén -> wen2
jī -> ji1
jiè -> jie4
zhāng -> zhang1
biān -> bian1
shén -> shen2
shēn -> shen1
sān -> san1
sān -> san1
sì -> si4
wǔ -> wu3
liù -> liu4
qī -> qi1
jiǔ -> jiu3
mò -> mo4
zuì -> zui4
jìn -> jin4
liè -> lie4
zào -> zao4
zào -> zao4
zào ->

Unnamed: 0,Character,Cantonese,Mandarin
0,犬,hyun2,quan3
1,馬,maa5,ma3
2,西,sai1,xi1
3,車,ce1,che1
4,車,geoi1,ju1
...,...,...,...
16234,𩥅,tou1,tao1
16235,𩥉,gei3,ji4
16236,𪆴,kwai4,kui2
16237,𪍑,ciu2,chao3


In [22]:
print_all_chars(df_yue)

Character: 酊, 燃, 仞, 狨, 斛, 贌, 聱, 鰇, 抔, 抏, 苨, 吩, 羔, 䴳, 唾, 䵠, 㮝, 場, 軺, 鰥, 㼗, 㝅, 輹, 蓏, 傃, 烏, 濬, 熊, 鴻, 錂, 𡃉, 䮑, 㔌, 洴, 䴪, 顑, 叫, 甶, 柄, 稑, 頲, 薌, 統, 茷, 鞎, 刢, 䠯, 襁, 虍, 牬, 㶟, 䭞, 孩, 榍, 分, 澭, 淃, 睥, 誂, 䬾, 詵, 抨, 㺸, 㑗, 桄, 虵, 嗗, 䢼, 拸, 玷, 攽, 蝴, 㸱, 畛, 糗, 旋, 燎, 瓣, 㓵, 八, 稛, 致, 蔤, 欓, 瓬, 宱, 䯬, 圳, 紊, 勆, 寫, 碘, 䢗, 嚌, 妔, 諑, 乖, 付, 剝, 陶, 飦, 嬈, 㔂, 𩹎, 丟, 㒝, 邗, 䬬, 琨, 夠, 加, 篌, 蠌, 虜, 髼, 涒, 怑, 姯, 獬, 葎, 䴑, 鳿, 䯭, 尢, 翁, 㗊, 嫻, 奇, 䟙, 洛, 池, 炕, 楀, 戈, 莝, 胯, 喙, 淩, 觡, 鉞, 饃, 䫚, 圾, 悆, 韭, 籅, 㒔, 佊, 帛, 躋, 㐆, 遻, 昺, 潳, 銥, 褵, 弛, 灝, 䲯, 䱗, 颶, 媡, 廇, 㟻, 厹, 散, 複, 齾, 㙛, 檎, 䁏, 渶, 漇, 煠, 熉, 巘, 勘, 鯪, 堍, 䯊, 面, 䱜, 紵, 𦜖, 㣈, 槬, 酡, 䭫, 冰, 俀, 褋, 瑣, 瞭, 屐, 訾, 虩, 婊, 孈, 腰, 庨, 瀟, 䰔, 彽, 妟, 狖, 卾, 洊, 興, 衹, 崦, 銪, 枟, 敔, 庡, 棌, 賬, 岈, 鞏, 笅, 嗼, 侶, 婕, 灆, 膙, 閥, 䶟, 猁, 杸, 三, 醪, 埥, 幀, 沁, 戇, 藟, 偽, 堪, 磋, 㒸, 貐, 贈, 赴, 㟍, 緳, 癭, 抋, 鋬, 摘, 佩, 拾, 葟, 鸑, 娖, 勰, 濞, 枉, 彍, 恚, 浗, 霧, 樨, 爏, 䶍, 槥, 煂, 萎, 𠥍, 女, 恄, 鉾, 夬, 蠐, 夦, 屗, 㗂, 劇, 䴹, 欑, 徥, 典, 耇, 撐, 緒, 酶, 撘, 點, 介, 邠, 錨, 㠊, 嬵, 噚, 澈, 膍, 㾮, 向, 疕, 稟, 禂, 箑, 㞨, 銛, 蠩, 緇, 凰, 奙, 答, 挽, 榊, 滈, 碟, 晏, 㬈, 匜, 恞, 駒, 愩, 鼤, 𩷕, 稗, 瞿, 樞, 盩, 暾, 扮, 汌, 皆, 竟, 徠, 鬹,

In [23]:
outlier_list = ['ḿ', 'ề', 'ǹ','\u0300', '䴉', '㣇', 'ń']
for outl in outlier_list:
    locate_rows_with_character(df_yue, 'Mandarin', outl)

Rows in column 'Mandarin' containing 'ḿ':
     Character Cantonese Mandarin
1545         呣        m2        ḿ
2005         嘸        m4        ḿ

Rows in column 'Mandarin' containing 'ề':
      Character Cantonese Mandarin
6139          欸       ei6        ề
11457         誒        e6        ề

Rows in column 'Mandarin' containing 'ǹ':
     Character Cantonese Mandarin
1929         嗯       ng6       ǹg

Rows in column 'Mandarin' containing '̀':
     Character Cantonese Mandarin
1546         呣        m6       m̀

Rows in column 'Mandarin' containing '䴉':
      Character Cantonese Mandarin
16046         䴉     waan4        䴉

Rows in column 'Mandarin' containing '㣇':
      Character Cantonese Mandarin
15136         㣇      dai6        㣇

Rows in column 'Mandarin' containing 'ń':
     Character Cantonese Mandarin
1719         唔        m4       ńg
1928         嗯       ng2       ńg



In [24]:
df_yue = df_yue.drop([1545, 2005, 6139, 11457, 1929, 1546, 1719, 1928])
df_yue['Mandarin'][15136] = 'yi4'
df_yue['Mandarin'][16046] = 'huan2'
print_all_chars(df_yue)

Character: 酊, 燃, 仞, 狨, 斛, 贌, 聱, 鰇, 抔, 抏, 苨, 吩, 羔, 䴳, 唾, 䵠, 㮝, 場, 軺, 鰥, 㼗, 㝅, 輹, 蓏, 傃, 烏, 濬, 熊, 鴻, 錂, 𡃉, 䮑, 㔌, 洴, 䴪, 顑, 叫, 甶, 柄, 稑, 頲, 薌, 統, 茷, 鞎, 刢, 䠯, 襁, 虍, 牬, 㶟, 䭞, 孩, 榍, 分, 澭, 淃, 睥, 誂, 䬾, 詵, 抨, 㺸, 㑗, 桄, 虵, 嗗, 䢼, 拸, 玷, 攽, 蝴, 㸱, 畛, 糗, 旋, 燎, 瓣, 㓵, 八, 稛, 致, 蔤, 欓, 瓬, 宱, 䯬, 圳, 紊, 勆, 寫, 碘, 䢗, 嚌, 妔, 諑, 乖, 付, 剝, 陶, 飦, 嬈, 㔂, 𩹎, 丟, 㒝, 邗, 䬬, 琨, 夠, 加, 篌, 蠌, 虜, 髼, 涒, 怑, 姯, 獬, 葎, 䴑, 鳿, 䯭, 尢, 翁, 㗊, 嫻, 奇, 䟙, 洛, 池, 炕, 楀, 戈, 莝, 胯, 喙, 淩, 觡, 鉞, 饃, 䫚, 圾, 悆, 韭, 籅, 㒔, 佊, 帛, 躋, 㐆, 遻, 昺, 潳, 銥, 褵, 弛, 灝, 䲯, 䱗, 颶, 媡, 廇, 㟻, 厹, 散, 複, 齾, 㙛, 檎, 䁏, 渶, 漇, 煠, 熉, 巘, 勘, 鯪, 堍, 䯊, 面, 䱜, 紵, 𦜖, 㣈, 槬, 酡, 䭫, 冰, 俀, 褋, 瑣, 瞭, 屐, 訾, 虩, 婊, 孈, 腰, 庨, 瀟, 䰔, 彽, 妟, 狖, 卾, 洊, 興, 衹, 崦, 銪, 枟, 敔, 庡, 棌, 賬, 岈, 鞏, 笅, 嗼, 侶, 婕, 灆, 膙, 閥, 䶟, 猁, 杸, 三, 醪, 埥, 幀, 沁, 戇, 藟, 偽, 堪, 磋, 㒸, 貐, 贈, 赴, 㟍, 緳, 癭, 抋, 鋬, 摘, 佩, 拾, 葟, 鸑, 娖, 勰, 濞, 枉, 彍, 恚, 浗, 霧, 樨, 爏, 䶍, 槥, 煂, 萎, 𠥍, 女, 恄, 鉾, 夬, 蠐, 夦, 屗, 㗂, 劇, 䴹, 欑, 徥, 典, 耇, 撐, 緒, 酶, 撘, 點, 介, 邠, 錨, 㠊, 嬵, 噚, 澈, 膍, 㾮, 向, 疕, 稟, 禂, 箑, 㞨, 銛, 蠩, 緇, 凰, 奙, 答, 挽, 榊, 滈, 碟, 晏, 㬈, 匜, 恞, 駒, 愩, 鼤, 𩷕, 稗, 瞿, 樞, 盩, 暾, 扮, 汌, 皆, 竟, 徠, 鬹,

In [25]:
# Cleanse Cantonese
locate_rows_with_character(df_yue, 'Cantonese', ' ')

Rows in column 'Cantonese' containing ' ':
     Character   Cantonese  Mandarin
967          兙   sap6 hak1   shike24
969          兛   cin1 hak1  qianke41
971          兝   fan1 hak1   fenke41
972          兞   hou4 hak1   maoke42
974          兡  baak3 hak1   baike43
976          兣   lei4 hak1  gongli31
1923         嗧  gaa1 leon4  jialun21
8065         瓩  cin1 ngaa5  qianwa31



In [26]:
df_yue = df_yue.drop([967, 969, 971, 972, 974, 976, 1923, 8065])
df_yue = df_yue.reset_index(drop=True)
print_all_chars(df_yue)

Character: 酊, 燃, 仞, 狨, 斛, 贌, 聱, 鰇, 抔, 抏, 苨, 吩, 羔, 䴳, 唾, 䵠, 㮝, 場, 軺, 鰥, 㼗, 㝅, 輹, 蓏, 傃, 烏, 濬, 熊, 鴻, 錂, 𡃉, 䮑, 㔌, 洴, 䴪, 顑, 叫, 甶, 柄, 稑, 頲, 薌, 統, 茷, 鞎, 刢, 䠯, 襁, 虍, 牬, 㶟, 䭞, 孩, 榍, 分, 澭, 淃, 睥, 誂, 䬾, 詵, 抨, 㺸, 㑗, 桄, 虵, 嗗, 䢼, 拸, 玷, 攽, 蝴, 㸱, 畛, 糗, 旋, 燎, 瓣, 㓵, 八, 稛, 致, 蔤, 欓, 瓬, 宱, 䯬, 圳, 紊, 勆, 寫, 碘, 䢗, 嚌, 妔, 諑, 乖, 付, 剝, 陶, 飦, 嬈, 㔂, 𩹎, 丟, 㒝, 邗, 䬬, 琨, 夠, 加, 篌, 蠌, 虜, 髼, 涒, 怑, 姯, 獬, 葎, 䴑, 鳿, 䯭, 尢, 翁, 㗊, 嫻, 奇, 䟙, 洛, 池, 炕, 楀, 戈, 莝, 胯, 喙, 淩, 觡, 鉞, 饃, 䫚, 圾, 悆, 韭, 籅, 㒔, 佊, 帛, 躋, 㐆, 遻, 昺, 潳, 銥, 褵, 弛, 灝, 䲯, 䱗, 颶, 媡, 廇, 㟻, 厹, 散, 複, 齾, 㙛, 檎, 䁏, 渶, 漇, 煠, 熉, 巘, 勘, 鯪, 堍, 䯊, 面, 䱜, 紵, 𦜖, 㣈, 槬, 酡, 䭫, 冰, 俀, 褋, 瑣, 瞭, 屐, 訾, 虩, 婊, 孈, 腰, 庨, 瀟, 䰔, 彽, 妟, 狖, 卾, 洊, 興, 衹, 崦, 銪, 枟, 敔, 庡, 棌, 賬, 岈, 鞏, 笅, 嗼, 侶, 婕, 灆, 膙, 閥, 䶟, 猁, 杸, 三, 醪, 埥, 幀, 沁, 戇, 藟, 偽, 堪, 磋, 㒸, 貐, 贈, 赴, 㟍, 緳, 癭, 抋, 鋬, 摘, 佩, 拾, 葟, 鸑, 娖, 勰, 濞, 枉, 彍, 恚, 浗, 霧, 樨, 爏, 䶍, 槥, 煂, 萎, 𠥍, 女, 恄, 鉾, 夬, 蠐, 夦, 屗, 㗂, 劇, 䴹, 欑, 徥, 典, 耇, 撐, 緒, 酶, 撘, 點, 介, 邠, 錨, 㠊, 嬵, 噚, 澈, 膍, 㾮, 向, 疕, 稟, 禂, 箑, 㞨, 銛, 蠩, 緇, 凰, 奙, 答, 挽, 榊, 滈, 碟, 晏, 㬈, 匜, 恞, 駒, 愩, 鼤, 𩷕, 稗, 瞿, 樞, 盩, 暾, 扮, 汌, 皆, 竟, 徠, 鬹,

In [27]:
df_yue.to_csv('./dataset/yue_cmn.csv', index=False)