In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Data import & cleansing

In [None]:
# Load the tab-separated romanization table; the path is relative to the notebook's working directory.
raw_df = pd.read_csv('dataset/wikihan-romanization.tsv', sep='\t')

In [None]:
# Show the table exactly as loaded (the cell's last expression is displayed).
raw_df

In [None]:
# Keep the character and its three readings, and shorten the Middle Chinese
# column header so later cells can refer to it as 'Middle Chinese'.
df2 = (
    raw_df
    .loc[:, ['Character', 'Middle Chinese (Baxter and Sagart 2014)', 'Cantonese', 'Mandarin']]
    .rename(columns={'Middle Chinese (Baxter and Sagart 2014)': 'Middle Chinese'})
)
df2

In [None]:
# Delete rows that are not fully recorded
def drop_missing_raws(df):
    """Return only the rows of ``df`` in which every column is filled in.

    A cell counts as missing when it is null or holds the placeholder ``'-'``.
    The input frame is not modified.

    Args:
        df: DataFrame to filter; its index may be anything (the filtering is
            done by value, not by row position).

    Returns:
        A new DataFrame containing the fully recorded rows, with the
        placeholder already replaced by ``pd.NA`` and a fresh 0..n-1 index.
    """
    # dropna() selects rows by content, so it stays correct even when the
    # incoming index is not the default RangeIndex (e.g. after earlier drops).
    return df.replace('-', pd.NA).dropna().reset_index(drop=True)

# Keep only the fully recorded rows of the working table, then display it.
df2 = drop_missing_raws(df2)
df2

In [None]:
# Normalize the table representation
# Start by peeking at one Middle Chinese entry (the row labelled 1).
df2.loc[1, 'Middle Chinese']

In [None]:
def get_unique_characters(df):
    """Collect the distinct characters used in each column of ``df``.

    Every value is converted to ``str`` and the column is concatenated into
    one string before its characters are gathered.

    Returns:
        dict mapping each column label to a ``set`` of single characters.
    """
    return {
        name: set(''.join(df[name].astype(str)))
        for name in df.columns
    }

def print_all_chars(df):
    """Print, for each column of ``df``, the distinct characters found in it.

    One line is printed per column, formatted ``<column>: <c1>, <c2>, ...``.
    The characters are sorted so that repeated calls print them in a stable
    order; joining the raw set would list them in arbitrary order, which
    makes successive outputs hard to compare while cleaning.

    Returns:
        None. The function only prints.
    """
    for column, unique_chars in get_unique_characters(df).items():
        print(f'{column}: {", ".join(sorted(unique_chars))}')

# print_all_chars already prints and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
print_all_chars(df2)

In [None]:
# 1. Capital letters to lowercase
df2 = df2.assign(Mandarin=df2['Mandarin'].str.lower())

In [None]:
# 2. Delete after slash '/'
def delete_after_slash(df):
    """Truncate every string cell of ``df`` at its first ``'/'``.

    ``'a/b/c'`` becomes ``'a'``; strings without a slash are unchanged.
    Cells that are not strings (missing values, numbers) are passed through
    untouched instead of raising.

    Args:
        df: DataFrame whose cells are processed element-wise.

    Returns:
        A new DataFrame of the same shape; ``df`` itself is not modified.
    """
    def _first_variant(cell):
        # Only strings can be split; anything else has nothing to cut.
        return cell.split('/')[0] if isinstance(cell, str) else cell

    # pandas 2.1 renamed DataFrame.applymap to DataFrame.map and deprecated the
    # old name; prefer the new one when this pandas version provides it.
    elementwise = df.map if hasattr(type(df), 'map') else df.applymap
    return elementwise(_first_variant)

# Apply the slash truncation to every column of the working table.
df2 = delete_after_slash(df2)

In [None]:
# Plain Python list of the 'Character' column.
# NOTE(review): `cc` is not referenced again in the cells visible here — confirm it is still needed.
cc = df2['Character'].tolist()

In [None]:
# 3. Normalize the tone representation
# a. Middle Chinese: superscript digits are mapped to plain ASCII digits.
# NOTE(review): this sub-step was originally labelled "Cantonese", but the map
# is stored under (and applied to) the 'Middle Chinese' column — confirm which
# column was intended.
tone_map = {
    'Middle Chinese': {"\u00b9": '1', "\u00b2": '2', "\u00b3": '3', "\u2074": '4'},
}
def tone_converter(df, column, tone_map, verbose=True):
    """Rewrite tone-marked strings in ``df[column]`` so tone digits trail the text.

    Every occurrence of a key of ``tone_map[column]`` is replaced by its mapped
    text, then all digit characters are moved to the end of the string. For
    example, with ``{'ǎ': 'a3'}`` the value ``'hǎo'`` becomes ``'hao3'``.
    Cells containing no mapped key, and cells that are not strings, are left
    untouched.

    Args:
        df: DataFrame holding the column to convert. The column is replaced on
            this object, whatever its index looks like.
        column: Column label; also the key selecting the sub-map in ``tone_map``.
        tone_map: dict of ``{column: {marked_text: replacement}}``.
        verbose: When true (the default), print ``old -> new`` for every cell
            that was changed. Pass ``False`` to convert silently.

    Returns:
        The same ``df`` object, with ``column`` converted.

    Raises:
        KeyError: if ``column`` is not a key of ``tone_map`` or not a column of ``df``.
    """
    replacements = tone_map[column]

    def _convert(value):
        # Non-string cells (e.g. missing values) cannot carry a tone mark.
        if not isinstance(value, str):
            return value
        if not any(mark in value for mark in replacements):
            return value
        substituted = value
        for mark, plain in replacements.items():
            substituted = substituted.replace(mark, plain)
        # Move every digit behind the non-digit characters, keeping relative order.
        letters = ''.join(ch for ch in substituted if not ch.isdigit())
        digits = ''.join(ch for ch in substituted if ch.isdigit())
        return letters + digits

    converted = []
    for value in df[column]:
        new_value = _convert(value)
        # _convert hands back the very same object when nothing was replaced.
        if verbose and new_value is not value:
            print(f'{value} -> {new_value}')
        converted.append(new_value)

    # Assign the whole column at once: chained `df[column][i] = ...` writes to a
    # temporary under pandas copy-on-write and relies on a 0..n-1 index.
    df[column] = converted
    return df

# Convert the 'Middle Chinese' column using its sub-map in tone_map.
df2 = tone_converter(df2, 'Middle Chinese', tone_map)

In [None]:
# b. Mandarin
# Each base vowel is listed with its four accented forms in tone order 1-4;
# an accented vowel maps to the bare vowel followed by its tone number.
tone_map['Mandarin'] = {
    marked: f'{base}{tone}'
    for base, marked_forms in (
        ('a', '\u0101\u00e1\u01ce\u00e0'),
        ('o', '\u014d\u00f3\u01d2\u00f2'),
        ('e', '\u0113\u00e9\u011b\u00e8'),
        ('i', '\u012b\u00ed\u01d0\u00ec'),
        ('u', '\u016b\u00fa\u01d4\u00f9'),
        ('ü', '\u01d6\u01d8\u01da\u01dc'),
    )
    for tone, marked in enumerate(marked_forms, start=1)
}
print(tone_map['Mandarin'].keys())

In [None]:
# Convert the 'Mandarin' column using the Mandarin sub-map registered in tone_map.
df2 = tone_converter(df2, 'Mandarin', tone_map)

In [None]:
# Re-list the characters per column to see what remains after tone conversion.
print_all_chars(df2)

In [None]:
def locate_rows_with_character(df, column, character):
    """Print the rows of ``df`` whose ``column`` value contains ``character``.

    Missing values in the column never match. Nothing is printed when no row
    matches.

    Args:
        df: DataFrame to search.
        column: Label of a string column of ``df``.
        character: Text to look for, matched literally.

    Returns:
        None. Matching rows are printed, not returned.
    """
    # regex=False: the needle is a literal character. The regex default would
    # make '.' match every row and raise on an unbalanced '('.
    matching_rows = df[df[column].str.contains(character, na=False, regex=False)]

    if not matching_rows.empty:
        print(f"Rows in column '{column}' containing '{character}':")
        print(matching_rows)
        print()

In [None]:
# Find Mandarin entries that contain the Han character '㣇' instead of a romanization.
locate_rows_with_character(df2, 'Mandarin', '㣇')

In [None]:
# Overwrite the Mandarin reading of the row labelled 11233 with 'yi4'.
# NOTE(review): 11233 is hard-coded, presumably the row reported by the '㣇'
# search above — verify it still matches if the dataset changes.
# A single .loc call is used because chained `df2['Mandarin'][11233] = ...`
# may write to a temporary copy and leave df2 unchanged.
df2.loc[11233, 'Mandarin'] = 'yi4'
df2.loc[11233]

In [None]:
# List the characters per column again to check the result of the manual fix.
print_all_chars(df2)

In [None]:
# Write the cleaned four-column table to CSV, without the row index.
df2.to_csv('./dataset/ltc_yue_cmn.csv', index=False)

### If we have only Cantonese input

In [None]:
# Start again from the raw table with only the character, Cantonese and
# Mandarin columns, keeping the rows where all three are recorded.
df_yue = drop_missing_raws(raw_df[['Character', 'Cantonese', 'Mandarin']])
df_yue

In [None]:
# Cleanse Mandarin
# Lowercase the Mandarin column, cut every cell at its first '/', then
# convert the Mandarin tone marks.
df_yue = delete_after_slash(
    df_yue.assign(Mandarin=df_yue['Mandarin'].str.lower())
)
df_yue = tone_converter(df_yue, 'Mandarin', tone_map)
df_yue

In [None]:
# List the characters per column to spot anything the tone map did not cover.
print_all_chars(df_yue)

In [None]:
# Characters to look up in the Mandarin column, one search per character.
# NOTE(review): presumably these are the leftovers seen in the character listing above — confirm.
outlier_list = ['ḿ', 'ề', 'ǹ','\u0300', '䴉', '㣇', 'ń']
for outl in outlier_list:
    # Prints the matching rows, if any, for this character.
    locate_rows_with_character(df_yue, 'Mandarin', outl)

In [None]:
# Drop eight rows by index label and patch two Mandarin readings by hand.
# NOTE(review): all labels are hard-coded, presumably taken from the outlier
# search above — verify them if the dataset changes.
df_yue = df_yue.drop([1545, 2005, 6139, 11457, 1929, 1546, 1719, 1928])
# Single .loc assignments: chained `df_yue['Mandarin'][label] = ...` may write
# to a temporary copy and leave df_yue unchanged.
df_yue.loc[15136, 'Mandarin'] = 'yi4'
df_yue.loc[16046, 'Mandarin'] = 'huan2'
print_all_chars(df_yue)

In [None]:
# Cleanse Cantonese
# List the Cantonese entries that contain a space.
locate_rows_with_character(df_yue, 'Cantonese', ' ')

In [None]:
# Remove eight rows by index label, then renumber the rows from 0.
# NOTE(review): the labels are hard-coded, presumably from the space search above — confirm.
df_yue = (
    df_yue
    .drop([967, 969, 971, 972, 974, 976, 1923, 8065])
    .reset_index(drop=True)
)
print_all_chars(df_yue)

In [None]:
# Write the cleaned Cantonese/Mandarin table to CSV, without the row index.
df_yue.to_csv('./dataset/yue_cmn.csv', index=False)