In [2]:
import pandas as pd
import pickle
import re

from pathlib2 import Path


In [3]:
# Directory that holds the raw label files read below and receives the cleaned outputs.
labels_dir = Path('labels')

In [14]:
def filter_chinese_characters(char_list):
    """Return the entries of ``char_list`` that start with a Chinese character.

    An entry is kept when it is a string whose first character lies in the
    range U+4E00-U+9FFF. Only the first character is inspected, so a longer
    string is kept as long as it begins with a character from that range.

    Args:
        char_list: Iterable of candidate entries, normally strings.

    Returns:
        A list with the kept entries in their original order. Non-string
        entries (e.g. ``None`` or a float NaN) are skipped rather than
        raising ``TypeError`` inside the regex call.
    """
    # Compile once so the pattern is not looked up again for every entry.
    chinese_character_pattern = re.compile(r'[\u4e00-\u9fff]')
    return [
        char for char in char_list
        if isinstance(char, str) and chinese_character_pattern.match(char)
    ]


## Clean visual features

In [4]:
# Load the raw visual labels; the file is tab-separated despite its .csv extension.
visual_labels_df = pd.read_csv(labels_dir / 'visual_labels.csv', sep='\t')
# Preview the first rows to check the parsed columns.
visual_labels_df.head()

Unnamed: 0.1,Unnamed: 0,ID,汉字,常用,结构
0,0,1,一,1,单一
1,1,2,乙,1,单一
2,2,3,二,1,单一
3,3,4,十,1,单一
4,4,5,丁,1,单一


In [5]:
# Keep only the rows whose '常用' flag equals 1. Take an explicit copy so that the
# column assignments in the following cells write to an independent frame rather
# than to a slice of the loaded one (which triggers SettingWithCopyWarning).
visual_labels_df = visual_labels_df[visual_labels_df['常用'] == 1].copy()

In [6]:
# List the distinct values of the '结构' column that remain after filtering.
visual_labels_df['结构'].unique()

array(['单一', '上下', '左右', '左上包', '半包', '上三包', '下三包', '右上包', '全包', '左下包',
       '上中下', '品字', '左中右', '镶嵌'], dtype=object)

In [68]:
# Lookup from the raw values of the '结构' column to the English names
# that replace them in the cleaned labels.
mapping_dict = {
    '单一': 'Unicomponent',
    '上下': 'Vertical juxtaposition',
    '左右': 'Horizontal juxtaposition',
    '左上包': 'Upper-left enclosing',
    '半包': 'Semi-enclosing',
    '上三包': 'Three-part upper enclosing',
    '下三包': 'Three-part lower enclosing',
    '右上包': 'Upper-right enclosing',
    '全包': 'Complete enclosing',
    '左下包': 'Lower-left enclosing',
    '镶嵌': 'Embedded',
    '上中下': 'Vertical trisection',
    '品字': 'Trigram',
    '左中右': 'Horizontal trisection',
}

In [69]:
# Replace every raw structure value with its English name; values that have
# no entry in mapping_dict are labelled 'Other'.
visual_labels_df['结构'] = visual_labels_df['结构'].map(
    lambda raw_label: mapping_dict.get(raw_label, 'Other')
)

In [70]:
# Give the two kept columns English names. The renamed frame is rebound to the
# same variable instead of mutating in place (pandas discourages inplace=True).
visual_labels_df = visual_labels_df.rename(
    columns={'结构': 'Structure', '汉字': 'Hanzi'}
)

In [71]:
# Number the distinct structure names in order of first appearance.
# save_index_dict maps id -> name (this is what gets pickled);
# index_dict is the inverse, name -> id, used to label the rows.
save_index_dict = dict(enumerate(visual_labels_df['Structure'].unique()))
index_dict = {structure: i for i, structure in save_index_dict.items()}

# Use a context manager so the pickle file is flushed and closed deterministically
# instead of leaving the handle open until garbage collection.
with open(labels_dir / 'visual_labels_index_dict.pkl', 'wb') as index_file:
    pickle.dump(save_index_dict, index_file)

In [72]:
# Attach the integer id of each row's structure name; a name missing from
# index_dict raises KeyError.
visual_labels_df['label_index'] = visual_labels_df['Structure'].map(index_dict.__getitem__)

In [73]:
# Remove the helper columns that are not part of the cleaned output. The result
# is rebound to the same variable instead of mutating in place (pandas
# discourages inplace=True).
visual_labels_df = visual_labels_df.drop(columns=['Unnamed: 0', 'ID', '常用'])

In [74]:
# Write the cleaned visual labels; index=False leaves the row index out of the file.
visual_labels_df.to_csv(labels_dir / 'visual_labels_cleaned.csv', index=False)

## Clean definition labels

In [59]:
# Load the raw definition labels (tab-separated); index_col=0 uses the file's
# first column as the row index.
def_labels = pd.read_csv(labels_dir / 'definition_labels.csv', sep='\t', index_col=0)

In [60]:
# Preview the first rows to check the parsed columns.
def_labels.head()

Unnamed: 0,汉字,形声,形,声
0,一,0,,
1,丁,0,,
2,七,0,,
3,万,0,,
4,丈,0,,


In [61]:
# Give the columns English names. The renamed frame is rebound to the same
# variable instead of mutating in place (pandas discourages inplace=True).
def_labels = def_labels.rename(
    columns={
        '汉字': 'Hanzi',
        '形声': 'pictophonetic',
        '形': 'semantic_component',
        '声': 'phonetic_component',
    }
)


In [62]:
# Collect the distinct, non-missing values of each component column and keep
# only those accepted by filter_chinese_characters.
semantic_components, phonetic_components = (
    filter_chinese_characters(def_labels[column_name].dropna().unique().tolist())
    for column_name in ('semantic_component', 'phonetic_component')
)

# Report how many distinct components of each kind remain.
print('Have {} semantic components'.format(len(semantic_components)))
print('Have {} phonetic components'.format(len(phonetic_components)))

Have 204 semantic components
Have 917 phonetic components


In [63]:
# Number the components in list order. The save_* dicts map id -> component
# (these are pickled); the *_index_dict dicts are the inverse, component -> id.
save_semantic_index_dict = dict(enumerate(semantic_components))
semantic_index_dict = {
    component: i for i, component in save_semantic_index_dict.items()
}

# The loop variable is named `component` so it no longer shadows the
# `phonetic_components` list it iterates over.
save_phonetic_index_dict = dict(enumerate(phonetic_components))
phonetic_index_dict = {
    component: i for i, component in save_phonetic_index_dict.items()
}

# Use context managers so both pickle files are flushed and closed
# deterministically instead of leaving the handles open until garbage collection.
with open(labels_dir / 'semantic_labels_index_dict.pkl', 'wb') as semantic_file:
    pickle.dump(save_semantic_index_dict, semantic_file)
with open(labels_dir / 'phonetic_labels_index_dict.pkl', 'wb') as phonetic_file:
    pickle.dump(save_phonetic_index_dict, phonetic_file)

In [66]:
# Keep only the rows whose component is one of the retained components; each
# subset is an independent copy so columns can be added to it safely.
semantic_labels = def_labels.loc[
    def_labels['semantic_component'].isin(semantic_components)
].copy()
phonetic_labels = def_labels.loc[
    def_labels['phonetic_component'].isin(phonetic_components)
].copy()

In [67]:
# Attach the integer id of each row's component; a component missing from the
# lookup dict raises KeyError.
semantic_labels['semantic_index'] = semantic_labels['semantic_component'].map(
    semantic_index_dict.__getitem__
)
phonetic_labels['phonetic_index'] = phonetic_labels['phonetic_component'].map(
    phonetic_index_dict.__getitem__
)

In [68]:
# Write one file per component kind, each holding the 'Hanzi' column and the
# integer id under the shared column name 'label_index' (row index omitted).
(
    semantic_labels
    .loc[:, ['Hanzi', 'semantic_index']]
    .rename(columns={'semantic_index': 'label_index'})
    .to_csv(labels_dir / 'semantic_labels_cleaned.csv', index=False)
)
(
    phonetic_labels
    .loc[:, ['Hanzi', 'phonetic_index']]
    .rename(columns={'phonetic_index': 'label_index'})
    .to_csv(labels_dir / 'phonetic_labels_cleaned.csv', index=False)
)