In [52]:
import pandas as pd
import pickle

from pathlib2 import Path


In [53]:
# Directory (relative to the notebook's working directory) that holds the
# label files read and written by the cells below.
labels_dir = Path('labels')

## Clean visual features (character structure labels)

In [65]:
# Load the raw label table. The file is tab-separated despite its .csv suffix.
visual_labels_path = labels_dir / 'visual_labels.csv'
visual_labels_df = pd.read_csv(visual_labels_path, sep='\t')
# Preview the first rows.
visual_labels_df.head()

Unnamed: 0.1,Unnamed: 0,ID,汉字,常用,结构
0,0,1,一,1,单一
1,1,2,乙,1,单一
2,2,3,二,1,单一
3,3,4,十,1,单一
4,4,5,丁,1,单一


In [66]:
# Keep only the rows whose '常用' ("commonly used") flag equals 1.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments in later cells do not write into a slice of the original
# frame (which is what triggers pandas' SettingWithCopyWarning).
visual_labels_df = visual_labels_df[visual_labels_df['常用'] == 1].copy()

In [67]:
# Show the distinct raw values of the '结构' column; these are the keys that
# mapping_dict translates in the next cell.
visual_labels_df['结构'].unique()

array(['单一', '上下', '左右', '左上包', '半包', '上三包', '下三包', '右上包', '全包', '左下包',
       '上中下', '品字', '左中右', '镶嵌'], dtype=object)

In [68]:
# Translation table from the raw Chinese structure labels to the English names
# used in the cleaned output. Built from (raw, english) pairs; insertion order
# is kept as-is.
mapping_dict = dict([
    # single component
    ('单一', 'Unicomponent'),
    # two-part juxtapositions
    ('上下', 'Vertical juxtaposition'),
    ('左右', 'Horizontal juxtaposition'),
    # enclosing structures
    ('左上包', 'Upper-left enclosing'),
    ('半包', 'Semi-enclosing'),
    ('上三包', 'Three-part upper enclosing'),
    ('下三包', 'Three-part lower enclosing'),
    ('右上包', 'Upper-right enclosing'),
    ('全包', 'Complete enclosing'),
    ('左下包', 'Lower-left enclosing'),
    ('镶嵌', 'Embedded'),
    # three-part structures
    ('上中下', 'Vertical trisection'),
    ('品字', 'Trigram'),
    ('左中右', 'Horizontal trisection'),
])

In [69]:
# Replace each raw structure label with its English name; any value that has
# no entry in mapping_dict becomes 'Other'.
visual_labels_df['结构'] = visual_labels_df['结构'].map(
    lambda raw_label: mapping_dict.get(raw_label, 'Other')
)

In [70]:
# Give the structure and character columns English names. The renamed frame
# is bound back to the same variable instead of mutating in place
# (inplace=True), which is the form the pandas documentation recommends.
visual_labels_df = visual_labels_df.rename(
    columns={'结构': 'Structure', '汉字': 'Hanzi'}
)

In [71]:
# Assign every distinct structure name an integer index, numbered in order of
# first appearance in the 'Structure' column. unique() is evaluated once so
# both lookup directions are built from the same sequence.
unique_structures = visual_labels_df['Structure'].unique()

# index -> structure name (this is the mapping that gets persisted).
save_index_dict = dict(enumerate(unique_structures))
# structure name -> index (exact inverse of save_index_dict; used to label rows).
index_dict = {structure: i for i, structure in save_index_dict.items()}

# Persist the index -> structure mapping. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open(labels_dir / 'visual_labels_index_dict.pkl', 'wb') as index_file:
    pickle.dump(save_index_dict, index_file)

In [72]:
# Attach the integer class index for each row's structure name. Looking the
# value up through __getitem__ (rather than passing the dict itself to map)
# raises KeyError for a name missing from index_dict instead of yielding NaN.
visual_labels_df['label_index'] = visual_labels_df['Structure'].map(index_dict.__getitem__)

In [73]:
# Remove the columns that are not written to the cleaned file. The result is
# bound back to the same variable instead of mutating in place (inplace=True).
# NOTE(review): the head() output near the top of the notebook also lists an
# 'Unnamed: 0.1' header. If that is a real column rather than the rendered
# index, it is not dropped here and will end up in the cleaned CSV — confirm.
visual_labels_df = visual_labels_df.drop(columns=['Unnamed: 0', 'ID', '常用'])

In [74]:
# Write the cleaned table without the row index. No separator is passed, so
# the output is comma-separated (pandas' default), unlike the tab-separated input.
cleaned_labels_path = labels_dir / 'visual_labels_cleaned.csv'
visual_labels_df.to_csv(cleaned_labels_path, index=False)