<pre>
Create pinyin data from Unihan kHanyuPinlu,kHanyuPinyin,kXHC1983 data,
This may not contains all GB18030 characters,but it cover most used.
and beyond this,this implemention given possbility to support Japanese and Korean.
</pre>

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re

In [2]:
import zipfile
archive = zipfile.ZipFile('data/Unihan.zip', 'r')
reading_file = archive.open('Unihan_Readings.txt')

In [3]:
df = pd.read_csv(reading_file, sep='\t', comment='#', names=['char', 'field', 'value'])

In [4]:
def char_parser(s: str) -> str:
    return chr(int(s[-4:], 16))

In [5]:
def kHanyuPinyin_parser(s: str) -> list:
    """
    >>> kHanyuPinyin_parser("10009.010:jī,qí")
    ['jī', 'qí']
    >>> kHanyuPinyin_parser("31641.040:hán,gàn 80023.120:gàn,hán")
    ['hán', 'gàn', 'gàn', 'hán']
    """
    blocks = s.split(' ')
    ret = []
    for b in blocks:
        ret.extend(b.split(':')[1].split(','))
    return ret

In [6]:
def kHanyuPinlu_parser(s: str) -> list:
    """
    >>> kHanyuPinlu_parser("shàng(12308) shang(392)")
    ['shàng', 'shang']
    """
    def keep_pinyin(s):
        """remove digits and ( and )"""
        return re.sub('[0-9\(\)]', '', s)
    return list(map(keep_pinyin, s.split()))

In [7]:
def kXHC1983_parser(s: str) -> list:
    """
    >>> kXHC1983_parser("k0811.021:ň  0826.021:ňg")
    ['ň', 'ňg']
    """
    return [part.split(':')[1] for part in s.split()]

In [8]:
set(df['field'])

{'kCantonese',
 'kDefinition',
 'kHangul',
 'kHanyuPinlu',
 'kHanyuPinyin',
 'kJapaneseKun',
 'kJapaneseOn',
 'kKorean',
 'kMandarin',
 'kTang',
 'kVietnamese',
 'kXHC1983'}

In [9]:
output = {}
for field in set(df['field']):
    parser = field + "_parser"
    if parser in globals().keys():
        output[field] = defaultdict(list)
        field_df = df[df['field'] == field]
        parser_fn = globals()[parser]
        for idx, row in field_df.iterrows():
            char = char_parser(row['char'])
            pinyin = parser_fn(row['value'])
            output[field][char].extend(pinyin)

In [10]:
py_dict = defaultdict(set)
for field in output.keys():
    for char in output[field].keys():
        py_dict[char].update(output[field][char])

In [11]:
import unicodedata
pure_py_dict = {}
def remove_accent(s: str) -> str:
    """
    >>> remove_accent("wā")
    'wa'
    """
    return (
        unicodedata
        .normalize('NFKD', s)
        .encode('ascii','ignore')
        .decode()
    )

for key in py_dict.keys():
    pure_py_dict[key] = list(sorted(set(map(remove_accent, py_dict[key]))))

In [12]:
import os
sep = os.linesep
with open('pyfiledir/py_dict.py','w', encoding="utf-8") as f:
    print("#!/usr/bin/env python",  end=sep, file=f)
    print("# -*- coding: utf-8 -*-",  end=sep, file=f)
    print('"""This file was auto-generated by pyfiledir script."""', end=sep, file=f)
    print("",  end=os.linesep, file=f)
    print("PY_DICT = {", end=sep, file=f)
    for char in pure_py_dict.keys():
        try:
            char.encode("ascii")
        except UnicodeEncodeError:
            line = ' ' * 4 + "{}: {},".format(repr(char), repr(pure_py_dict[char]))
            print(line, end=sep, file=f)
    f.write("}")

In [13]:
from pyfiledir.py_core import GB2312EncodeingRange

In [14]:
uncovered_chars = []
_st_num = int.from_bytes(GB2312EncodeingRange.min_codepoint, byteorder="big")
_ed_num = int.from_bytes(GB2312EncodeingRange.max_codepoint, byteorder="big")
for num in range(_st_num, _ed_num+1):
    char_bytes = num.to_bytes(length=2, byteorder="big")
    try:
        char = char_bytes.decode("GB18030")
        if char not in pure_py_dict.keys():
            uncovered_chars.append(char)
    except UnicodeDecodeError:
        pass
from textwrap import fill
chars_list = [x for x in uncovered_chars ]
print(fill(str(chars_list), width=64) )

['盕', '盙', '盫', '盽', '県', '睓', '瞆', '瞓', '瞾', '矁', '矅', '矋',
'矏', '矒', '矝', '矤', '矦', '砇', '砕', '砙', '砤', '砿', '硂', '硓', '硛',
'硲', '硳', '碈', '碯', '碷', '磇', '磗', '磘', '磮', '磰', '磱', '礀', '礂',
'礈', '礝', '礢', '礶', '礸', '祂', '祅', '祙', '祬', '祶', '禆', '禇', '禉',
'禙', '禣', '禥', '禴', '禵', '秐', '秗', '秮', '秼', '稁', '稇', '稤', '稥',
'稺', '穂', '穌', '穏', '穞', '穯', '窂', '窛', '窧', '窴', '窽', '竂', '竃',
'竆', '竍', '竏', '竐', '竔', '竩', '竰', '竸', '竼', '笟', '笶', '笹', '笻',
'笽', '筂', '筙', '筨', '筺', '筿', '箏', '箚', '篐', '篭', '簓', '簔', '簗',
'簤', '簮', '簼', '籂', '籎', '籕', '籖', '籗', '籭', '籶', '籿', '粁', '粊',
'粎', '粏', '粐', '粖', '粚', '粛', '粠', '粩', '粫', '糓', '糘', '糤', '糥',
'糳', '紦', '絵', '絶', '綂', '綗', '綘', '続', '綛', '綤', '綳', '緓', '緕',
'緖', '緫', '緮', '緽', '縀', '縁', '縃', '縄', '縇', '繍', '繤', '繧', '繱',
'繿', '纄', '纉', '绬', '缷', '缼', '罀', '罙', '羀', '羃', '羪', '翑', '翤',
'翭', '耂', '耉', '耊', '聓', '聜', '聢', '聣', '聦', '聨', '聫', '聮', '聴',
'肔', '肻', '胐', '胑', '胢', '脳', '脵', '腁', '膐', '膓', '膖', '膤', '膥',
'膶', '臋', '臓', '臰', '舃', '舑',

In [15]:
import doctest
doctest.testmod()

TestResults(failed=0, attempted=5)