## Globals

In [1]:
%load_ext autoreload
%autoreload 2

import opencc

from pypinyin import pinyin, lazy_pinyin, Style
import os

import tqdm
from pyscripts.words import *
from copy import deepcopy

cc = opencc.OpenCC("s2t")

In [2]:
import hanlp

# Load HanLP's pretrained multi-task model once; the word filters defined later in
# this notebook call it and read its "tok/fine" and "ner/msra" outputs.
HanLP_MTL = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
# HanLP_NER = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)

  from .autonotebook import tqdm as notebook_tqdm
                                   

In [3]:
# Smoke test: run the multi-task model on a sample text and pretty-print the result.
HanLP_MTL('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。').pretty_print()
# HanLP_NER('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。')
# HanLP_NER(['晓美焰', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。'])

## Character Dict
### Based on `chars.dict.yaml`

In [4]:
# LOAD CHARACTER DICT
#
# chars.dict.yaml is read twice:
#   pass 1 fills char2scode (one shape code per character) and line_num_dict
#          (the 1-based lines on which each character appears);
#   pass 2 fills chardict with one CharInfo per (character, pronunciation) line,
#          always using the shape code settled on in pass 1.

filename = "./dicts/udpn/chars.dict.yaml"

char2scode = {}
line_num_dict = {}

chardict = CharDict()


with open(filename, "r", encoding="UTF-8") as infile:
    for i, line in tqdm.tqdm(enumerate(infile.readlines())):
        # Only lines holding a "pron[scode" field are entries.
        if '[' not in line:
            continue

        # Strip a trailing "# ..." comment; skip lines that were only a comment.
        if '#' in line:
            line = line.split('#')[0].strip()
            if not line:
                continue

        fields = line.split()
        char = cc.convert(fields[0])
        pron = fields[1][:2]
        code = fields[1][3:]

        line_num_dict.setdefault(char, []).append(i + 1)

        if char not in char2scode or char2scode[char] in (code, "["):
            # First sighting, an identical code, or a "[" placeholder being replaced.
            char2scode[char] = code
        elif code != "[":
            # Two different real codes for one character: report, keep the first.
            print(f"Lines {','.join(str(x) for x in line_num_dict[char])}: {char} has codes {char2scode[char]} and {code}")

    # Hard-coded override for 乾 (presumably settling its hu/ur conflict - TODO confirm).
    char2scode["乾"] = "ur"

    # write full line dict in a 2nd iter, keeping correct codes

with open(filename, "r", encoding="UTF-8") as infile:
    for i, line in tqdm.tqdm(enumerate(infile.readlines())):
        if '[' not in line:
            continue

        if '#' in line:
            line = line.split('#')[0].strip()
            if not line:
                continue

        fields = line.split()
        char = cc.convert(fields[0])
        pron = fields[1][:2]
        weight = int(fields[2]) # Load already weighted dict

        if char not in chardict:
            chardict[char] = []

        chardict[char].append(CharInfo(pron, char2scode[char], weight))

chardict.SortWeight()


45950it [00:00, 246540.08it/s]


Lines 27050,45949: 鍾 has codes jv and jp


45950it [00:00, 160325.65it/s]


In [5]:

# Spot check: print every entry stored for 乾, then an OpenCC conversion sample.
print(*chardict["乾"], sep='\n')
print(cc.convert("烟熏"))


# Scratch notes kept from earlier runs (presumably code conflicts to review):
# 乾 has codes hu and ur
# 谘	zi[yd
# 薰 has codes pd and cp
# 锺	vs[jp
# 鲶	nm[yr

qm[ur 67122
gj[ur 0
煙燻


In [7]:
import re

# Ordered (pattern, replacement) rules used by qp2udpn. They are applied strictly in
# this order; replacements are written in uppercase (or ';') and the final result is
# lowercased at the end.
_QP2UDPN_RULES = (
    # Two-letter initials.
    (r"sh", r"U"),
    (r"ch", r"I"),
    (r"zh", r"V"),
    # Finals that follow an initial.
    (r"(\w)[iu]ang\b", r"\1D"),
    (r"(\w)i?ong\b", r"\1S"),
    (r"(\w)iao\b", r"\1C"),
    (r"(\w)ian\b", r"\1M"),
    (r"(\w)eng\b", r"\1G"),
    (r"(\w)ang\b", r"\1H"),
    (r"(\w)ing\b", r"\1;"),
    (r"(\w)uan\b", r"\1R"),
    (r"(\w)uai\b", r"\1Y"),
    (r"(\w)iu\b", r"\1Q"),
    (r"(\w)er\b", r"\1R"),
    (r"(\w)uo\b", r"\1O"),
    (r"(\w)un\b", r"\1P"),
    (r"(\w)en\b", r"\1F"),
    (r"(\w)an\b", r"\1J"),
    (r"(\w)ao\b", r"\1K"),
    (r"(\w)ai\b", r"\1L"),
    (r"(\w)ei\b", r"\1Z"),
    (r"(\w)ie\b", r"\1X"),
    (r"(\w)ui\b", r"\1V"),
    (r"(\w)ou\b", r"\1B"),
    (r"(\w)in\b", r"\1N"),
    (r"(\w)v\b", r"\1Y"),
    (r"(\w)[iu]a\b", r"\1W"),
    (r"(\w)[uv]e\b", r"\1T"),
    # Syllables without an initial get an "O" prefix.
    (r"(\b)ai\b", r"OL"),
    (r"(\b)an\b", r"OJ"),
    (r"(\b)ang\b", r"OH"),
    (r"(\b)ao\b", r"OK"),
    (r"(\b)ei\b", r"OZ"),
    (r"(\b)en\b", r"OF"),
    (r"(\b)er\b", r"OR"),
    (r"(\b)ou\b", r"OB"),
    (r"(\b)eng\b", r"OG"),
    # Bare single-vowel syllables.
    (r"(\b)a\b", r"OA"),
    (r"(\b)e\b", r"OE"),
    (r"(\b)o\b", r"OO"),
)


def qp2udpn(char):
    """Convert one full-pinyin syllable into its UDPN double-pinyin code.

    Args:
        char: A single syllable such as "qian" (no whitespace-separated parts).

    Returns:
        The lowercased code, e.g. "qm". Text that no rule matches is returned
        unchanged apart from lowercasing.

    Raises:
        AssertionError: If `char` does not consist of exactly one token.
    """
    assert(len(char.split()) == 1)

    for pattern, replacement in _QP2UDPN_RULES:
        char = re.sub(pattern, replacement, char)

    return char.lower()


def qp2udpn_code(word, quanpin):
    """Build the "pron[scode" string for a word from its full pinyin.

    Args:
        word: The word; it is passed through `cc.convert` before lookup.
        quanpin: Whitespace-separated full-pinyin syllables, one per character.

    Returns:
        Space-joined "udpn[scode" items, where the shape code of the i-th
        converted character is read from `char2scode`.
    """
    converted = cc.convert(word)
    return ' '.join(
        qp2udpn(syllable) + '[' + char2scode[converted[index]]
        for index, syllable in enumerate(quanpin.split())
    )

# Ordered (pattern, replacement) rules used by xche2msup; applied top to bottom.
_XCHE2MSUP_RULES = (
    # Second-key remapping.
    (r"(\w)w\b", r"\1Z"),
    (r"(\w)y\b", r"\1P"),
    (r"(\w)p\b", r"\1X"),
    (r"(\w)d\b", r"\1L"),
    (r"([gkhvuirzcs])k", r"\1Y"),
    (r"(\w)k\b", r"\1;"),
    (r"(\w)l\b", r"\1D"),
    (r"(\w)z\b", r"\1B"),
    (r"(\w)x\b", r"\1W"),
    (r"(\w)c\b", r"\1K"),
    (r"([^aeiou])n", r"\1C"),
    (r"(\w)b\b", r"\1N"),
    (r"([nl])v\b", r"\1Y"),
    # Syllables without an initial get an "O" prefix.
    (r"(\b)ai\b", r"OL"),
    (r"(\b)an\b", r"OJ"),
    (r"(\b)ah\b", r"OH"),
    (r"(\b)ao\b", r"OK"),
    (r"(\b)ei\b", r"OZ"),
    (r"(\b)en\b", r"OF"),
    (r"(\b)er\b", r"OR"),
    (r"(\b)ou\b", r"OB"),
    (r"(\b)aa\b", r"OA"),
    (r"(\b)ee\b", r"OE"),
)


def xche2msup(char):
    """Rewrite one two-key code by applying the xche -> msup rule table in order.

    Args:
        char: A single code token (no whitespace-separated parts).

    Returns:
        The rewritten code, lowercased.

    Raises:
        AssertionError: If `char` does not consist of exactly one token.
    """
    assert(len(char.split()) == 1)
    for pattern, replacement in _XCHE2MSUP_RULES:
        char = re.sub(pattern, replacement, char)
    return char.lower()


# Ordered (pattern, replacement) rules used by udpn2qp. The input is uppercased
# first and every replacement is lowercase, so text that has already been expanded
# is not matched again by the later uppercase patterns.
_UDPN2QP_RULES = (
    (r";", r"ing"),
    # "O"-prefixed codes: syllables without an initial.
    (r"OL\b", r"ai"),
    (r"OJ\b", r"an"),
    (r"OH\b", r"ang"),
    (r"OK\b", r"ao"),
    (r"OZ\b", r"ei"),
    (r"OF\b", r"en"),
    (r"OR\b", r"er"),
    (r"OB\b", r"ou"),
    (r"OG\b", r"eng"),
    (r"OA\b", r"a"),
    (r"OE\b", r"e"),
    (r"OO\b", r"o"),
    # Keys whose expansion depends on the initial: specific case first.
    (r"([NLJQX])D\b", r"\1iang"),
    (r"(\w)D\b", r"\1uang"),
    (r"([JQX])S\b", r"\1iong"),
    (r"(\w)S\b", r"\1ong"),
    (r"(\w)C\b", r"\1iao"),
    (r"(\w)M\b", r"\1ian"),
    (r"(\w)G\b", r"\1eng"),
    (r"(\w)H\b", r"\1ang"),
    (r"(\w)R\b", r"\1uan"),
    (r"([NL])Y\b", r"\1v"),
    (r"(\w)Y\b", r"\1uai"),
    (r"(\w)Q\b", r"\1iu"),
    (r"([^WYBPMF])O\b", r"\1uo"),  # except 咯 啰
    (r"(\w)P\b", r"\1un"),
    (r"(\w)F\b", r"\1en"),
    (r"(\w)J\b", r"\1an"),
    (r"(\w)K\b", r"\1ao"),
    (r"(\w)L\b", r"\1ai"),
    (r"(\w)Z\b", r"\1ei"),
    (r"(\w)X\b", r"\1ie"),
    (r"(\w)V\b", r"\1ui"),
    (r"(\w)B\b", r"\1ou"),
    (r"(\w)N\b", r"\1in"),
    (r"([GKHVIU])W\b", r"\1ua"),
    (r"(\w)W\b", r"\1ia"),
    (r"([NL])T\b", r"\1ve"),
    (r"(\w)T\b", r"\1ue"),
    # Single-key initials back to two-letter initials.
    (r"U(\w)", r"sh\1"),
    (r"I(\w)", r"ch\1"),
    (r"V(\w)", r"zh\1"),
)


def udpn2qp(char):
    """Expand one UDPN double-pinyin code back into full pinyin.

    Args:
        char: A single code token such as "j;" (no whitespace-separated parts).

    Returns:
        The lowercased full-pinyin spelling, e.g. "jing".

    Raises:
        AssertionError: If `char` does not consist of exactly one token.
    """
    assert(len(char.split()) == 1)
    char = char.upper()

    for pattern, replacement in _UDPN2QP_RULES:
        char = re.sub(pattern, replacement, char)

    return char.lower()


In [8]:
# Spot checks for qp2udpn_code, xche2msup and udpn2qp.
print(qp2udpn_code("纤", "qian"))
print(qp2udpn_code("拽变", "zhuai bian\n"))
print(xche2msup("oo"))
print(xche2msup("en"))
print(xche2msup("ww"))
print(udpn2qp("j;"))

qm[sq
vy[fo bm[yy
oo
of
wz
jing


## Deprecated: 加载小鹤 weight 

In [6]:

def load_xche_weights():
    """Read word weights from ./dicts/udpn/intermediates/edited_xche_cipn.txt.

    Each line is split on whitespace; the first field is the word and the second
    its integer weight.

    Returns:
        dict mapping word -> weight. When a word appears more than once, the first
        weight is kept and an "ERROR: ... collides with ..." line is printed.
    """
    xche_weights = {}

    with open("./dicts/udpn/intermediates/edited_xche_cipn.txt", "r", encoding="UTF-8") as infile:
        for raw_line in infile.readlines():
            fields = raw_line.strip().split()
            word, weight = fields[0], int(fields[1])
            if word not in xche_weights:
                xche_weights[word] = weight
                continue
            print(f"ERROR: {word}-{xche_weights[word]} collides with {word}-{weight}")
    return xche_weights

## Load dicts (yaml re-editing, txt...)

In [9]:
from enum import Enum

def MakeWord(word, prons=None, weight=None, twice_converted=False, suspicious_pron=False):
    """Create a WordInfo for `word` from the per-character data in `chardict`.

    For every character, a deep copy of the first entry stored in `chardict` is
    used, so the resulting WordInfo never shares objects with `chardict`. All
    other arguments are forwarded to `WordInfo` unchanged.

    Raises whatever `chardict[char]` raises for a character it does not contain.
    """
    charinfos = []
    for char in word:
        first_entry = chardict[char][0]
        charinfos.append(deepcopy(first_entry))
    return WordInfo(word, charinfos, prons, weight, twice_converted, suspicious_pron)

def HaveWordInfo(word):
    """Return True when every character of `word` is a key of `chardict`."""
    return all(char in chardict for char in word)

class DictType(Enum): # TR = Traditional, SM = Simplified
    """Input file formats understood by LoadDict."""
    # Tab-separated: word, "pron[scode" items, weight; pron = first two characters of each item.
    UDPN_SCODE_TR = 0
    # Same layout as above, but pron = the text before '[' in each item.
    QUANPIN_TR = 1
    # One word per line; pronunciation comes from lazy_pinyin, weight is fixed to 1.
    WORDS_ONLY_SM = 2
    # Tab-separated: word, weight; pronunciation comes from lazy_pinyin + qp2udpn.
    WORDS_WEIGHT_SM = 3
    # Word in the first tab-separated field; weight is looked up in xche_weights.
    XCHE_BIGRAMS_SM = 4

# Main word dictionary; passed to LoadDict as target and/or reference dictionary.
worddict = WordDict()


### Dict Filters

In [77]:
def LengthFilter(min_length, max_length):
    """Return a predicate accepting words whose length is in [min_length, max_length]."""
    def _length_in_range(word):
        return min_length <= len(word) <= max_length
    return _length_in_range

def ContainsSubStringFilter(sub_string, starting_positions=None):
    """Return a predicate that checks whether a word contains `sub_string`.

    Args:
        sub_string: The text to look for.
        starting_positions: Where the match must sit. May be None or empty (match
            anywhere), a single int, or any iterable of ints. A non-negative
            position `i` requires `sub_string` to start at index `i`; a negative
            position `i` requires `word[:i]` to end with `sub_string`.

    Returns:
        A function `word -> bool` that is True if any listed position matches.
    """
    # Normalise to an immutable snapshot: avoids a mutable default argument and
    # keeps the filter stable if the caller later mutates the list it passed in.
    if starting_positions is None:
        positions = ()
    elif isinstance(starting_positions, int):
        positions = (starting_positions,)
    else:
        positions = tuple(starting_positions)

    if not positions:
        return lambda word: sub_string in word

    def _matches_at_position(word):
        return any(
            word[i:].startswith(sub_string) if i >= 0 else word[:i].endswith(sub_string)
            for i in positions
        )

    return _matches_at_position

def IsInDictFilter(worddict, start, end):
    """Return a predicate that is True when `word[start:end]` is contained in `worddict`."""
    def _slice_is_known(word):
        piece = word[start:end]
        return piece in worddict
    return _slice_is_known

def SameCharFilter(positions):
    """Return a predicate that is True when the word has the same character at all `positions`.

    Every listed position is compared against the character at `positions[0]`;
    an empty `positions` accepts every word.
    """
    def _same_everywhere(word):
        for index in positions:
            if word[index] != word[positions[0]]:
                return False
        return True
    return _same_everywhere

# def MaxNumFilter(max_num):
#     return lambda word, count=[0]: (count.append(count.pop() + 1), count[0] <= max_num)[1]

def MaxNumFilter(max_num):
    """Return a stateful predicate that accepts only its first `max_num` calls.

    The word argument is ignored. Every call is counted, including rejected ones,
    and each filter returned by this factory has its own counter.
    """
    calls_seen = 0

    def _within_quota(word):
        nonlocal calls_seen
        accepted = calls_seen < max_num
        calls_seen += 1
        return accepted

    return _within_quota

def _first_entity_is_person(result):
    """True when the first "ner/msra" entity of a HanLP result contains "PERSON"."""
    entities = result["ner/msra"]
    return len(entities) > 0 and "PERSON" in entities[0]


def _fine_token_count(result):
    """Number of tokens in the "tok/fine" segmentation of a HanLP result."""
    return len(result["tok/fine"])


def IsNameFilter():
    """Predicate: the first MSRA entity HanLP finds in the word is a PERSON."""
    def _is_name(word):
        return _first_entity_is_person(HanLP_MTL(word))
    return _is_name


def MultipleWordsFilter():
    """Predicate: HanLP's fine tokenizer splits the word into more than one token."""
    def _is_multi_token(word):
        return _fine_token_count(HanLP_MTL(word)) > 1
    return _is_multi_token


def NotNameFilter():
    """Predicate: negation of IsNameFilter."""
    def _is_not_name(word):
        return not _first_entity_is_person(HanLP_MTL(word))
    return _is_not_name


def MultipleWordsAndNotNameFilter():
    """Predicate: more than one fine token and the first MSRA entity is not a PERSON."""
    def _multi_token_not_name(word):
        result = HanLP_MTL(word)  # one model call serves both checks
        return _fine_token_count(result) > 1 and not _first_entity_is_person(result)
    return _multi_token_not_name


def SingleWordAndNotNameFilter():
    """Predicate: exactly one fine token and the first MSRA entity is not a PERSON."""
    def _single_token_not_name(word):
        result = HanLP_MTL(word)  # one model call serves both checks
        return _fine_token_count(result) == 1 and not _first_entity_is_person(result)
    return _single_token_not_name


#######################################################################
def ApplyFilters(word, all_filters, any_filters):
    """Decide whether `word` passes the configured filters.

    Args:
        word: The word to test.
        all_filters: Predicates that must ALL accept the word. Empty means
            "no restriction".
        any_filters: Predicates of which AT LEAST ONE must accept the word. Empty
            means "no restriction".

    Returns:
        True only if both groups accept the word.
    """
    # The previous one-line form chained two conditional expressions without
    # parentheses, which Python parses as `A if c1 else (B if c2 else True)`; as a
    # result `any_filters` was never consulted when `all_filters` was non-empty.
    if not all(f(word) for f in all_filters):
        return False
    if len(any_filters) == 0:
        return True
    return any(f(word) for f in any_filters)

# Active filter configuration. LoadDict reads these two module-level names when it
# is called with apply_filters=True; swap in one of the commented alternatives to
# change what gets loaded.
# all_filters = [LengthFilter(3, 3), MaxNumFilter(1000), IsNameFilter()]
all_filters = [LengthFilter(3, 3), NotNameFilter()]
# all_filters = []
# any_filters = [ContainsSubStringFilter("不"), SameCharFilter([0, 2])]
# any_filters = [IsInDictFilter(worddict, 0, 2)]
any_filters = []

True
True
True
True
True
False
False


In [26]:

def LoadDict(filename, filetype, target_dict, ref_dict, apply_filters=False):
    """Parse a dictionary file and append its entries to `target_dict`.

    Args:
        filename: Path of the UTF-8 text file to read.
        filetype: A `DictType` member selecting how each line is parsed.
        target_dict: Mapping word -> list of word entries. New entries are appended
            here and `SortWeightPron()` is called on it once the file is read.
        ref_dict: Mapping consulted together with `target_dict` to skip entries
            whose word and pronunciation are already known. May be the same object
            as `target_dict`.
        apply_filters: When true, words rejected by
            `ApplyFilters(word, all_filters, any_filters)` are skipped; both filter
            lists are read from module-level globals.

    Raises:
        ValueError: If `filetype` is not one of the handled `DictType` members.

    A word containing a character that is missing from `chardict`, even after a
    second character-by-character OpenCC conversion, is reported with an "ERROR"
    line and skipped.
    """
    # File types whose pronunciation is generated by lazy_pinyin instead of being
    # read from the file; a conflicting pronunciation from these is flagged.
    auto_pron_types = (DictType.WORDS_ONLY_SM, DictType.WORDS_WEIGHT_SM, DictType.XCHE_BIGRAMS_SM)

    with open(filename, "r", encoding="UTF-8") as infile:
        for i, line in tqdm.tqdm(enumerate(infile.readlines())):
            line = line.strip()
            if filetype == DictType.UDPN_SCODE_TR:
                if not '[' in line:
                    continue
                components = line.split('\t')
                word_ori = components[0]
                word_tr = word_ori # assume already traditional chinese
                prons_scode = components[1]
                weight = int(components[2])
                prons = [x[:2] for x in prons_scode.split()]

            elif filetype == DictType.QUANPIN_TR:
                if not '[' in line:
                    continue
                components = line.split('\t')
                word_ori = components[0]
                word_tr = word_ori # assume already traditional chinese
                prons_scode = components[1]
                weight = int(components[2])
                prons = [x[:x.find('[')] for x in prons_scode.split()]

            elif filetype == DictType.WORDS_ONLY_SM:
                if len(line) == 0:
                    continue
                # Take the first field as a string; the bare split() result is a
                # list, which cc.convert cannot handle.
                word_ori = line.split('\t')[0]
                word_tr = cc.convert(word_ori)
                # prons = [qp2udpn(x) for x in lazy_pinyin(word_ori)] # lazy_pinyin may mistake traditional Chinese
                prons = lazy_pinyin(word_ori) # lazy_pinyin may mistake traditional Chinese
                if "佛" in word_ori and "fu" in prons:
                    prons[prons.index("fu")] = "fo"
                weight = 1

            elif filetype == DictType.WORDS_WEIGHT_SM:
                if len(line) == 0:
                    continue
                components = line.split('\t')
                word_ori = components[0]
                word_tr = cc.convert(word_ori)
                prons = [qp2udpn(x) for x in lazy_pinyin(word_ori)] # lazy_pinyin may mistake traditional Chinese
                if "佛" in word_ori and "fu" in prons:
                    prons[prons.index("fu")] = "fo"
                weight = int(components[1])

            elif filetype == DictType.XCHE_BIGRAMS_SM:
                if len(line) == 0:
                    continue
                components = line.split('\t')
                word_ori = components[0]
                word_tr = cc.convert(components[0])
                prons = [qp2udpn(x) for x in lazy_pinyin(word_ori)] # lazy_pinyin may mistake traditional Chinese
                if "佛" in word_ori and "fu" in prons:
                    prons[prons.index("fu")] = "fo"
                # prons = components[1].split()
                # NOTE(review): `xche_weights` must exist as a module-level global,
                # e.g. `xche_weights = load_xche_weights()`; no such assignment is
                # visible in this notebook - confirm before using this file type.
                weight = xche_weights[word_ori]

            else:
                # Fail loudly instead of reusing values left over from an earlier line.
                raise ValueError(f"Unsupported dict type: {filetype!r}")

            # Apply filters
            if apply_filters and not ApplyFilters(word_tr, all_filters, any_filters):
                continue

            # Check for identical entries and suspicious prons generated by lazy_pinyin
            flag_suspicious_pron = False
            if word_tr in ref_dict or word_tr in target_dict:
                entries = (ref_dict[word_tr] if word_tr in ref_dict else []) + (target_dict[word_tr] if word_tr in target_dict else [])
                joined_prons = ' '.join(prons)
                if any(' '.join(entry.GetPron()) == joined_prons for entry in entries):
                    # Same word with the same pronunciation is already known.
                    continue
                # Different pron for same word, mark suspicious if pron is auto-gen-ed.
                if filetype in auto_pron_types:
                    flag_suspicious_pron = True

            if HaveWordInfo(word_tr):
                if word_tr not in target_dict:
                    target_dict[word_tr] = []
                target_dict[word_tr].append(MakeWord(word_tr, prons, weight, suspicious_pron=flag_suspicious_pron))
            else:
                # Retry with every character converted on its own; the entry stays
                # keyed by word_tr but is built from the re-converted characters.
                char_by_char = ''.join([cc.convert(x) for x in word_tr])
                if HaveWordInfo(char_by_char):
                    if word_tr not in target_dict:
                        target_dict[word_tr] = []
                    target_dict[word_tr].append(MakeWord(char_by_char, prons, weight, twice_converted=True, suspicious_pron=flag_suspicious_pron))
                else:
                    print(f"ERROR: no word info for {word_ori}, {word_tr}, {char_by_char}.")
    target_dict.SortWeightPron()


def PrintDict(word_dict, outfilename, quanpin=False):
    """Write a word dictionary to `outfilename`, one tab-separated entry per line.

    Each line holds the word, its space-joined "pron[scode" items, the weight and,
    when set on the entry, the markers " # CONVERTED TWICE" and
    " # SUSPICIOUS AUTO-GEN PRON".

    Args:
        word_dict: Mapping word -> list of word entries; must also provide
            `GetPrimaryPron(word)`, which is used for ordering.
        outfilename: Path of the UTF-8 file to (over)write.
        quanpin: When true, every pron is passed through `udpn2qp` in the output.
            The entries in `word_dict` are left untouched, so the function can be
            called repeatedly without converting prons twice.

    Words whose first entry is flagged suspicious are written first; the rest are
    ordered by the string built from word length and primary pronunciation.
    """
    def _sort_key(item):
        word, wordinfos = item
        if wordinfos[0].suspicious_pron:
            return '0'
        return '1' + str(len(word)) + ' '.join(word_dict.GetPrimaryPron(word))

    result = sorted(word_dict.items(), key=_sort_key)
    with open(outfilename, "w", encoding="UTF-8") as outfile:
        for word, wordinfos in result:
            for wordinfo in wordinfos:
                # Convert on the fly instead of overwriting char.pron in the dictionary.
                prons = [udpn2qp(char.pron) if quanpin else char.pron for char in wordinfo.chars]
                codes = ' '.join(pron + '[' + char.scode for pron, char in zip(prons, wordinfo.chars))
                everything = [word, codes, str(wordinfo.weight)]
                if wordinfo.twice_converted:
                    everything.append(" # CONVERTED TWICE")
                if wordinfo.suspicious_pron:
                    everything.append(" # SUSPICIOUS AUTO-GEN PRON")
                outfile.write('\t'.join(everything) + '\n')



In [45]:

# Output path used by the PrintDict calls in this notebook.
outfilename = "./dicts/udpn/test_out.dict.yaml"

# Load the main word list into worddict (it is its own reference dict; no filtering).
LoadDict("./dicts/udpn/words.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict, apply_filters=False)

# Other sources that have been loaded this way (kept for reference):
# LoadDict("./dicts/udpn/udpn_simp-ext.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/udpn/v2/udpn_pure_same_base.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/udpn/v2/udpn_base_diff.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/udpn/intermediates/comment_fault_save/z5_n-of_new_base.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/udpn/v2/udpn_base.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/quanpin/dicts/quanpin/quanpin_words.dict.yaml", DictType.QUANPIN_TR, worddict, worddict)
# LoadDict("./dicts/scraper_intermediates/raw/lol.yaml", DictType.WORDS_ONLY_SM, worddict, worddict)

# PrintDict(worddict, outfilename)

229613it [00:01, 228685.67it/s]


In [78]:
# Collect candidate entries in a separate dictionary: LoadDict is given worddict as
# the reference, so words already present there with the same pronunciation are
# skipped, and apply_filters=True applies the active all_filters / any_filters.
new_words = WordDict()

# LoadDict("./dicts/udpn/intermediates/comment_fault_save/test_xche_slash_z3.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)

# LoadDict("./dicts/udpn/intermediates/edited_xche_cipn.txt", DictType.WORDS_WEIGHT_SM, new_words, worddict)
# LoadDict("./dicts/udpn/intermediates/words_wubi.txt", DictType.WORDS_ONLY_SM, new_words, worddict)
# LoadDict("./dicts/udpn/intermediates/important/xche_non_bigram.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/udpn/intermediates/important/wubi_4-.dict.yaml", DictType.UDPN_SCODE_TR, worddict, worddict)
# LoadDict("./dicts/udpn/intermediates/hand_edited_words/obnz_edited.dict.yaml", DictType.WORDS_ONLY_SM, worddict, worddict)
# LoadDict("./dicts/scraper_intermediates/raw/genshin.yaml", DictType.WORDS_ONLY_SM, new_words, worddict)
# LoadDict("C:/Users/Takusei/Desktop/输入法/dict_export/1001/ounei_hand_export.txt", DictType.UDPN_SCODE_TR, new_words, worddict, apply_filters=True)
LoadDict("./dicts/udpn/udpn_superime.dict.yaml", DictType.UDPN_SCODE_TR, new_words, worddict, apply_filters=True)

3712093it [00:28, 129826.55it/s]


In [79]:
# Report how many words were collected, then write them to outfilename.
print(len(new_words))
PrintDict(new_words, outfilename)

793


In [76]:
# Inspect the raw HanLP output for a single word (useful when debugging the name filters).
print(HanLP_MTL('鍾祥縣'))
# HanLP('阳性率').pretty_print()
# HanLP('一村一').pretty_print()
# HanLP('牙通牙').pretty_print()

{
  "tok/fine": [
    "鍾祥縣"
  ],
  "tok/coarse": [
    "鍾祥縣"
  ],
  "pos/ctb": [
    "NR"
  ],
  "pos/pku": [
    "ns"
  ],
  "pos/863": [
    "ns"
  ],
  "ner/msra": [
    ["鍾祥縣", "LOCATION", 0, 1]
  ],
  "ner/pku": [
    ["鍾祥縣", "nr", 0, 1]
  ],
  "ner/ontonotes": [
    ["鍾祥縣", "GPE", 0, 1]
  ],
  "srl": [],
  "dep": [
    [0, "root"]
  ],
  "sdp": [
    [[0, "Root"]]
  ],
  "con": [
    "TOP",
    [["NP", [["NR", ["鍾祥縣"]]]]]
  ]
}


In [18]:
# Number of distinct words currently held in worddict.
print(len(worddict))
# 楊秀珠 陽性率

229413


## 通过 `fix_freq_char.txt` 造词典

In [None]:

# Print one `["pron"] = {text = ..., weight = ...},` row for every entry of
# fix_freq_char.txt (the row format looks like a Lua table - TODO confirm consumer).
with open("./fix_freq_char.txt", "r", encoding="UTF-8") as infile:
    lines = infile.readlines()
    for line in lines:
        # Entries are tab-separated; a line without a tab is not an entry.
        if '\t' not in line:
            continue

        # Cut off a trailing "# ..." comment and skip lines left empty by that.
        if '#' in line:
            line = line.split('#')[0].strip()
            if not line:
                continue

        fields = line.split()
        word = cc.convert(fields[0])
        pron = fields[1] # assert len = 2
        weight = int(fields[2])
        # if(chardict.GetPrimaryPron(word) != pron):
        #     print(f"ERROR: {word} has pron {chardict.GetPrimaryPron(word)} and {pron}")
        # print(f"{word}: {chardict.GetPrimaryPron(word)} - {pron},  [{chardict.GetScode(word)}")
        # print(f'{{text = "{word}", pron = "{pron}[{chardict.GetScode(word)}", weight = {weight}}},')
        print(f'["{pron}"] = {{text = "{word}", weight = {weight}}},')
        

## Fixed characters

In [None]:
# For every pronunciation, keep the (at most) three highest-weighted characters,
# stored as (char, weight) tuples sorted by descending weight.
pron_top3 = {}
for char, charinfos in chardict.items():
    for charinfo in charinfos:
        pron = charinfo.pron
        if pron not in pron_top3:
            pron_top3[pron] = []

        candidate = (char, charinfo.weight)
        if len(pron_top3[pron]) < 3:
            # Still room: take the candidate.
            pron_top3[pron].append(candidate)
        elif charinfo.weight > pron_top3[pron][-1][1]:
            # Full: replace the current weakest entry when the candidate beats it.
            pron_top3[pron][-1] = candidate
        else:
            continue
        pron_top3[pron] = sorted(pron_top3[pron], key=lambda x: x[1], reverse = True)

# Print "pron <TAB> chars <TAB> weights", ordered by pronunciation.
list_pron_top3 = sorted(pron_top3.items(), key=lambda x: x[0])
for pron, top3 in list_pron_top3:
    words = ' '.join(entry[0] for entry in top3)
    weights = ' '.join(str(entry[1]) for entry in top3)
    print('\t'.join([pron, words, weights]))

# for pron, top3 in list_pron_top3:
#     quanpin = udpn2qp(pron)
#     print(pron + '\t' + quanpin)

## 垃坤

In [None]:
# Check how lazy_pinyin reads a traditional-character word.
print(lazy_pinyin("了斷"))

### 把 `udpn_weight.dict.yaml` 中的双拼码转回全拼并打印 (convert UDPN codes back to quanpin)

In [None]:
def match_func(match):
    """Replacement callback for re.sub: pass the matched text through udpn2qp."""
    matched_text = match.group()
    return udpn2qp(matched_text)

diff_chars = set()
# Two characters from [a-z;] at a word boundary, immediately followed by '['.
udpn_code_pattern = re.compile(r"\b[a-z;][a-z;]\[")
with open("./dicts/udpn/udpn_weight.dict.yaml", "r", encoding="UTF-8") as infile:
    lines = infile.readlines()
    for line in lines:
        # Rewrite every matched code in the line, then echo the line unchanged otherwise.
        line = udpn_code_pattern.sub(match_func, line)
        print(line, end = '')

### 逐字转换的词，有没有转换？估计没有

In [None]:

# For the first token of every line in manual_check.yaml, print the word, its
# whole-word OpenCC conversion and its character-by-character conversion side by side.
with open("./dicts/udpn/intermediates/manual_check.yaml", "r", encoding="UTF-8") as infile:
    lines = infile.readlines()
    for line in lines:
        word = line.split()[0]
        convert = cc.convert(word)
        char_by_char = ''.join(map(cc.convert, word))
        print(f"{word}, {convert}, {char_by_char}")

### 小鹤/自动生成的不同读音

In [None]:
# Collect every (character, xche pron, auto pron) triple whose two prons differ,
# reading two-character words from xche_bigrams_diff.txt.
diff_chars = set()
with open("./dicts/udpn/intermediates/xche_bigrams_diff.txt", "r", encoding="UTF-8") as infile:
    lines = infile.readlines()
    for line in lines:
        line = line.strip()
        components = line.split('\t')
        word, xche, auto = components[0], components[1], components[2]
        # Prons are laid out as "xx yy": characters 0-1 belong to the first
        # character of the word, everything from index 3 on to the second.
        entries = [(word[0], xche[0:2], auto[0:2]), (word[1], xche[3:], auto[3:])]
        for entry in entries:
            if entry[1] == entry[2]:
                continue
            diff_chars.add(entry)

    # Print the distinct triples, ordered by character.
    output = sorted(diff_chars, key = lambda x: x[0])
    for entry in output:
        print('\t'.join(entry))