In [88]:
import json
import numpy as np
import pandas as pd
from pypinyin import pinyin

def get_cy_dt(cy_path=r"D:\GitHub\chinese-xinhua\data\idiom.json"):
    """Read the idiom dataset from a UTF-8 encoded JSON file.

    Parameters
    ----------
    cy_path : str
        Location of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON document, exactly as produced by the ``json`` module.
    """
    with open(cy_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# Load the idiom records once; a later cell passes this module-level `dt` to get_cy_df.
dt = get_cy_dt()
# Show the first record to inspect which fields an entry carries.
dt[0]

{'derivation': '语出《法华经·法师功德品》下至阿鼻地狱。”',
 'example': '但也有少数意志薄弱的……逐步上当，终至堕入～。★《上饶集中营·炼狱杂记》',
 'explanation': '阿鼻梵语的译音，意译为无间”，即痛苦无有间断之意。常用来比喻黑暗的社会和严酷的牢狱。又比喻无法摆脱的极其痛苦的境地。',
 'pinyin': 'ā bí dì yù',
 'word': '阿鼻地狱',
 'abbreviation': 'abdy'}

In [143]:
def get_cy_df(cy_json):
    """Build a table of idioms with their first and last pinyin syllables.

    Parameters
    ----------
    cy_json : list of dict
        Idiom records. Each record must provide at least the keys ``"word"``
        and ``"pinyin"``; the pinyin string is split on whitespace.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``head`` (first whitespace-separated pinyin token)
        and ``tail`` (last token), sorted by ``head`` and re-indexed 0..n-1.

    Raises
    ------
    AssertionError
        If any ``head`` is longer than 6 characters, which indicates a pinyin
        string that was not split into syllables.
    """
    df = pd.DataFrame(cy_json)

    # Hand-written pinyin overrides for specific entries (presumably the
    # dataset's own strings for these are malformed -- confirm against the data).
    df.loc[df["word"]=="鸡犬之声相闻，老死不相往来", "pinyin"] = "jī quǎn zhī shēng xiāng wén lǎo sǐ bù xiāng wǎng lái"
    df.loc[df["word"]=="各人自扫门前雪，莫管他家瓦上霜", "pinyin"] = "gè rén zì sǎo mén qián xuě mò guǎn tā jiā wǎ shàng shuāng"
    df.loc[df["word"]=="抓耳搔腮", "pinyin"] = "zhuā ěr sāo sāi"
    # NOTE(review): this override still contains a full-width comma between
    # "huǒ" and "bù", so those two syllables form one token; head and tail are
    # unaffected by it.
    df.loc[df["word"]=="只许州官放火，不许百姓点灯", "pinyin"] = "zhǐ xǔ zhōu guān fàng huǒ，bù xǔ bǎi xìng diǎn dēng"

    # Split once into per-row token lists instead of expanding into a wide frame.
    syllables = df["pinyin"].str.split()
    df["head"] = syllables.str[0]
    # The tail is the LAST token. Taking a fixed fourth token is only right for
    # four-syllable idioms and yields a wrong or missing value for all others.
    df["tail"] = syllables.str[-1]

    df = df[["word", "head", "tail"]]
    df = df.sort_values("head").reset_index(drop=True)

    # A single pinyin syllable never exceeds 6 characters; a longer head means
    # the source string was not whitespace-separated.
    assert df[df["head"].str.len()>6].empty, "found a head longer than 6 characters"

    return df

# Build the head/tail lookup table from the loaded records; the lookup
# functions in later cells read this module-level `df`.
df = get_cy_df(dt)
# Preview the first rows of the table.
df.head()

Unnamed: 0,word,head,tail
0,变色之言,biàn,yán
1,变名易姓,biàn,xìng
2,变脸变色,biàn,sè
3,变醨养瘠,biàn,jí
4,变幻无穷,biàn,qióng


In [150]:
def get_heads_idx(cy_df):
    """Map every distinct ``head`` value to the index label of its first row.

    Parameters
    ----------
    cy_df : pandas.DataFrame
        Table with a ``head`` column; its current index is exposed as a column
        named ``index`` before grouping.

    Returns
    -------
    dict
        ``{head: index label of the first row carrying that head}``.
    """
    indexed = cy_df.reset_index()
    first_label_per_head = indexed.groupby("head")["index"].first()
    return first_label_per_head.to_dict()
# Precompute, for every head syllable, the row at which its group starts in
# the head-sorted `df`; the lookup functions below read this module-level dict.
heads_idx = get_heads_idx(df)
# Display the mapping.
heads_idx

{'biàn': 0,
 'biào': 48,
 'bián': 49,
 'bié': 51,
 'biān': 96,
 'biāo': 111,
 'biǎn': 124,
 'biǎo': 125,
 'bu': 138,
 'bà': 139,
 'bài': 143,
 'bàn': 170,
 'bàng': 223,
 'bào': 234,
 'bá': 298,
 'bái': 352,
 'báng': 454,
 'báo': 455,
 'bèi': 457,
 'bèn': 500,
 'bì': 504,
 'bìn': 647,
 'bìng': 648,
 'bí': 674,
 'bò': 691,
 'bó': 692,
 'bù': 740,
 'bā': 1355,
 'bān': 1403,
 'bāng': 1426,
 'bāo': 1431,
 'bēi': 1444,
 'bēn': 1501,
 'bēng': 1508,
 'běi': 1511,
 'běn': 1526,
 'bī': 1538,
 'bīan': 1542,
 'bīn': 1543,
 'bīng': 1554,
 'bō': 1637,
 'bū': 1679,
 'bǎ': 1683,
 'bǎi': 1692,
 'bǎn': 1870,
 'bǎng': 1876,
 'bǎo': 1877,
 'bǐ': 1907,
 'bǐng': 1984,
 'bǒ': 2009,
 'bǔ': 2012,
 'chuàn': 2030,
 'chuàng': 2032,
 'chuán': 2042,
 'chuáng': 2066,
 'chuí': 2074,
 'chuò': 2123,
 'chuāi': 2133,
 'chuān': 2135,
 'chuāng': 2148,
 'chuī': 2152,
 'chuō': 2193,
 'chuǎi': 2199,
 'chuǎn': 2201,
 'chuǎng': 2205,
 'chà': 2207,
 'chàng': 2208,
 'chá': 2221,
 'chái': 2235,
 'chán': 2252,
 'cháng': 2267,
 'chá

In [212]:
# def get_full(word):
#     last_pinyin = pinyin(word)[-1][0]
#     print(last_pinyin)
#     full_list = [w["word"] for w in dt if w["pinyin"].split(" ")[0] == last_pinyin]
#     return full_list
# print(get_full("笑里藏刀"))

def get_full(word, verbose=False):
    """List every idiom in ``df`` whose head equals the last syllable of ``word``.

    Parameters
    ----------
    word : str
        Idiom to continue from. Its last syllable is taken as
        ``pinyin(word)[-1][0]`` (presumably pypinyin's reading of the final
        character -- confirm against the pypinyin documentation).
    verbose : bool, default False
        When true, print the syllable and the number of matches.

    Returns
    -------
    list of str
        Matching idioms in table order.

    Raises
    ------
    KeyError
        If the syllable is not a key of ``heads_idx``.
    """
    final_syllable = pinyin(word)[-1][0]
    if verbose:
        print(final_syllable)
    # Skip straight to the first row of this head, then keep only its rows.
    start_row = heads_idx[final_syllable]
    remainder = df.iloc[start_row:, :]
    matches = remainder.loc[remainder["head"] == final_syllable, "word"]
    if verbose:
        print(f"GET {matches.shape[0]} results:")
    return matches.tolist()
print(get_full("笑里藏刀", verbose=True))

dāo
GET 17 results:
['刀耕火耘', '刀耕火种', '刀光剑影', '刀光血影', '刀过竹解', '刀锯鼎镬', '刀锯斧钺', '刀枪剑戟', '刀头剑首', '刀山火海', '刀山剑树', '刀头舔蜜', '刀头燕尾', '刀下留人', '刀耕火耨', '刀枪入库', '刀俎余生']


In [213]:
def get_one(word, verbose=True):
    """Return the first idiom in ``df`` whose head equals the last syllable of ``word``.

    Parameters
    ----------
    word : str
        Idiom to continue from. Its last syllable is taken as
        ``pinyin(word)[-1][0]`` (presumably pypinyin's reading of the final
        character -- confirm against the pypinyin documentation).
    verbose : bool, default True
        When true, print the syllable before looking it up. The default keeps
        the syllable printed for existing callers.

    Returns
    -------
    str
        The idiom stored at the row recorded in ``heads_idx`` for that syllable.

    Raises
    ------
    KeyError
        If the syllable is not a key of ``heads_idx``.
    """
    last_pinyin = pinyin(word)[-1][0]
    if verbose:
        print(last_pinyin)
    # Read the cell directly rather than rendering a one-row slice to text and
    # stripping the padding, which depends on pandas' string formatting.
    return df["word"].iloc[heads_idx[last_pinyin]]
print(get_one("刀山火海"))

hǎi
海角天隅


In [223]:
def get_random_one(word):
    """Pick a uniformly random idiom that can follow ``word``.

    Parameters
    ----------
    word : str
        Idiom to continue from; candidates are whatever ``get_full(word)`` returns.

    Returns
    -------
    str
        One element of the candidate list, chosen with ``np.random.randint``.

    Raises
    ------
    ValueError
        If there are no candidates to choose from.
    KeyError
        Propagated from ``get_full`` when the syllable has no entry.
    """
    candidates = get_full(word)
    if not candidates:
        # Fail with an explicit message instead of randint's "low >= high".
        raise ValueError(f"no idiom can follow {word!r}")
    # Use the default integer dtype: a narrow dtype such as int16 would make
    # randint reject candidate lists longer than that dtype's range.
    return candidates[np.random.randint(len(candidates))]
print(get_random_one("笑里藏刀"))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

刀头舔蜜


In [227]:
# Scratch check: draw one integer from [0, 5) with an int16 result dtype.
np.random.randint(0, 5, dtype=np.int16)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

3

In [234]:
def get_seq(word, max_len=5, verbose=False):
    """Chain idioms so that each one follows from the previous one.

    Parameters
    ----------
    word : str
        Starting idiom; it is not included in the result.
    max_len : int, default 5
        Number of idioms to generate.
    verbose : bool, default False
        When true, print each idiom as it is appended.

    Returns
    -------
    list of str
        The generated idioms in order. Any exception raised by
        ``get_random_one`` propagates and no partial chain is returned.
    """
    chain = []
    current = word
    for _ in range(max_len):
        # Each step continues from the idiom produced by the previous step.
        current = get_random_one(current)
        chain.append(current)
        if verbose:
            print(f"[ADDED]: {current}")
    return chain

# get_seq raises when a chain reaches a syllable that no idiom starts with, so
# start over from scratch on failure. The number of attempts is bounded so a
# start word that can never be completed (or any persistent error) cannot spin
# the cell forever.
MAX_ATTEMPTS = 1000
for _ in range(MAX_ATTEMPTS):
    try:
        word_seq = get_seq("山穷水尽", max_len=50, verbose=True)
        break
    except Exception as e:
        print(f"[ERROR]: {e}")
else:
    # Reached only when every attempt failed (the loop never hit `break`).
    raise RuntimeError(f"no complete idiom chain found after {MAX_ATTEMPTS} attempts")
word_seq

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 进贤任能


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 能者为师


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 诗以言志


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 彘肩斗酒


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 九流三教


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 较德焯勤


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 琴瑟失调


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 调舌弄唇


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 唇齿相依


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 一献三酬


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 愁红怨绿


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 虑周藻密


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 密云不雨


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 语重心沉


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 陈谷子烂芝麻


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 麻雀虽小，五脏俱全


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 权重秩卑


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 悲歌击筑


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 著述等身


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 深谷为陵


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 菱角磨作鸡头


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 投传而去


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 去危就安


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 安于故俗，溺于旧闻


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 文不对题


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 蹄间三寻


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 循名课实


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 十室九匮


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 愧悔无地


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 地狱变相


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 像心如意


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 逸群之才


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 才高意广


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 广种薄收


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 收园结果


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 果于自信


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 信而有征


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 铮铮佼佼


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 矫世变俗


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 俗不可耐


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 耐人寻味


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 未足轻重


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 重赏之下，必有勇夫


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 肤如凝脂


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 之死靡它


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 他山之石，可以攻玉


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 鬻儿卖女


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 女长须嫁


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 价等连城


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[ADDED]: 惩一儆百


['进贤任能',
 '能者为师',
 '诗以言志',
 '彘肩斗酒',
 '九流三教',
 '较德焯勤',
 '琴瑟失调',
 '调舌弄唇',
 '唇齿相依',
 '一献三酬',
 '愁红怨绿',
 '虑周藻密',
 '密云不雨',
 '语重心沉',
 '陈谷子烂芝麻',
 '麻雀虽小，五脏俱全',
 '权重秩卑',
 '悲歌击筑',
 '著述等身',
 '深谷为陵',
 '菱角磨作鸡头',
 '投传而去',
 '去危就安',
 '安于故俗，溺于旧闻',
 '文不对题',
 '蹄间三寻',
 '循名课实',
 '十室九匮',
 '愧悔无地',
 '地狱变相',
 '像心如意',
 '逸群之才',
 '才高意广',
 '广种薄收',
 '收园结果',
 '果于自信',
 '信而有征',
 '铮铮佼佼',
 '矫世变俗',
 '俗不可耐',
 '耐人寻味',
 '未足轻重',
 '重赏之下，必有勇夫',
 '肤如凝脂',
 '之死靡它',
 '他山之石，可以攻玉',
 '鬻儿卖女',
 '女长须嫁',
 '价等连城',
 '惩一儆百']