# 正文: Optimal Phonetic Component Analysis

*(Tip: use **Run -> Run All Cells** to execute full analysis!)*

## Stage 1. Load Unihan data

In [65]:
import pandas as pd
import json
import re
import csv
import unicodedata
from collections import Counter

import typing

## Output files: don't change these
RESULT_CSV = "result/Stats.csv"

## Based on the Unihan data file format specs (you probably don't need to change these)
DATAFILE_PATH = "data/Unihan_%s.txt"
DATAFILE_DELIM = '\t'
DATAFILE_COMMENT = '#'
DATAFILE_BASENAMES = ['codepoint', 'prop', 'val'] # The values will get renamed according to PROPS_NAMES
DATA_DIR = "data/%s"

# Unihan data files we need (and what properties they provide of interest to us)
SOURCES = {
    "DictionaryLikeData": ["kPhonetic"],
    "Readings": ["kMandarin"],
    "Variants": ["kSimplifiedVariant"],
}
# What to rename each of the properties desired above [OPTIONAL]
PROPS_NAMES = {
    "kMandarin": "read",
    "kPhonetic": "write",
    "kSimplifiedVariant": "simplified",
}

# Will get populated by loadData()
DATAFRAMES = {}

def loadData():
    global DATAFRAMES

    for source in SOURCES:
        sourceDF = pd.read_csv(DATAFILE_PATH % source, delimiter=DATAFILE_DELIM, comment=DATAFILE_COMMENT,
                               header=None, names=DATAFILE_BASENAMES)
        for prop in SOURCES[source]:
            print(f"[*]\t {prop} [{source}]")
            DATAFRAMES[prop] = sourceDF[sourceDF['prop'] == prop] \
                .drop('prop', axis=1) \
                .rename(columns={'val': PROPS_NAMES.get(prop, prop)}) \
                .reset_index(drop=True)

## Stage 1: Data loading
print("[*] Loading data:")

# DataFrame Reading
loadData()
pinyin_df = DATAFRAMES['kMandarin']
radical_df = DATAFRAMES['kPhonetic']

t2s_table = DATAFRAMES['kSimplifiedVariant']
def t2s_convert(codepoint: str):
    conversion = t2s_table[t2s_table['codepoint'] == codepoint]
    if not len(conversion):
        return codepoint
    return conversion.iloc[0]['simplified'].split(' ')[0] # If there are multiple possible conversions, return the first
    # (most likely) one

def strip(s):
    s = s.translate(str.maketrans("ǖǘǚǜ", "vvvv"))
    normalized = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in normalized if unicodedata.category(ch) != 'Mn')

# Format pinyin_df
pinyin_df["read"] = pinyin_df["read"].apply(strip)
pinyin_df["read"] = pinyin_df["read"].astype(str).str.replace("v", "ü")
# Duplicate readings
pinyin_df["read"] = pinyin_df["read"].astype(str).str.strip()
pinyin_df = (pinyin_df.assign(read=pinyin_df["read"].str.split(r"[\s,]+")).explode("read").reset_index(drop=True))

# Format radical_df
radical_df["write"] = radical_df["write"].astype(str).str.replace("*", "", regex=False)
radical_df["write"] = radical_df["write"].astype(str).apply(lambda s: [w for w in s.split() if 'x' not in w]) # each radical
# may belong to multiple phonetic groups (e.g. it contains one phonetic component which is a subcomponent of another in it).
# we want to keep these group IDs as lists, so we can group each character multiple times below.
radical_df = radical_df.reset_index(drop=True)

# Make the phonetic dictionary
radical_df['char'] = radical_df['codepoint'].apply(t2s_convert).apply(lambda x: chr(int(x[2:], 16)))
phonetic_dict = radical_df.explode('write').groupby('write')['char'].apply(list).to_dict()

print("[+] Data loaded!")

[*] Loading data:
[*]	 kPhonetic [DictionaryLikeData]
[*]	 kMandarin [Readings]
[*]	 kSimplifiedVariant [Variants]
[+] Data loaded!


#### Examine

In [66]:
radical_df

Unnamed: 0,codepoint,write,char
0,U+3405,"[954, 1156]",㐅
1,U+340C,"[1471, 1545]",㐌
2,U+341F,[365],㐟
3,U+3421,[1631],㐡
4,U+3424,[63],㐤
...,...,...,...
22451,U+322E7,[1167],𲋧
22452,U+322E8,[537],𲋨
22453,U+322F9,[101],𲋹
22454,U+32313,[1257A],𲌓


In [67]:
phonetic_dict

{'1': ['丫', '吖'],
 '10': ['㡸',
  '㸲',
  '𮉣',
  '䝫',
  '䟭',
  '䩆',
  '乍',
  '作',
  '厏',
  '咋',
  '妰',
  '岝',
  '岞',
  '怍',
  '怎',
  '拃',
  '昨',
  '柞',
  '泎',
  '炸',
  '痄',
  '砟',
  '祚',
  '秨',
  '窄',
  '笮',
  '胙',
  '舴',
  '苲',
  '蚱',
  '诈',
  '诈',
  '迮',
  '酢',
  '𬬽',
  '阼',
  '𫗢',
  '鲊',
  '鲊',
  '𱌬',
  '𡗸',
  '𢂃',
  '𣬿',
  '𥅁',
  '𥹁',
  '𦥬',
  '𧯤',
  '𧲮',
  '𨋘',
  '𩂖',
  '𩢐',
  '𩬟',
  '𪌟'],
 '100': ['只', '矍', '蒦', '只'],
 '1000': ['旅', '派', '脉', '脉', '衇', '𠂢'],
 '1001': ['牌', '簰'],
 '1002': ['㓦',
  '㢶',
  '㪶',
  '㹮',
  '㼣',
  '佰',
  '咟',
  '帞',
  '弼',
  '栢',
  '洦',
  '瓸',
  '百',
  '皕',
  '粨',
  '絔',
  '袹',
  '貊',
  '銆',
  '陌',
  '𧻙',
  '𩢷'],
 '1002A': ['㴼', '宿', '樎', '𩘰'],
 '1003': ['伯',
  '帕',
  '帛',
  '廹',
  '怕',
  '拍',
  '柏',
  '泊',
  '珀',
  '白',
  '百',
  '皋',
  '皐',
  '穆',
  '粕',
  '胉',
  '舶',
  '迫',
  '铂',
  '魄',
  '𦫖',
  '𦫙'],
 '1003A': ['碧'],
 '1004': ['匐', '卜', '卜'],
 '1005': ['斑', '班', '𢴬', '𤡰'],
 '1005A': ['癍'],
 '1006': ['扳', '攀', '袢', '襻'],
 '1007': ['㱶',
  '嘭',
  '彭',
  '

## Stage 2: Analysis

In [68]:
# Merge and save to 
df = pd.merge(pinyin_df, radical_df, on="codepoint")

## Stage 2: Analysis

stats = (df.groupby('read')['write'].agg(lambda x: x.value_counts().idxmax()).reset_index())

In [69]:
print("[*] Analyzing...")

def mapper(ks):
    chars_grps = []
    for key in ks:
        key = key.strip()
        if key in phonetic_dict:
            chars_grps.append(phonetic_dict[key])
    return " | ".join([" ".join(chars) for chars in chars_grps])

stats["map"] = stats.iloc[:, 1].apply(mapper)

stats.to_csv(RESULT_CSV, index=False)

# Above only takes most common by total number of characters, below scales by frequency

freq_df = pd.read_csv(DATA_DIR % "SUBTLEX-CH.csv", sep=",")
freq_df["Word"] = freq_df["Word"].astype(str)
freq_df["WCount"] = pd.to_numeric(freq_df["WCount"], errors="coerce").fillna(0)

# Split multi-character words and divide frequency equally
char_freq = {}
for _, row in freq_df.iterrows():
    word = row["Word"].strip()
    freq = row["WCount"]
    if not word:
        continue
    share = freq / len(word)
    for ch in word:
        char_freq[ch] = char_freq.get(ch, 0) + share

def phonetic_sorter(row):
    read = row["read"]
    keys = row["write"]
    info = []

    for k in keys:
        if k not in phonetic_dict:
            print("[WARN]", k, "not in phonetic_dict!")
            continue

        chars = list(dict.fromkeys(phonetic_dict[k]))  # deduplicate
        same, different = [], []

        for ch in chars:
            freq = char_freq.get(ch, 0.0)
            # Find Mandarin readings for this character
            reads = pinyin_df.loc[pinyin_df["codepoint"] == f"U+{ord(ch):04X}", "read"].tolist()
            if read in reads:
                same.append((ch, freq))
            else:
                different.append((ch, freq))

        # Sort by descending frequency
        same.sort(key=lambda x: x[1], reverse=True)
        different.sort(key=lambda x: x[1], reverse=True)

        info.append({
            k: {
                "same": [c for c, _ in same],
                "different": [c for c, _ in different]
            }
        })

    return info
print("[*]\t Phonetically sorting...")
stats["correspondences"] = stats.apply(phonetic_sorter, axis=1)

# Drop the temporary frequency key
for dlist in stats["correspondences"]:
    for d in dlist:
        for v in d.values():
            v.pop("freq", None)

# Duplication removal
def dedup_string(s):
    """Remove duplicate characters from a string while preserving order."""
    seen = set()
    return ''.join(ch for ch in s if not (ch in seen or seen.add(ch)))

def dedup_phonetic_info(info_list):
    """Remove duplicate characters in 'same' and 'different' lists within the phonetic_info structure."""
    if not isinstance(info_list, list):
        return info_list
    cleaned = []
    for entry in info_list:
        if not isinstance(entry, dict):
            continue
        key = list(entry.keys())[0]
        val = entry[key]
        same = val.get("same", [])
        diff = val.get("different", [])
        # Preserve order
        val["same"] = list(dict.fromkeys(same))
        val["different"] = list(dict.fromkeys(diff))
        cleaned.append({key: val})
    return cleaned

# Apply deduplication
stats["map"] = stats["map"].astype(str).apply(dedup_string)
stats["correspondences"] = stats["correspondences"].apply(dedup_phonetic_info)

# Save final result
stats.to_csv(RESULT_CSV, index=False)

# Summarize below

def extract_summary(info_list):
    if not info_list or not isinstance(info_list, list):
        return pd.Series({
            "1st_write": None,
            "1st_same": [],
            "1st_different": [],
            "1st_same_len": 0,
            "1st_different_len": 0,
            "1st_congruency": 0,
            "most_congruency_write": None,
            "most_congruency_same": [],
            "most_congruency_different": [],
            "most_congruency_same_len": 0,
            "most_congruency_different_len": 0,
            "most_congruency_congruency": 0
        })

    # --- First (most common) phonetic index ---
    first_entry = info_list[0]
    first_key = list(first_entry.keys())[0]
    first_val = first_entry[first_key]
    same = first_val.get("same", [])
    diff = first_val.get("different", [])
    same_len = len(same)
    diff_len = len(diff)
    first_congruency = same_len / (same_len + diff_len) if (same_len + diff_len) > 0 else 0

    # --- Most congruent phonetic index ---
    best_key = None
    best_val = None
    best_congruency = -1
    for entry in info_list:
        k = list(entry.keys())[0]
        v = entry[k]
        s_len = len(v.get("same", []))
        d_len = len(v.get("different", []))
        c = s_len / (s_len + d_len) if (s_len + d_len) > 0 else 0
        if c > best_congruency:
            best_congruency = c
            best_key = k
            best_val = v

    best_same = best_val.get("same", []) if best_val else []
    best_diff = best_val.get("different", []) if best_val else []

    return pd.Series({
        "1st_write": first_key,
        "1st_same": ' '.join(same),
        "1st_different": ' '.join(diff),
        "1st_same_len": same_len,
        "1st_different_len": diff_len,
        "1st_congruency": first_congruency,
        "most_congruency_write": best_key,
        "most_congruency_same": ' '.join(best_same),
        "most_congruency_different": ' '.join(best_diff),
        "most_congruency_same_len": len(best_same),
        "most_congruency_different_len": len(best_diff),
        "most_congruency_congruency": best_congruency
    })

print("[*]\t Extracting summary...")
summary_df = stats["correspondences"].apply(extract_summary)
stats = pd.concat([stats, summary_df], axis=1)

stats.to_csv(RESULT_CSV, index=False)

print("[*] Analysis complete; saved to:", RESULT_CSV)

[*] Analyzing...
[*]	 Phonetically sorting...
[*]	 Extracting summary...
[*] Analysis complete; saved to: result/Stats.csv


## Final Output

In [70]:
stats

Unnamed: 0,read,write,map,correspondences,1st_write,1st_same,1st_different,1st_same_len,1st_different_len,1st_congruency,most_congruency_write,most_congruency_same,most_congruency_different,most_congruency_same_len,most_congruency_different_len,most_congruency_congruency
0,a,[3],䋪 啊娿婀屙疴锕阿𠥍𡹣𬮰,"[{'3': {'same': ['啊', '阿', '锕'], 'different': ...",3,啊 阿 锕,婀 䋪 娿 屙 疴 𠥍 𡹣 𬮰,3,8,0.272727,3,啊 阿 锕,婀 䋪 娿 屙 疴 𠥍 𡹣 𬮰,3,8,0.272727
1,ai,[994],𫣊 嗳嫒爱暧瑷𫉁𰾭叆,"[{'994': {'same': ['爱', '暧', '嗳', '𫣊', '嫒', '瑷...",994,爱 暧 嗳 𫣊 嫒 瑷 𫉁 𰾭 叆,,9,0,1.000000,994,爱 暧 嗳 𫣊 嫒 瑷 𫉁 𰾭 叆,,9,0,1.000000
2,an,[995],㝧 䀂䅁䢿侒咹姲安峖按晏案桉氨洝胺荌铵鞌鞍𱂨𫛩𡪙𦛅𧵨𨴣䯃,"[{'995': {'same': ['安', '案', '按', '胺', '氨', '鞍...",995,安 案 按 胺 氨 鞍 铵 桉 䀂 䅁 䢿 侒 峖 洝 荌 鞌 𡪙,晏 咹 㝧 姲 𱂨 𫛩 𦛅 𧵨 𨴣 䯃,17,10,0.629630,995,安 案 按 胺 氨 鞍 铵 桉 䀂 䅁 䢿 侒 峖 洝 荌 鞌 𡪙,晏 咹 㝧 姲 𱂨 𫛩 𦛅 𧵨 𨴣 䯃,17,10,0.629630
3,ang,[1528],㼜 㿮䇦䒋𱃵佒央姎岟怏抰映柍殃泱炴盎眏秧紻胦英𬨄𫓭雵鞅𩧫鸯𣃝𧲱𧵌𩲴𪎞𱌀𪚻,"[{'1528': {'same': ['盎', '㼜', '䇦'], 'different...",1528,盎 㼜 䇦,英 映 央 殃 鸯 秧 怏 泱 㿮 䒋 𱃵 佒 姎 岟 抰 柍 炴 眏 紻 胦 𬨄 𫓭 雵 ...,3,32,0.085714,1528,盎 㼜 䇦,英 映 央 殃 鸯 秧 怏 泱 㿮 䒋 𱃵 佒 姎 岟 抰 柍 炴 眏 紻 胦 𬨄 𫓭 雵 ...,3,32,0.085714
4,ao,[966],傲 厫嗷廒敖滶熬獒璈聱謷赘遨鏊骜鳌鼇𰾧,"[{'966': {'same': ['傲', '熬', '嗷', '敖', '遨', '獒...",966,傲 熬 嗷 敖 遨 獒 鏊 鳌 骜 厫 廒 滶 璈 聱 謷 鼇,赘 𰾧,16,2,0.888889,966,傲 熬 嗷 敖 遨 獒 鏊 鳌 骜 厫 廒 滶 璈 聱 謷 鼇,赘 𰾧,16,2,0.888889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,zu,[97],㚗 㡹㸖䏣䖕䢐䢸䣯䪶䯶䱉且伹俎冝刞助咀坥姐宜岨徂怚抯柤查査殂沮爼狙珇疽砠祖租笡粗组罝耝舋苴虘...,"[{'97': {'same': ['组', '阻', '租', '祖', '诅', '俎'...",97,组 阻 租 祖 诅 俎 䖕 爼 珇 靻 𤱌 𩲲,查 且 助 姐 粗 宜 沮 狙 蛆 疽 咀 龃 殂 罝 苴 趄 㚗 㡹 㸖 䏣 䢐 䢸 䣯 ...,12,79,0.131868,97,组 阻 租 祖 诅 俎 䖕 爼 珇 靻 𤱌 𩲲,查 且 助 姐 粗 宜 沮 狙 蛆 疽 咀 龃 殂 罝 苴 趄 㚗 㡹 㸖 䏣 䢐 䢸 䣯 ...,12,79,0.131868
404,zuan,[28],𰃆 𰉄攒𪴙瓒缵臜𬤮賛赞趱酂鉆钻𰿆,"[{'28': {'same': ['钻', '缵', '𰿆'], 'different':...",28,钻 缵 𰿆,赞 攒 𰃆 𰉄 𪴙 瓒 臜 𬤮 賛 趱 酂 鉆,3,12,0.200000,28,钻 缵 𰿆,赞 攒 𰃆 𰉄 𪴙 瓒 臜 𬤮 賛 趱 酂 鉆,3,12,0.200000
405,zui,[291],㠑 嶵檌罪,"[{'291': {'same': ['罪', '㠑', '嶵', '檌'], 'diffe...",291,罪 㠑 嶵 檌,,4,0,1.000000,291,罪 㠑 嶵 檌,,4,0,1.000000
406,zun,[270],䔿 僔噂墫尊嶟撙樽𰬺罇蹲遵𨱔鳟𫜄𥊭𥢎𩯄,"[{'270': {'same': ['尊', '遵', '鳟', '樽', '䔿', '僔...",270,尊 遵 鳟 樽 䔿 僔 噂 墫 嶟 撙 𰬺 罇 𨱔 𫜄 𥊭 𥢎 𩯄,蹲,17,1,0.944444,270,尊 遵 鳟 樽 䔿 僔 噂 墫 嶟 撙 𰬺 罇 𨱔 𫜄 𥊭 𥢎 𩯄,蹲,17,1,0.944444
