In [2]:
import json, math
import pandas as pd
import numpy as np
from collections import Counter

# Read the corpus JSON and tabulate it. The cells below read the
# "filename", "group" and "tokens" fields of every record.
with open("all_Lin_Shu_token_pos_ner_by_subfolder.json", encoding="utf-8") as fh:
    data = json.load(fh)
df = pd.DataFrame(data)

# --- Functions ---
def ttr(tokens):
    """Type-token ratio: number of distinct tokens divided by number of tokens.

    Returns NaN when ``tokens`` is empty (falsy).
    """
    if not tokens:
        return np.nan
    distinct = len(set(tokens))
    total = len(tokens)
    return distinct / total

def root_ttr(tokens):
    """Root TTR: number of distinct tokens divided by sqrt(number of tokens).

    Returns NaN when ``tokens`` is empty (falsy).
    """
    if not tokens:
        return np.nan
    distinct = len(set(tokens))
    return distinct / math.sqrt(len(tokens))

def mattr(tokens, window_size=500):
    """Moving-average type-token ratio (MATTR).

    A window of ``window_size`` tokens is slid over the sequence one token at
    a time; the type-token ratio of every window is computed and the mean of
    those ratios is returned.

    Parameters
    ----------
    tokens : sequence of hashable items
        The tokenised text (must support ``len`` and slicing).
    window_size : int, default 500
        Number of tokens per window; must be at least 1.

    Returns
    -------
    float
        NaN for an empty sequence. If the text is shorter than the window,
        the plain type-token ratio of the whole text is returned instead.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1. Such a value used to end in a
        ZeroDivisionError (0) or in a meaningless result (negative values).
    """
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size!r}")

    n = len(tokens)
    if n == 0:
        return np.nan
    if n < window_size:
        # Text too short for a single full window: fall back to plain TTR.
        return len(set(tokens)) / n

    # Maintain the multiset of the current window incrementally so each step
    # costs O(1) instead of rebuilding a set per window.
    counter = Counter(tokens[:window_size])
    uniq_counts = [len(counter)]
    for i in range(window_size, n):
        out_tok = tokens[i - window_size]  # token leaving the window
        in_tok = tokens[i]                 # token entering the window
        counter[out_tok] -= 1
        if counter[out_tok] == 0:
            # Drop exhausted types so len(counter) stays the number of types.
            del counter[out_tok]
        counter[in_tok] += 1
        uniq_counts.append(len(counter))
    return np.mean([uc / window_size for uc in uniq_counts])

# --- Compute metrics ---
# One record per document. MATTR is reported for three window sizes so the
# sensitivity of the measure to the window choice can be inspected.
results = []
for filename, group, tokens in zip(df["filename"], df["group"], df["tokens"]):
    results.append({
        "filename": filename,
        "group": group,
        "n_tokens": len(tokens),
        "TTR": ttr(tokens),
        "Root_TTR": root_ttr(tokens),
        "MATTR_300": mattr(tokens, 300),
        "MATTR_500": mattr(tokens, 500),
        "MATTR_800": mattr(tokens, 800)
    })

metrics_df = pd.DataFrame(results)

# --- Aggregate by group ---
# Named aggregation keeps the column index flat. A dict-of-lists spec yields
# two-level (MultiIndex) columns, which to_csv(index=False) writes as a
# two-row header that has to be read back with header=[0, 1].
agg_df = (metrics_df
          .groupby("group")
          .agg(n_tokens_count=("n_tokens", "count"),
               n_tokens_mean=("n_tokens", "mean"),
               n_tokens_median=("n_tokens", "median"),
               TTR_mean=("TTR", "mean"),
               Root_TTR_mean=("Root_TTR", "mean"),
               MATTR_300_mean=("MATTR_300", "mean"),
               MATTR_500_mean=("MATTR_500", "mean"),
               MATTR_800_mean=("MATTR_800", "mean"))
          .reset_index())

# Save outputs
metrics_df.to_csv("linshu_mattr_metrics_per_doc.csv", index=False)
agg_df.to_csv("linshu_mattr_metrics_by_group.csv", index=False)

print("Per-document metrics saved to linshu_mattr_metrics_per_doc.csv")
print("By-group averages saved to linshu_mattr_metrics_by_group.csv")


Per-document metrics saved to linshu_mattr_metrics_per_doc.csv
By-group averages saved to linshu_mattr_metrics_by_group.csv


In [4]:
import json
import pandas as pd
import math

# === Read the corpus JSON (a list of per-text records) ===
with open("all_Lin_Shu_token_pos_ner_by_subfolder.json", encoding="utf-8") as fh:
    data = json.load(fh)

# Desired average number of tokens per chunk.
TARGET_SIZE = 100

def chunk_evenly(tokens, pos_tags, target_size=100):
    """
    Split tokens/pos into K contiguous chunks so that:
      - all tokens are used (no drop)
      - chunk sizes differ by at most 1
      - average size ≈ target_size

    Parameters
    ----------
    tokens, pos_tags : sequences of equal length (must support slicing)
    target_size : number, default 100
        Desired average chunk size; must be positive.

    Returns
    -------
    list of (chunk_id, token_slice, pos_slice)
        ``chunk_id`` is 1-based. An empty input yields an empty list rather
        than a single empty chunk, so callers never receive a zero-length
        chunk to normalise by.

    Raises
    ------
    ValueError
        If ``target_size`` is not positive.
    AssertionError
        If ``tokens`` and ``pos_tags`` differ in length.
    """
    if target_size <= 0:
        raise ValueError(f"target_size must be positive, got {target_size!r}")

    n = len(tokens)
    assert n == len(pos_tags), "tokens and pos length mismatch"
    if n == 0:
        return []

    # Number of chunks ~ n / target_size: at least 1, and never more than n
    # so that no chunk can come out empty.
    k = min(n, max(1, round(n / target_size)))

    # Distribute sizes as evenly as possible: first 'rem' chunks get +1
    base, rem = divmod(n, k)

    chunks = []
    start = 0
    for j in range(1, k + 1):
        end = start + base + (1 if j <= rem else 0)
        chunks.append((j, tokens[start:end], pos_tags[start:end]))
        start = end

    # Sanity check: the slices must tile the input exactly.
    assert start == n
    return chunks

# === Chunking process: every token is kept and chunk sizes stay near TARGET_SIZE ===
chunked_rows = []

for entry in data:
    filename, group = entry["filename"], entry["group"]
    pieces = chunk_evenly(entry["tokens"], entry["pos"], TARGET_SIZE)
    # One output record per chunk; the chunk number is appended to the
    # source filename to form the chunk's own name.
    chunked_rows.extend(
        {
            "filename": f"{filename}_chunk{chunk_id}",
            "group": group,
            "tokens": tok_chunk,
            "pos": pos_chunk,
            "chunk_id": chunk_id,
            "chunk_len": len(tok_chunk),
        }
        for chunk_id, tok_chunk, pos_chunk in pieces
    )

df_chunks = pd.DataFrame(chunked_rows)
print(f"✅ Created {len(df_chunks)} chunks from {len(data)} texts.")
print(df_chunks["chunk_len"].describe())

# === Save the chunk table as JSON records and as CSV ===
df_chunks.to_json("chunked_ave100_token_blocks.json", force_ascii=False, orient="records", indent=2)
df_chunks.to_csv("chunked_ave100_token_blocks.csv", index=False, encoding="utf-8-sig")


✅ Created 14257 chunks from 29 texts.
count    14257.000000
mean       100.003577
std          0.237602
min         99.000000
25%        100.000000
50%        100.000000
75%        100.000000
max        101.000000
Name: chunk_len, dtype: float64


In [8]:
import pandas as pd

# Read the chunk table (columns: filename, group, tokens, pos, chunk_id, chunk_len).
df = pd.read_json("chunked_ave100_token_blocks.json")

# 1) Normaliser: tokens per chunk, derived from the token list when absent.
if "token_count" not in df.columns:
    df["token_count"] = df["tokens"].apply(len)

# 2) A pronoun is any token whose POS tag equals PRON_TAG.
PRON_TAG = "PN"
df["pron_count"] = df["pos"].apply(lambda tags: sum(t == PRON_TAG for t in tags))

# 3) Share of pronouns within each chunk.
df["pron_rel"] = df["pron_count"].div(df["token_count"])

# 4) Per-group mean/median/std plus a normal-approximation 95% CI of the mean.
by_group = df.groupby("group")
group_summary = by_group.agg(
    n_chunks=("filename", "count"),
    mean_rel=("pron_rel", "mean"),
    median_rel=("pron_rel", "median"),
    std_rel=("pron_rel", "std"),
).reset_index()
n_per_group = by_group["pron_rel"].count().values
group_summary["se_rel"] = group_summary["std_rel"] / (n_per_group ** 0.5)
half_width = 1.96 * group_summary["se_rel"]
group_summary["ci95_low"] = group_summary["mean_rel"] - half_width
group_summary["ci95_high"] = group_summary["mean_rel"] + half_width

print("=== Relative pronoun frequency by group ===")
print(group_summary)

# 5) The five chunks with the highest pronoun share in each group.
ranked = df.sort_values("pron_rel", ascending=False)
top5 = ranked.groupby("group", group_keys=False).head(5)
top5 = top5[["filename", "group", "token_count", "pron_count", "pron_rel"]]

print("\n=== Top 5 chunks per group (by relative pronoun frequency) ===")
print(top5)

# 6) (Optional) Save outputs
# df.to_csv("pron_rel_per_chunk.csv", index=False, encoding="utf-8-sig")
# group_summary.to_csv("pron_rel_by_group.csv", index=False, encoding="utf-8-sig")


=== Relative pronoun frequency by group ===
               group  n_chunks  mean_rel  median_rel   std_rel    se_rel  \
0    Original Novels      1211  0.035256        0.03  0.024037  0.000691   
1  Translated Novels     13046  0.076142        0.07  0.038198  0.000334   

   ci95_low  ci95_high  
0  0.033902   0.036610  
1  0.075487   0.076798  

=== Top 5 chunks per group (by relative pronoun frequency) ===
                              filename              group  token_count  \
1917     Aiji jinta poushi ji_chunk707  Translated Novels           99   
2424    Aisilan qingxia zhuan_chunk465  Translated Novels          100   
8536  Sanqian nian yan shi ji_chunk441  Translated Novels          100   
1688     Aiji jinta poushi ji_chunk478  Translated Novels          100   
4670         Gu gui yi jin ji_chunk401  Translated Novels          100   
1063       Yuan hai ling guang_chunk52    Original Novels          100   
556         Jinghua bi xue lu_chunk175    Original Novels           99

In [16]:
import pandas as pd
from collections import Counter

# Read the chunk table (needs: filename, group, tokens, pos, chunk_id, chunk_len).
df = pd.read_json("chunked_ave100_token_blocks.json")

# POS tags that this analysis counts as function words.
fw_tags = [
    "AS", "BA", "CC", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC",
    "FW", "IC", "IJ", "LB", "LC", "MSP", "PN", "P", "PU", "SB", "SP",
]

# Normaliser: prefer the stored chunk length, else count the tokens.
if "token_count" not in df.columns:
    if "chunk_len" in df.columns:
        df["token_count"] = df["chunk_len"]
    else:
        df["token_count"] = df["tokens"].apply(len)

# Function-word tokens per chunk (by POS tag) and their share of the chunk.
df["function_word_count"] = df["pos"].apply(
    lambda tags: len([t for t in tags if t in fw_tags])
)
df["function_word_rel"] = df["function_word_count"].div(df["token_count"])

# Row label of the chunk with the highest share in each group.
idxmax = df.groupby("group")["function_word_rel"].idxmax()
top_chunks = df.loc[idxmax].copy()

# Report every group's top chunk: header, per-tag breakdown, token list.
for _, row in top_chunks.iterrows():
    # (token, POS) pairs restricted to the function-word tags
    fw_pairs = [pair for pair in zip(row["tokens"], row["pos"]) if pair[1] in fw_tags]
    pos_counts = Counter(tag for _, tag in fw_pairs)

    print(f"=== Group: {row['group']} ===")
    print(f"Filename: {row['filename']}")
    print(f"Chunk ID: {row.get('chunk_id')}, tokens: {row['token_count']}, "
          f"function words: {row['function_word_count']} "
          f"({row['function_word_rel']:.3f})")
    print("Function-word POS breakdown:", dict(pos_counts))
    print("Function-word tokens (token/POS):")
    print(", ".join(f"{tok}/{tag}" for tok, tag in fw_pairs))
    print()


=== Group: Original Novels ===
Filename: Jinghua bi xue lu_chunk126
Chunk ID: 126, tokens: 100, function words: 30 (0.300)
Function-word POS breakdown: {'SP': 4, 'PN': 11, 'P': 4, 'DT': 2, 'DEG': 2, 'DEC': 2, 'CS': 1, 'LB': 1, 'MSP': 2, 'CC': 1}
Function-word tokens (token/POS):
也/SP, 我/PN, 以/P, 之/PN, 諸/DT, 於/P, 何/PN, 吾/PN, 之/PN, 以/P, 之/DEG, 我/PN, 於/P, 之/DEC, 我/PN, 之/DEC, 我/PN, 若/CS, 爲/LB, 所/MSP, 矣/SP, 此/DT, 之/DEG, 而/MSP, 我/PN, 此/PN, 我/PN, 也/SP, 然/SP, 及/CC

=== Group: Translated Novels ===
Filename: Aiji jinta poushi ji_chunk478
Chunk ID: 478, tokens: 100, function words: 42 (0.420)
Function-word POS breakdown: {'PN': 24, 'SP': 4, 'MSP': 1, 'DT': 2, 'P': 6, 'DEG': 4, 'CS': 1}
Function-word tokens (token/POS):
何/PN, 也/SP, 余/PN, 爾/PN, 而/MSP, 此/DT, 與/P, 汝/PN, 吾/PN, 之/PN, 吾/PN, 之/PN, 此/DT, 與/P, 汝/PN, 者/SP, 汝/PN, 於/P, 吾/PN, 與/P, 爾/PN, 爾/PN, 吾/PN, 之/DEG, 汝/PN, 之/PN, 汝/PN, 我/PN, 也/SP, 汝/PN, 與/P, 我/PN, 此/PN, 之/DEG, 汝/PN, 之/PN, 耶/SP, 雖然/CS, 汝/PN, 以/P, 之/DEG, 之/DEG



In [20]:
import pandas as pd
from collections import Counter

# --- Read the chunk table ---
df = pd.read_json("chunked_ave100_token_blocks.json")

# --- POS tags that this analysis counts as function words ---
fw_tags = [
    "AS", "BA", "CC", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC",
    "FW", "IC", "IJ", "LB", "LC", "MSP", "P", "PN", "PU", "SB", "SP",
]

# --- Tokens per chunk: the stored chunk_len when present, else a fresh count ---
if "token_count" not in df.columns:
    if "chunk_len" in df.columns:
        df["token_count"] = df["chunk_len"]
    else:
        df["token_count"] = df["tokens"].apply(len)

# --- POS counts per chunk -> wide columns POS_AD, POS_VV, ... ---
def pos_counter(tags):
    """Tally the tags of one chunk.

    Every tag is converted with ``str`` first; the result maps
    ``"POS_<tag>"`` to the number of occurrences, in first-seen order.
    """
    tally = Counter(str(tag) for tag in tags)
    return {"POS_" + label: count for label, count in tally.items()}

# Expand the per-chunk tallies into a wide table with one POS_* column per
# tag. The frame is built once from the list of dicts; expanding with
# `.apply(pd.Series)` constructs a separate Series for every row, which is
# needlessly slow on a table with many chunks.
pos_count_df = (
    pd.DataFrame(df["pos"].map(pos_counter).tolist(), index=df.index)
      .fillna(0)      # a tag absent from a chunk counts as 0
      .astype(int)
)

# --- Attach counts back ---
dfc = pd.concat([df.reset_index(drop=True), pos_count_df.reset_index(drop=True)], axis=1)

# --- Sum function-word counts and compute relative frequency ---
# Only tags that actually occur in the data have a POS_* column.
fw_pos_cols = [f"POS_{t}" for t in fw_tags if f"POS_{t}" in dfc.columns]
dfc["function_word_count"] = dfc[fw_pos_cols].sum(axis=1) if fw_pos_cols else 0
dfc["function_word_rel"] = dfc["function_word_count"] / dfc["token_count"]  # with 100-token chunks, this ≈ count/100

# (Optional) per-tag relatives for inspection
for col in fw_pos_cols:
    dfc[col.replace("POS_", "REL_")] = dfc[col] / dfc["token_count"]

# --- Compare by group ---
group_summary = (
    dfc.groupby("group")
       .agg(n_chunks=("filename","count"),
            mean_rel=("function_word_rel","mean"),
            median_rel=("function_word_rel","median"),
            std_rel=("function_word_rel","std"))
       .reset_index()
)

# 95% CI for the mean (normal approx)
# The per-group n is looked up by group label, so the standard error does not
# depend on the two tables happening to list the groups in the same order.
n_per_group = dfc.groupby("group")["function_word_rel"].count()
n_obs = group_summary["group"].map(n_per_group)
group_summary["se_rel"] = group_summary["std_rel"] / n_obs**0.5
group_summary["ci95_low"]  = group_summary["mean_rel"] - 1.96 * group_summary["se_rel"]
group_summary["ci95_high"] = group_summary["mean_rel"] + 1.96 * group_summary["se_rel"]

print("=== Function-word relative frequency by group ===")
print(group_summary)

# --- Top 5 chunks per group ---
top5 = (
    dfc.sort_values("function_word_rel", ascending=False)
       .groupby("group", group_keys=False)
       .head(5)
)
print("\n=== Top 5 chunks per group (by function-word relative frequency) ===")
print(top5[["filename","group","token_count","function_word_count","function_word_rel"]])

# --- (Optional) Save ---
# dfc.to_csv("function_word_relfreq_per_chunk_100.csv", index=False, encoding="utf-8-sig")
# group_summary.to_csv("function_word_relfreq_by_group_100.csv", index=False, encoding="utf-8-sig")

# --- (Optional) Significance test if exactly two groups ---
# from scipy.stats import mannwhitneyu
# g = {k: v["function_word_rel"].values for k, v in dfc.groupby("group")}
# if len(g) == 2:
#     (g1, g2) = list(g.keys())
#     print(mannwhitneyu(g[g1], g[g2], alternative="two-sided"))

=== Function-word relative frequency by group ===
               group  n_chunks  mean_rel  median_rel   std_rel    se_rel  \
0    Original Novels      1211  0.142143        0.14  0.041543  0.001194   
1  Translated Novels     13046  0.204969        0.20  0.051226  0.000448   

   ci95_low  ci95_high  
0  0.139803   0.144483  
1  0.204090   0.205848  

=== Top 5 chunks per group (by function-word relative frequency) ===
                            filename              group  token_count  \
1688   Aiji jinta poushi ji_chunk478  Translated Novels          100   
14090         Zhongru dulou_chunk424  Translated Novels          100   
1920   Aiji jinta poushi ji_chunk710  Translated Novels           99   
1917   Aiji jinta poushi ji_chunk707  Translated Novels           99   
1918   Aiji jinta poushi ji_chunk708  Translated Novels           99   
507       Jinghua bi xue lu_chunk126    Original Novels          100   
731          Jinguo Yangqiu_chunk166    Original Novels           99   


In [22]:
import pandas as pd
from collections import Counter

# --- Load chunks (needs: filename, group, tokens, pos, chunk_id, chunk_len) ---
# Read the chunk file that the chunking cell writes and that the other cells
# read. This cell used to open "chunked_100_token_blocks.json", which none of
# the visible cells produce, so it failed on a fresh kernel unless that file
# already existed on disk.
df = pd.read_json("chunked_ave100_token_blocks.json")

# --- POS tags that this analysis counts as function words ---
fw_tags = [
    "AS","BA","CC","CS","DEC","DEG","DER","DEV","DT","ETC",
    "FW","IC","IJ","LB","LC","MSP","P","PN","PU","SB","SP",
    
]

# --- Ensure token_count for normalization ---
# Prefer the stored chunk length; otherwise count the tokens.
if "token_count" not in df.columns:
    df["token_count"] = df["chunk_len"] if "chunk_len" in df.columns else df["tokens"].apply(len)

# --- Compute function-word relative frequency per chunk ---
df["function_word_count"] = df["pos"].apply(lambda tags: sum(1 for t in tags if t in fw_tags))
df["function_word_rel"] = df["function_word_count"] / df["token_count"]

# --- Helper to pretty-print a chunk ---
def print_chunk(row):
    """Print one chunk for inspection.

    ``row`` is a mapping-like record (``row[key]`` and ``row.get(key)``) with
    the keys group, filename, chunk_id, token_count, function_word_count,
    function_word_rel, tokens and pos. The output is a header, the tokens
    joined without and with spaces, the token/POS pairs, and a dashed rule.
    """
    print(f"=== Group: {row['group']} ===")
    print(f"Filename: {row['filename']}  |  Chunk ID: {row.get('chunk_id')}")
    print(f"Tokens in chunk: {row['token_count']}  |  Function words: {row['function_word_count']} "
          f"({row['function_word_rel']:.3f})\n")

    # Two renderings of the same tokens: running text and space-separated.
    tokens = row["tokens"]
    joined, spaced = "".join(tokens), " ".join(tokens)

    print("— Chunk as continuous text (no spaces):")
    print(joined, "\n")
    print("— Chunk with spaces (for inspection):")
    print(spaced, "\n")

    # Every token together with its POS tag.
    print("— token/POS pairs:")
    tagged = (f"{tok}/{tag}" for tok, tag in zip(row["tokens"], row["pos"]))
    print(" ".join(tagged))
    print(f"\n{'-' * 80}\n")

# --- Locate the chunk with the highest function_word_rel in every group and print it ---
idxmax = df.groupby("group")["function_word_rel"].idxmax()
top_chunks = df.loc[idxmax]

for _, chunk_row in top_chunks.iterrows():
    print_chunk(chunk_row)


=== Group: Original Novels ===
Filename: Jinghua bi xue lu_chunk126  |  Chunk ID: 126
Tokens in chunk: 100  |  Function words: 30 (0.300)

— Chunk as continuous text (no spaces):
妄也我決不戕教且以力衛之聯軍諸將曰於何取證沈曰吾即證之以主教之身主教今日所以得生能訟我於聯軍審判之堂即我保衛之力我若不衛主教久已爲亂民所殺矣何能留此完全之軀命而訟我此即我所以衛主教也諸將鹹以爲然得不坐乃定廷穆死刑廷穆滿洲人恂恂孝友能書畫藏楊椒山手跡及 

— Chunk with spaces (for inspection):
妄 也 我 決 不 戕 教 且 以 力 衛 之 聯軍 諸 將 曰 於 何 取證 沈 曰 吾 即 證 之 以 主教 之 身 主教 今日 所以 得 生 能 訟 我 於 聯軍 審判 之 堂 即 我 保衛 之 力 我 若 不 衛 主教 久已 爲 亂民 所 殺 矣 何 能 留 此 完全 之 軀 命 而 訟 我 此 即 我 所以 衛 主教 也 諸將 鹹 以爲 然 得 不 坐 乃 定 廷 穆 死刑 廷 穆 滿洲 人 恂恂 孝友 能 書畫 藏 楊椒山 手跡 及 

— token/POS pairs:
妄/VV 也/SP 我/PN 決/AD 不/AD 戕/VV 教/NN 且/AD 以/P 力/NN 衛/VV 之/PN 聯軍/NN 諸/DT 將/NN 曰/VV 於/P 何/PN 取證/VV 沈/NR 曰/VV 吾/PN 即/AD 證/VV 之/PN 以/P 主教/NN 之/DEG 身/NN 主教/NN 今日/NT 所以/AD 得/VV 生/NN 能/VV 訟/VV 我/PN 於/P 聯軍/NN 審判/VV 之/DEC 堂/NN 即/VC 我/PN 保衛/VV 之/DEC 力/NN 我/PN 若/CS 不/AD 衛/VV 主教/NN 久已/AD 爲/LB 亂民/NN 所/MSP 殺/VV 矣/SP 何/AD 能/VV 留/VV 此/DT 完全/JJ 之/DEG 軀/NN 命/NN 而/MSP 訟/VV 我/PN 此/PN 即/AD 我/PN 所以/AD 衛/VV 主教/NN 也/SP 諸將/NN 鹹/AD 以

In [6]:
import re
import numpy as np
import pandas as pd

# -----------------------------
# 0) Read the chunk table
# -----------------------------
# Columns used below: filename, group, tokens, pos, chunk_id, chunk_len
df = pd.read_json("chunked_ave100_token_blocks.json")

# Tokens per chunk: the stored chunk_len when present, else a fresh count.
if "token_count" not in df.columns:
    if "chunk_len" in df.columns:
        df["token_count"] = df["chunk_len"]
    else:
        df["token_count"] = df["tokens"].apply(len)

# -----------------------------
# 1) Pronouns per chunk (tokens whose POS tag equals PRON_TAG)
# -----------------------------
PRON_TAG = "PN"
df["pron_count"] = df["pos"].apply(lambda tags: sum(t == PRON_TAG for t in tags))
# Per-chunk share, kept for diagnostics only.
df["pron_rel"] = df["pron_count"].div(df["token_count"])

# -----------------------------
# 2) Aggregate to PER-NOVEL (token-weighted pronoun rate)
#    pron_rate_weighted = total pronouns / total tokens in the novel
# -----------------------------
def base_id(name: str) -> str:
    """Recover the novel id by removing a trailing ``_chunk<digits>`` suffix.

    The argument is converted with ``str`` first; a name without such a
    suffix is returned unchanged.
    """
    text = str(name)
    hit = re.search(r"(.*)_chunk\d+$", text)
    if hit is None:
        return text
    return hit.group(1)

df["novel_id"] = df["filename"].map(base_id)

# Collapse the chunks of every novel: chunk count, pronoun and token totals,
# and the plain mean of the per-chunk shares (kept for comparison).
novel_aggs = {
    "n_chunks": ("filename", "count"),
    "sum_pron": ("pron_count", "sum"),
    "sum_tok": ("token_count", "sum"),
    "mean_of_chunks": ("pron_rel", "mean"),
}
per_novel = df.groupby(["novel_id", "group"], as_index=False).agg(**novel_aggs)
# Token-weighted rate: all pronouns of the novel over all of its tokens.
per_novel["pron_rate_weighted"] = per_novel["sum_pron"].div(per_novel["sum_tok"])

print("Per-novel (token-weighted) pronoun rate — first rows:")
print(per_novel.head())

# -----------------------------
# 3) Group summary with one observation per novel
# -----------------------------
measure = "pron_rate_weighted"  # or "mean_of_chunks"

grp = per_novel.groupby("group")[measure]
novel_summary = grp.agg(
    n_texts="count",
    mean_rel="mean",
    median_rel="median",
    std_rel="std",
)
novel_summary = novel_summary.reset_index()

# Add SE and 95% CI (t-based if SciPy available; else normal approx)
def add_ci_t_or_normal(summary: pd.DataFrame) -> pd.DataFrame:
    """Add the standard error and a 95% confidence interval of the mean.

    Parameters
    ----------
    summary : pd.DataFrame
        One row per group with the columns ``n_texts``, ``mean_rel`` and
        ``std_rel``.

    Returns
    -------
    pd.DataFrame
        The same frame (modified in place) with the extra columns
        ``se_rel``, ``ci95_low`` and ``ci95_high``.

    Notes
    -----
    The critical value is Student's t with ``n - 1`` degrees of freedom
    (floored at 1) when SciPy can be imported, and the normal value 1.96
    otherwise. Only a failed import triggers the fallback; any other error
    raised while computing the t quantile propagates instead of silently
    producing a normal-approximation interval.
    """
    n = summary["n_texts"].to_numpy()
    mean = summary["mean_rel"].to_numpy()
    se = summary["std_rel"].to_numpy() / np.sqrt(n)
    try:
        from scipy.stats import t
    except ImportError:
        # Normal approx fallback
        crit = 1.96
    else:
        crit = t.ppf(0.975, df=np.maximum(n - 1, 1))
    summary["se_rel"] = se
    summary["ci95_low"] = mean - crit * se
    summary["ci95_high"] = mean + crit * se
    return summary

novel_summary = add_ci_t_or_normal(novel_summary)

print("\nGroup-level summary (novel-level, token-weighted):")
print(novel_summary[["group","n_texts","mean_rel","median_rel","std_rel","se_rel","ci95_low","ci95_high"]])

# -----------------------------
# 4) Optional: test & effect size between two groups (novel level)
# -----------------------------
groups = {g: s[measure].values for g, s in per_novel.groupby("group")}
if len(groups) == 2:
    (g1, g2) = list(groups.keys())
    x, y = groups[g1], groups[g2]

    # Welch's t-test (does not assume equal variances).
    # Only a missing SciPy skips the test; any other failure is a real error
    # and is allowed to surface instead of being reported as "not available".
    try:
        from scipy.stats import ttest_ind
    except ImportError as e:
        print("\n[Welch t-test skipped: SciPy not available]", e)
    else:
        welch = ttest_ind(x, y, equal_var=False)
        print(f"\nWelch t-test on {measure}: t={welch.statistic:.3f}, p={welch.pvalue:.3g}")

    # Hedges' g (bias-corrected Cohen's d)
    sx, sy = np.std(x, ddof=1), np.std(y, ddof=1)
    nx, ny = len(x), len(y)
    dof = nx + ny - 2  # degrees of freedom of the pooled standard deviation
    sp = np.sqrt(((nx-1)*sx**2 + (ny-1)*sy**2) / max(dof, 1))
    d = (np.mean(x) - np.mean(y)) / sp if sp > 0 else np.nan
    # Small-sample correction J = 1 - 3 / (4*dof - 1), i.e. 1 - 3 / (4*(nx+ny) - 9).
    # It is applied whenever dof >= 2: the correction matters most for small
    # samples, so it must not be switched off for them (at dof = 1 the
    # approximation degenerates to 0, hence the guard).
    J = 1 - (3 / (4*dof - 1)) if dof >= 2 else 1.0
    g_hedges = J * d
    print(f"Hedges' g ({g1} - {g2}) on {measure}: {g_hedges:.3f}")
else:
    print("\nTwo-sample test skipped (need exactly two groups).")

# -----------------------------
# 5) Optional: save outputs
# -----------------------------
# per_novel.to_csv("per_novel_pronoun_weighted.csv", index=False, encoding="utf-8-sig")
# novel_summary.to_csv("group_summary_novel_level_pronouns_weighted.csv", index=False, encoding="utf-8-sig")


Per-novel (token-weighted) pronoun rate — first rows:
                        novel_id              group  n_chunks  sum_pron  \
0           Aiji jinta poushi ji  Translated Novels       749      6536   
1          Aisilan qingxia zhuan  Translated Novels       577      4498   
2             Chan chao ji shang  Translated Novels       506      4178   
3                Chan chao ji xu  Translated Novels       469      3554   
4  Feizhou yanshui chou cheng lu  Translated Novels       758      5351   

   sum_tok  mean_of_chunks  pron_rate_weighted  
0    74854        0.087325            0.087317  
1    57743        0.077900            0.077897  
2    50637        0.082506            0.082509  
3    46934        0.075731            0.075723  
4    75769        0.070627            0.070623  

Group-level summary (novel-level, token-weighted):
               group  n_texts  mean_rel  median_rel   std_rel    se_rel  \
0    Original Novels        6  0.035480    0.031462  0.008421  0.003438   

In [10]:
import pandas as pd
from collections import Counter

# --- Read the chunk table (needs: filename, group, tokens, pos, chunk_id, chunk_len) ---
df = pd.read_json("chunked_ave100_token_blocks.json")

# --- Tags counted in this cell: only "PN". The columns below keep the
# --- function_word_* names although just this one tag is counted.
fw_tags = [
    "PN",
]

# --- Tokens per chunk: the stored chunk_len when present, else a fresh count ---
if "token_count" not in df.columns:
    if "chunk_len" in df.columns:
        df["token_count"] = df["chunk_len"]
    else:
        df["token_count"] = df["tokens"].apply(len)

# --- Count the tagged tokens per chunk and divide by the chunk size ---
df["function_word_count"] = df["pos"].apply(
    lambda tags: len([t for t in tags if t in fw_tags])
)
df["function_word_rel"] = df["function_word_count"].div(df["token_count"])

# --- Helper to pretty-print a chunk ---
def print_chunk(row):
    """Print one chunk for inspection.

    ``row`` is a mapping-like record (``row[key]`` and ``row.get(key)``) with
    the keys group, filename, chunk_id, token_count, function_word_count,
    function_word_rel, tokens and pos. The output is a header, the tokens
    joined without and with spaces, the token/POS pairs, and a dashed rule.
    """
    print(f"=== Group: {row['group']} ===")
    print(f"Filename: {row['filename']}  |  Chunk ID: {row.get('chunk_id')}")
    print(f"Tokens in chunk: {row['token_count']}  |  Function words: {row['function_word_count']} "
          f"({row['function_word_rel']:.3f})\n")

    # Two renderings of the same tokens: running text and space-separated.
    tokens = row["tokens"]
    joined, spaced = "".join(tokens), " ".join(tokens)

    print("— Chunk as continuous text (no spaces):")
    print(joined, "\n")
    print("— Chunk with spaces (for inspection):")
    print(spaced, "\n")

    # Every token together with its POS tag.
    print("— token/POS pairs:")
    tagged = (f"{tok}/{tag}" for tok, tag in zip(row["tokens"], row["pos"]))
    print(" ".join(tagged))
    print(f"\n{'-' * 80}\n")

# --- Locate the chunk with the highest function_word_rel in every group and print it ---
idxmax = df.groupby("group")["function_word_rel"].idxmax()
top_chunks = df.loc[idxmax]

for _, chunk_row in top_chunks.iterrows():
    print_chunk(chunk_row)


=== Group: Original Novels ===
Filename: Yuan hai ling guang_chunk52  |  Chunk ID: 52
Tokens in chunk: 100  |  Function words: 13 (0.130)

— Chunk as continuous text (no spaces):
也於是夫婦合詞再三請婦躍起曰爾何知今日人挾其僞義以我爲俎上肉恣其分啖所恃者外家有人爲平其曲直今爾夫婦亦爲之揚波而助瀾試問今日人利吾產即畀以產設更趣吾命爾亦坐聽吾死耶紛呶既久日且晡議仍未定匠氏告槥成且殮而嗣事猶懸伯亢聲曰吾固貧 

— Chunk with spaces (for inspection):
也 於是 夫婦 合 詞 再三 請 婦 躍起 曰 爾 何 知 今日 人 挾 其 僞 義 以 我 爲 俎 上 肉 恣 其 分 啖 所 恃 者 外家 有人 爲 平 其 曲直 今 爾 夫婦 亦 爲 之 揚 波 而 助 瀾 試問 今日 人 利 吾 產 即 畀 以 產 設 更 趣 吾 命 爾 亦 坐 聽 吾 死 耶 紛 呶 既 久 日 且 晡 議 仍未 定 匠 氏 告 槥 成 且 殮 而 嗣 事 猶 懸 伯 亢 聲 曰 吾 固 貧 

— token/POS pairs:
也/SP 於是/AD 夫婦/NN 合/VV 詞/NN 再三/AD 請/VV 婦/NN 躍起/VV 曰/VV 爾/PN 何/AD 知/VV 今日/NT 人/NN 挾/VV 其/PN 僞/JJ 義/NN 以/P 我/PN 爲/VV 俎/NN 上/LC 肉/NN 恣/VV 其/PN 分/VV 啖/VV 所/MSP 恃/VV 者/SP 外家/NN 有人/PN 爲/P 平/VV 其/PN 曲直/NN 今/NT 爾/PN 夫婦/NN 亦/AD 爲/P 之/PN 揚/VV 波/NN 而/MSP 助/VV 瀾/NN 試問/VV 今日/NT 人/NN 利/VV 吾/PN 產/NN 即/AD 畀/VV 以/P 產/NN 設/VV 更/AD 趣/NN 吾/PN 命/VV 爾/PN 亦/AD 坐/VV 聽/VV 吾/PN 死/NN 耶/SP 紛/AD 呶/VV 既/AD 久/VA 日/NN 且/AD 晡/VV 議/NN 仍未/AD 定/VV 匠/NR 氏/NN 告/VV 槥/VV 

In [12]:
# --- Print top K chunks per group (Translated vs Original) ---
# NOTE: this cell reuses `df` (with its function_word_rel column) and
# `print_chunk` from cells executed earlier in the session.
K = 5

# If you know the exact group names, set them here; else auto-detect:
target_groups = ["Translated Novels", "Original Novels"]
# Group labels in order of first appearance. The fallback below uses this
# list rather than the set, whose iteration order is not stable across runs.
seen_groups = list(df["group"].unique())
available = set(seen_groups)
groups_to_use = [g for g in target_groups if g in available] or seen_groups

for g in groups_to_use:
    sub = df[df["group"] == g]
    if sub.empty:
        continue
    # keep="all" retains ties at the cut-off; head(K) then caps the list at K rows.
    topk = sub.nlargest(K, "function_word_rel", keep="all").head(K)
    print("\n" + "="*80)
    print(f"Top {len(topk)} chunks in group: {g}")
    print("="*80 + "\n")
    for _, row in topk.iterrows():
        print_chunk(row)



Top 5 chunks in group: Translated Novels

=== Group: Translated Novels ===
Filename: Aiji jinta poushi ji_chunk707  |  Chunk ID: 707
Tokens in chunk: 99  |  Function words: 24 (0.242)

— Chunk as continuous text (no spaces):
乎汝其侮我矣余曰格魯巴亞聽之汝臨命近矣此非吾侮殆大神譴汝汝相吾面誰耶汝以為吾黔黑之面跛蹩之何來者非愁怨所集何由至是汝亦知吾果為誰格魯巴亞注目而視狀如野人顫聲言曰吾知汝矣以神譴言之汝得毋為夏馬之夏馬之汝死久奈何得生意其爲厲弄我啣 

— Chunk with spaces (for inspection):
乎 汝 其 侮 我 矣 余 曰 格魯巴亞 聽 之 汝 臨 命 近 矣 此 非 吾 侮 殆 大神 譴 汝 汝 相 吾 面 誰 耶 汝 以為 吾 黔 黑 之 面 跛 蹩 之 何 來 者 非 愁 怨 所 集 何 由 至 是 汝 亦 知 吾 果 為 誰 格魯巴亞 注目 而 視 狀 如 野人 顫 聲 言 曰 吾 知 汝 矣 以 神 譴 言 之 汝 得 毋 為 夏馬 之 夏馬 之 汝 死 久 奈何 得 生意 其 爲 厲 弄 我 啣 

— token/POS pairs:
乎/SP 汝/PN 其/PN 侮/VV 我/PN 矣/SP 余/PN 曰/VV 格魯巴亞/NR 聽/VV 之/PN 汝/PN 臨/VV 命/NN 近/VA 矣/SP 此/PN 非/VC 吾/PN 侮/VV 殆/VV 大神/NN 譴/VV 汝/PN 汝/PN 相/VV 吾/PN 面/VV 誰/PN 耶/SP 汝/PN 以為/VV 吾/PN 黔/NR 黑/NR 之/DEG 面/NN 跛/VV 蹩/VV 之/SP 何/DT 來/VV 者/SP 非/AD 愁/NN 怨/NN 所/MSP 集/VV 何/AD 由/NN 至/VV 是/VC 汝/PN 亦/AD 知/VV 吾/PN 果/AD 為/VC 誰/PN 格魯巴亞/NR 注目/VV 而/MSP 視/VV 狀/NN 如/P 野人/NN 顫/VV 聲/NN 言/VV 曰/VV 吾/PN 知/VV 汝/PN 矣/SP 以/P 神

In [18]:
from collections import Counter
import pandas as pd

K = 5

# --- Keep only the pronouns, i.e. the tokens whose POS tag is PN ---
def extract_PN(tokens, tags):
    """Return the tokens whose paired POS tag equals "PN", in original order."""
    picked = []
    for tok, tag in zip(tokens, tags):
        if tag == "PN":
            picked.append(tok)
    return picked

# Reuse top_pron_chunks (the top 5 chunks of each group) when an earlier cell
# has already created it; otherwise rank the chunks by their number of
# PN-tagged tokens and keep the first K of every group.
if 'top_pron_chunks' not in globals():
    if 'pron_PN_tokens' not in df.columns:
        df['pron_PN_tokens'] = df.apply(lambda r: extract_PN(r['tokens'], r['pos']), axis=1)
    df['pron_PN_total'] = df['pron_PN_tokens'].map(len)

    # Within every group (e.g. "Translated Novels", "Original Novels") take
    # the K chunks that contain the most pronouns.
    ranked = df.sort_values(['group', 'pron_PN_total'], ascending=[True, False])
    top5 = ranked.groupby('group', group_keys=False).head(K).copy()
else:
    top5 = top_pron_chunks.copy()

# Make sure the PN list and the PN total exist (top5 may come from elsewhere).
if 'pron_PN_tokens' not in top5.columns:
    top5['pron_PN_tokens'] = top5.apply(lambda r: extract_PN(r['tokens'], r['pos']), axis=1)
if 'pron_PN_total' not in top5.columns:
    top5['pron_PN_total'] = top5['pron_PN_tokens'].map(len)

# --- 逐 chunk 統計「每個 PN 代詞 token 的計數」並列印 ---
def print_pn_counts_for_chunk(row):
    """Print the PN total of one chunk and a tally of its PN tokens.

    ``row`` is a mapping-like record with the keys group, filename, chunk_id,
    pron_PN_tokens and pron_PN_total. The tally lists the most frequent
    token first; a placeholder line is printed when the chunk has no PN token.
    """
    gid = row.get('chunk_id')
    print(f"=== 組別: {row['group']} | 檔名: {row['filename']} | 分塊ID: {gid} ===")
    tally = Counter(row['pron_PN_tokens'])
    print(f"PN 代詞總數: {row['pron_PN_total']}")
    if not tally:
        print("（本分塊無 PN 代詞）")
    else:
        listing = "  ".join(f"{tok}:{cnt}" for tok, cnt in tally.most_common())
        print("各代詞計數（token:次數）:", listing)
    print("-" * 80)

for _, r in top5.iterrows():
    print_pn_counts_for_chunk(r)

# --- Also build a long-format table (one row per chunk and PN token) for later analysis or export ---
rows = []
for _, r in top5.iterrows():
    c = Counter(r['pron_PN_tokens'])
    # A chunk without any PN token still gets one placeholder row (None, 0).
    entries = list(c.items()) or [(None, 0)]
    rows.extend(
        {
            "group": r["group"],
            "filename": r["filename"],
            "chunk_id": r.get("chunk_id"),
            "pn_token": tok,
            "count": cnt
        }
        for tok, cnt in entries
    )

sort_keys = ["group", "filename", "chunk_id", "count"]
pn_counts_per_chunk = pd.DataFrame(rows).sort_values(
    sort_keys, ascending=[True, True, True, False]
).reset_index(drop=True)

# To save the table:
# pn_counts_per_chunk.to_csv("top5_chunks_PN_代詞逐塊計數.csv", index=False, encoding="utf-8-sig")


=== 組別: Original Novels | 檔名: Yuan hai ling guang_chunk52 | 分塊ID: 52 ===
PN 代詞總數: 13
各代詞計數（token:次數）: 吾:4  爾:3  其:3  我:1  有人:1  之:1
--------------------------------------------------------------------------------
=== 組別: Original Novels | 檔名: Jinghua bi xue lu_chunk175 | 分塊ID: 175 ===
PN 代詞總數: 12
各代詞計數（token:次數）: 其:5  汝:3  爾:2  人家:1  吾:1
--------------------------------------------------------------------------------
=== 組別: Original Novels | 檔名: Jinguo Yangqiu_chunk134 | 分塊ID: 134 ===
PN 代詞總數: 12
各代詞計數（token:次數）: 吾:5  爾:4  汝:2  若:1
--------------------------------------------------------------------------------
=== 組別: Original Novels | 檔名: Yuan hai ling guang_chunk58 | 分塊ID: 58 ===
PN 代詞總數: 12
各代詞計數（token:次數）: 吾:4  之:2  爾:2  此:1  其:1  汝:1  餘:1
--------------------------------------------------------------------------------
=== 組別: Original Novels | 檔名: Yuan hai ling guang_chunk93 | 分塊ID: 93 ===
PN 代詞總數: 12
各代詞計數（token:次數）: 我:4  爾:2  吾:2  之:1  其:1  何時:1  有人:1
-------------------------