In [None]:
import json, math
import pandas as pd
import numpy as np
from collections import Counter

# Load the corpus: parse the JSON file into `data`, then wrap it in a DataFrame.
# Code further down reads the "tokens", "filename" and "group" columns of `df`.
with open("all_Lin_Shu_token_pos_ner_by_subfolder.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

df = pd.DataFrame(data)

# --- Functions ---
def ttr(tokens):
    """Return the type-token ratio: distinct tokens divided by total tokens.

    An empty (falsy) ``tokens`` yields NaN instead of dividing by zero.
    """
    if not tokens:
        return np.nan
    distinct = set(tokens)
    return len(distinct) / len(tokens)

def root_ttr(tokens):
    """Return the root TTR: distinct-token count over sqrt(total tokens).

    An empty (falsy) ``tokens`` yields NaN instead of dividing by zero.
    """
    if not tokens:
        return np.nan
    n_types = len(set(tokens))
    n_tokens = len(tokens)
    return n_types / math.sqrt(n_tokens)

def mattr(tokens, window_size=500):
    """Return the moving-average type-token ratio (MATTR) of ``tokens``.

    A window of ``window_size`` consecutive tokens is slid over the sequence
    one position at a time; the type-token ratio of every window is computed
    and the ratios are averaged.

    Args:
        tokens: Sequence of hashable tokens (must support ``len``, indexing
            and slicing).
        window_size: Number of tokens per window; must be at least 1.

    Returns:
        NaN for an empty sequence; the plain type-token ratio when the
        sequence is shorter than one window; otherwise the mean of the
        per-window ratios.

    Raises:
        ValueError: If ``tokens`` is non-empty and ``window_size`` is less
            than 1. Without this check a zero window divides by zero and a
            negative one silently produces a meaningless negative value.
    """
    n = len(tokens)
    if n == 0:
        return np.nan
    # Validated after the empty-input guard so empty documents keep returning NaN.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size!r}")
    if n < window_size:
        # Too short for a single full window: fall back to the overall TTR.
        return len(set(tokens)) / n

    # Token counts inside the current window; len(counter) is its type count.
    counter = Counter(tokens[:window_size])
    uniq_counts = [len(counter)]
    for i in range(window_size, n):
        out_tok = tokens[i - window_size]
        in_tok = tokens[i]
        counter[out_tok] -= 1
        if counter[out_tok] == 0:
            # Drop exhausted entries so len(counter) keeps counting only present types.
            del counter[out_tok]
        counter[in_tok] += 1
        uniq_counts.append(len(counter))
    return np.mean([uc / window_size for uc in uniq_counts])

# --- Compute metrics ---
# Build one record of lexical-diversity measures per row of df.
results = []
for _, row in df.iterrows():
    tokens = row["tokens"]
    doc_metrics = {"filename": row["filename"], "group": row["group"]}
    doc_metrics["n_tokens"] = len(tokens)
    doc_metrics["TTR"] = ttr(tokens)
    doc_metrics["Root_TTR"] = root_ttr(tokens)
    # MATTR is reported at three window sizes: 300, 500 and 800 tokens.
    doc_metrics["MATTR_300"] = mattr(tokens, 300)
    doc_metrics["MATTR_500"] = mattr(tokens, 500)
    doc_metrics["MATTR_800"] = mattr(tokens, 800)
    results.append(doc_metrics)

metrics_df = pd.DataFrame(results)

# --- Aggregate by group ---
# n_tokens gets three aggregations (count, mean, median); every other metric
# is averaged within its group.
agg_spec = {
    "n_tokens": ["count", "mean", "median"],
    **dict.fromkeys(["TTR", "Root_TTR", "MATTR_300", "MATTR_500", "MATTR_800"], "mean"),
}
grouped = metrics_df.groupby("group")
agg_df = grouped.agg(agg_spec).reset_index()

# Save outputs
metrics_df.to_csv("linshu_mattr_metrics_per_doc.csv", index=False)
agg_df.to_csv("linshu_mattr_metrics_by_group.csv", index=False)

print("Per-document metrics saved to linshu_mattr_metrics_per_doc.csv")
print("By-group averages saved to linshu_mattr_metrics_by_group.csv")
