# Cladder v1 (`cladder-v1-q-easy.json`) 结构速览

目标：快速查看 `Dataset/cladder-v1/cladder-v1-q-easy.json` 的顶层结构、字段，以及 `meta` / `reasoning` 的子结构。

In [1]:
from pathlib import Path
import json

# Prefer the repo-relative location; fall back to an absolute Windows path.
rel_path = Path("Dataset/cladder-v1/cladder-v1-q-easy.json")
abs_path = Path(r"E:\PHD\01\Dataset\cladder-v1\cladder-v1-q-easy.json")

path = rel_path if rel_path.exists() else abs_path
if not path.exists():
    # Raise explicitly instead of asserting: `assert` statements are removed
    # when Python runs with -O, which would let a missing file slip through.
    raise FileNotFoundError(f"File not found; tried {rel_path} and {abs_path}")

print("File:", path)
print("Size (MB):", round(path.stat().st_size / (1024**2), 2))

# Parse the whole JSON document into memory.
with path.open("r", encoding="utf-8") as f:
    data = json.load(f)

print("Top-level type:", type(data).__name__)
print("Num records:", len(data))


File: Dataset\cladder-v1\cladder-v1-q-easy.json
Size (MB): 10.9
Top-level type: list
Num records: 10560


In [2]:
# Take the first record and show its top-level field names.
item0 = data[0]
list(item0)


['question_id',
 'desc_id',
 'given_info',
 'question',
 'answer',
 'meta',
 'reasoning']

In [3]:
import json

# Pretty-print the first record, cut to the first 3000 characters, followed
# by an ellipsis line marking the truncation.
print(
    json.dumps(item0, ensure_ascii=False, indent=2)[:3000],
    "...",
    sep="\n",
)


{
  "question_id": 1,
  "desc_id": "alarm-mediation-ate-model0-spec0-q1",
  "given_info": "For husbands that don't set the alarm, the probability of ringing alarm is 42%. For husbands that set the alarm, the probability of ringing alarm is 51%.",
  "question": "Will alarm set by husband decrease the chance of ringing alarm?",
  "answer": "no",
  "meta": {
    "story_id": "alarm",
    "graph_id": "mediation",
    "treated": true,
    "result": true,
    "polarity": false,
    "groundtruth": 0.09148280511411633,
    "query_type": "ate",
    "rung": 2,
    "formal_form": "E[Y | do(X = 1)] - E[Y | do(X = 0)]",
    "given_info": {
      "p(Y | X)": [
        0.4218874364506764,
        0.5133702415647927
      ]
    },
    "estimand": "P(Y=1|X=1) - P(Y=1|X=0)",
    "treatment": "X",
    "outcome": "Y",
    "model_id": 0
  },
  "reasoning": {
    "step0": "Let X = husband; V2 = wife; Y = alarm clock.",
    "step1": "X->V2,X->Y,V2->Y",
    "step2": "E[Y | do(X = 1)] - E[Y | do(X = 0)]",
    "

In [4]:
from collections import Counter

# Count, across all records, how many times each top-level field name occurs.
top_keys = Counter(key for record in data for key in record.keys())

top_keys


Counter({'question_id': 10560,
         'desc_id': 10560,
         'given_info': 10560,
         'question': 10560,
         'answer': 10560,
         'meta': 10560,
         'reasoning': 10560})

In [5]:
# Sub-field names of `meta` / `reasoning`
meta_keys, reasoning_keys = Counter(), Counter()

for x in data:
    meta, reasoning = x.get("meta"), x.get("reasoning")

    # Only dict-valued entries contribute; anything else is skipped.
    if isinstance(meta, dict):
        meta_keys.update(meta.keys())
    if isinstance(reasoning, dict):
        reasoning_keys.update(reasoning.keys())

print("meta keys:")
meta_keys


meta keys:


Counter({'story_id': 10560,
         'graph_id': 10560,
         'polarity': 10560,
         'groundtruth': 10560,
         'query_type': 10560,
         'rung': 10560,
         'formal_form': 10560,
         'given_info': 10560,
         'treatment': 10560,
         'outcome': 10560,
         'model_id': 10560,
         'treated': 6228,
         'estimand': 3984,
         'result': 3108,
         'flipped': 1644,
         'bad_candidate_set': 1644,
         'action': 1476,
         'mediators': 1212,
         'baseline': 336,
         'collider': 336})

In [6]:
# Display the tally of `reasoning` sub-field names.
print("reasoning keys:")
reasoning_keys


reasoning keys:


Counter({'step0': 8916,
         'step1': 8916,
         'step2': 8916,
         'step3': 8916,
         'step4': 8916,
         'step5': 8916,
         'end': 8916})

In [7]:
# Value distributions for a few commonly used fields (add or remove as needed).
query_type, story_id, graph_id, answer = Counter(), Counter(), Counter(), Counter()

for x in data:
    m = x.get("meta", {})
    if isinstance(m, dict):
        # A missing key is tallied under None.
        query_type.update([m.get("query_type")])
        story_id.update([m.get("story_id")])
        graph_id.update([m.get("graph_id")])
    answer.update([x.get("answer")])

query_type, answer


(Counter({'marginal': 1644,
          'backadj': 1644,
          'ate': 1476,
          'correlation': 1476,
          'det-counterfactual': 1476,
          'ett': 1296,
          'nie': 828,
          'nde': 384,
          'exp_away': 168,
          'collider_bias': 168}),
 Counter({'no': 5280, 'yes': 5280}))

In [8]:
# 递归摘要：用很浅的层级看看嵌套结构（抽样前 N 条，避免太慢）
from collections import defaultdict


def iter_paths(obj, prefix=(), max_items=2):
    """Yield ``(path, type)`` pairs for *obj* and everything nested inside it.

    Args:
        obj: The value to walk. Dicts and lists are descended into; any
            other value is reported as a leaf.
        prefix: Tuple of path components leading to *obj*.
        max_items: How many leading elements of each list to descend into.
            Defaults to 2 to avoid path explosion on long lists; ``None``
            walks every element.

    Yields:
        ``(path, type)`` tuples, where ``path`` is a tuple of strings: dict
        keys are converted with ``str`` and list positions are rendered as
        ``"[i]"``. Containers are reported as ``dict`` / ``list``; leaves as
        ``type(value)``.
    """
    if isinstance(obj, dict):
        yield prefix, dict
        for key, value in obj.items():
            yield from iter_paths(value, prefix + (str(key),), max_items)
    elif isinstance(obj, list):
        yield prefix, list
        # Only sample the leading elements so long lists do not blow up the
        # number of distinct paths.
        for index, value in enumerate(obj[:max_items]):
            yield from iter_paths(value, prefix + (f"[{index}]",), max_items)
    else:
        yield prefix, type(obj)


def summarize_schema(records, n=200):
    """Tally the value types seen at each nested path in the first *n* records.

    Returns a ``defaultdict`` mapping a dotted path string (``"<root>"`` for
    the record itself) to a ``Counter`` of type names observed at that path.
    """
    tally = defaultdict(Counter)
    for record in records[:n]:
        for parts, kind in iter_paths(record):
            # The empty path denotes the record itself.
            label = ".".join(parts) if parts else "<root>"
            tally[label][kind.__name__] += 1
    return tally


# Summarize the first 200 records and print at most 80 paths, in sorted order.
schema = summarize_schema(data, n=200)
for k in sorted(schema)[:80]:
    counts = dict(schema[k])
    print(f"{k}: {counts}")
print("... (showing first 80 paths)")


<root>: {'dict': 200}
answer: {'str': 200}
desc_id: {'str': 200}
given_info: {'str': 200}
meta: {'dict': 200}
meta.bad_candidate_set: {'list': 39}
meta.bad_candidate_set.[0]: {'str': 33}
meta.estimand: {'str': 89}
meta.flipped: {'bool': 39}
meta.formal_form: {'str': 200}
meta.given_info: {'dict': 161, 'list': 39}
meta.given_info.P(X): {'float': 34}
meta.given_info.P(X=1): {'float': 38}
meta.given_info.P(Y | X): {'list': 34}
meta.given_info.P(Y | X).[0]: {'float': 34}
meta.given_info.P(Y | X).[1]: {'float': 34}
meta.given_info.P(Y=1, X=0): {'float': 38}
meta.given_info.P(Y=1, X=1): {'float': 38}
meta.given_info.[0]: {'list': 39}
meta.given_info.[0].[0]: {'str': 20}
meta.given_info.[1]: {'list': 39}
meta.given_info.[1].[0]: {'str': 19}
meta.given_info.p(V1): {'list': 6}
meta.given_info.p(V1).[0]: {'float': 6}
meta.given_info.p(V2 | X): {'list': 12}
meta.given_info.p(V2 | X).[0]: {'float': 12}
meta.given_info.p(V2 | X).[1]: {'float': 12}
meta.given_info.p(V3 | X): {'list': 8}
meta.given_i