# WIQA causal triple pipeline test

本 Notebook 使用 Hugging Face 的 WIQA 数据集，串联本地 Ollama 生成三元组、语义排序（Top‑M 平均）、置信度融合筛选，以及最终的 more/less/no_effect 判定。


In [1]:
import sys, os, importlib, json
sys.path.append(os.path.abspath("01"))

from datasets import load_dataset
import ollama

import semantic_ranker, triple_ranker, triple_selector, effect_decider, causal_triple_generator
# Reload each local pipeline module, in the same order as they were imported,
# so the notebook works with freshly re-executed module code.
for _module in (
    semantic_ranker,
    triple_ranker,
    triple_selector,
    effect_decider,
    causal_triple_generator,
):
    importlib.reload(_module)

# Run configuration for the notebook; each value is consumed by a later cell.

# Passed as `model=` to generate_causal_triples (presumably an Ollama model
# tag — verify it is available locally).
MODEL = "gemma2:27b"
# Passed as `confidence_threshold=` to generate_causal_triples.
CONFIDENCE_THRESHOLD = 0.7
# Dataset split handed to load_dataset.
SPLIT = "validation"  # or 'train'
# Upper bound on how many examples the evaluation loop processes.
N_SAMPLES = 5
# Passed as `num_variations=` to rank_triples.
NUM_VARIATIONS = 10
# Passed as `top_m=` to rank_triples (the "Top-M average" of the ranking step).
TOP_M = 3
# Passed as `keep_fraction=` to select_triples.
KEEP_FRACTION = 0.5
# Passed as `backend=` to rank_triples.
BACKEND = "auto"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the configured split of the WIQA dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset
# repository's own loading script to run locally — confirm it is still
# required and that the source is trusted.
ds = load_dataset("allenai/wiqa", split=SPLIT, trust_remote_code=True)
# Number of examples in the loaded split (shown as the cell output).
len(ds)


6894

In [3]:
def get_question(ex):
    """Return the question text of a dataset example as a string.

    Several candidate field names are probed in a fixed order so the helper
    tolerates differing schemas. The first field that is present and truthy
    wins; if that value is a dict containing a ``'stem'`` entry, the stem is
    used instead of the whole dict.

    Args:
        ex: Mapping-like example supporting ``in`` and item access.

    Returns:
        ``str()`` of the selected value, or ``""`` when no candidate field
        holds a truthy value.
    """
    candidate_fields = ("question", "question_stem", "query", "what_if", "question_text")
    for field in candidate_fields:
        if field not in ex:
            continue
        value = ex[field]
        if not value:
            # Present but empty/None: fall through to the next candidate.
            continue
        if isinstance(value, dict) and "stem" in value:
            return str(value["stem"])
        return str(value)
    return ""

def get_label(ex):
    """Return the raw gold label of a dataset example, lower-cased and stripped.

    Candidate field names are tried in order; the first one that is present
    with a value other than ``None`` is used.

    Args:
        ex: Mapping-like example supporting ``in`` and item access.

    Returns:
        The value converted with ``str()``, stripped of surrounding
        whitespace and lower-cased, or ``None`` when no candidate field
        carries a non-``None`` value.
    """
    for field in ("answer_label", "label", "effect_label"):
        if field not in ex:
            continue
        raw = ex[field]
        if raw is None:
            continue
        return str(raw).strip().lower()
    return None

def normalize_label(lbl):
    """Map a raw label onto one of ``"more"``, ``"less"`` or ``"no_effect"``.

    The lookup is tolerant of letter case, surrounding or repeated
    whitespace, and of ``_`` / ``-`` being used in place of the space in
    "no effect", so the function gives the right answer even when the caller
    has not pre-cleaned the label.

    Args:
        lbl: Raw label; normally a string, may be ``None``.

    Returns:
        The canonical label string, or ``None`` when ``lbl`` is not a string
        or does not correspond to a known label.
    """
    if not isinstance(lbl, str):
        return None
    # Treat "_" and "-" like spaces, then collapse whitespace runs so that
    # "no_effect", "no-effect" and " No  Effect " all become "no effect".
    cleaned = " ".join(lbl.lower().replace("_", " ").replace("-", " ").split())
    canonical = {"no effect": "no_effect", "more": "more", "less": "less"}
    return canonical.get(cleaned)


In [4]:
from causal_triple_generator import generate_causal_triples
from triple_ranker import rank_triples
from triple_selector import select_triples
from effect_decider import decide_effect

def run_pipeline(question):
    """Run generation, ranking, selection and effect decision for one question.

    Args:
        question: Question text handed to every stage.

    Returns:
        A dict with the output of each stage under the keys ``"generated"``,
        ``"ranked"``, ``"selected"`` and ``"decision"``.
    """
    # The same fusion weights are supplied to both the selector and the decider.
    fusion_weights = {"weight_avg": 0.7, "weight_confidence": 0.3}

    generated = generate_causal_triples(
        question,
        model=MODEL,
        confidence_threshold=CONFIDENCE_THRESHOLD,
    )
    ranked = rank_triples(
        generated,
        question,
        num_variations=NUM_VARIATIONS,
        backend=BACKEND,
        top_m=TOP_M,
    )
    selected = select_triples(ranked, keep_fraction=KEEP_FRACTION, **fusion_weights)
    # NOTE(review): the decision is computed from the full ranked list, not
    # from `selected` — confirm this is intended.
    decision = decide_effect(question, ranked, target=None, **fusion_weights)

    return {
        "generated": generated,
        "ranked": ranked,
        "selected": selected,
        "decision": decision,
    }


In [5]:
# Run the pipeline on the first N_SAMPLES examples (or on the whole split when
# it is smaller) and keep one record per example that completed.
results = []
n_to_run = min(N_SAMPLES, len(ds))
for i in range(n_to_run):
    ex = ds[i]
    q = get_question(ex)
    gold = normalize_label(get_label(ex))
    try:
        out = run_pipeline(q)
        record = {"index": i, "question": q, "gold": gold}
        record.update(out)
        results.append(record)
    except Exception as e:
        # Best-effort run: report the failure and move on to the next example.
        print(f"Pipeline failed on index {i}: {e}")

# Compact gold-vs-decision overview, one entry per collected record.
summary = [
    {"i": rec["index"], "gold": rec["gold"], "decision": rec["decision"]["decision"]}
    for rec in results
]
print(json.dumps(summary, ensure_ascii=False, indent=2))


[
  {
    "i": 0,
    "gold": "more",
    "decision": "more"
  },
  {
    "i": 1,
    "gold": "more",
    "decision": "less"
  },
  {
    "i": 2,
    "gold": "no_effect",
    "decision": "more"
  },
  {
    "i": 3,
    "gold": "less",
    "decision": "less"
  },
  {
    "i": 4,
    "gold": "no_effect",
    "decision": "less"
  }
]


In [7]:
# Show the details of one collected sample. The inspected entry sits at
# position 1 (the second collected result), so the guard checks that this
# position exists instead of only checking that the list is non-empty;
# otherwise a run with a single successful sample would raise IndexError.
DETAIL_INDEX = 1
if len(results) > DETAIL_INDEX:
    r0 = results[DETAIL_INDEX]
    print("Q:", r0["question"])
    print("Gold:", r0["gold"], "Decision:", r0["decision"]["decision"])
    print("Top ranked triples (avg_score):")
    # Only the first five ranked entries are printed.
    for item in r0["ranked"][:5]:
        print(f"{item['avg_score']:.3f} {item['triple']} conf={item.get('confidence')}")


Q: suppose the female is sterile happens, how will it affect LESS rabbits.
Gold: more Decision: less
Top ranked triples (avg_score):
0.208 ('female sterility', 'reduces', 'fertility rate') conf=0.95
0.086 ('fertility rate', 'reduces', 'rabbit population') conf=0.85
