# Single Question Pipeline Demo (WIQA)

This notebook runs the full causal triple pipeline on a single question from the WIQA dataset,
showing step-by-step how triples evolve: generation (Ollama) → ranking (Top‑M average) → selection (confidence fusion) → decision (more/less/no_effect).


In [7]:
import sys, os, importlib, json
sys.path.append(os.path.abspath('01'))

from datasets import load_dataset
import ollama

import semantic_ranker, triple_ranker, triple_selector, effect_decider, causal_triple_generator
# Re-import the local pipeline modules so edits made on disk are picked up
# without restarting the kernel (same order as the import line above).
for _pipeline_module in (semantic_ranker, triple_ranker, triple_selector, effect_decider, causal_triple_generator):
    importlib.reload(_pipeline_module)

# Configuration
# --- which example to inspect ---
SPLIT = 'validation'  # or 'train'
INDEX = 2  # position of the example inside the chosen split

# --- triple generation ---
MODEL = 'gemma2:27b'  # model name handed to the generator and to the baseline call
CONFIDENCE_THRESHOLD = 0.7  # keep no_relation too if above threshold

# --- ranking ---
NUM_VARIATIONS = 10
TOP_M = 3  # Top-M mean per triple
BACKEND = 'auto'

# --- selection ---
KEEP_FRACTION = 0.5


In [8]:
# Load the configured WIQA split from the Hugging Face hub.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to run locally — keep it only for sources you trust.
ds = load_dataset(
    'allenai/wiqa',
    split=SPLIT,
    trust_remote_code=True,
)
len(ds)


6894

In [9]:
def get_question(ex):
    """Return the question text of an example as a string.

    The candidate field names are tried in a fixed order and the first one
    that is present with a truthy value wins. A dict value that carries a
    'stem' entry is unwrapped to that entry. Returns '' when no candidate
    field holds a usable value.
    """
    candidate_fields = ('question', 'question_stem', 'query', 'what_if', 'question_text')
    for field in candidate_fields:
        if field not in ex:
            continue
        value = ex[field]
        if not value:
            continue
        # Some schemas nest the text as {'stem': ...}.
        if isinstance(value, dict) and 'stem' in value:
            value = value['stem']
        return str(value)
    return ''

def get_label(ex):
    """Return the example's raw label, stripped and lower-cased, or None.

    Looks at 'answer_label', 'label' and 'effect_label' in that order and
    uses the first one whose value is not None.
    """
    for field in ('answer_label', 'label', 'effect_label'):
        if field not in ex:
            continue
        raw = ex[field]
        if raw is None:
            continue
        return str(raw).strip().lower()
    return None

def normalize_label(lbl):
    """Map a cleaned label string onto 'more' / 'less' / 'no_effect'.

    Only exact matches are recognised ('no effect' is accepted as a spelling
    of 'no_effect'); anything else yields None.
    """
    canonical = {name: name for name in ('no_effect', 'more', 'less')}
    canonical['no effect'] = 'no_effect'
    return canonical.get(lbl)

# Pull the configured example and show its question next to the normalized gold label.
ex = ds[INDEX]
question = get_question(ex)
gold = normalize_label(get_label(ex))
print(f'Question: {question}')
print(f'Gold label: {gold}')


Question: suppose there is no sunlight for the tree to grow happens, how will it affect LESS rain.
Gold label: no_effect


In [10]:
from causal_triple_generator import generate_causal_triples

# Stage 1: ask the model for entities and causal triples for this question.
gen = generate_causal_triples(
    question,
    model=MODEL,
    confidence_threshold=CONFIDENCE_THRESHOLD,
)
generated_triples = gen.get('triples', [])
print('Entities:', gen.get('entities'))
print('Generated triples (kept by confidence):')
print(json.dumps(generated_triples, ensure_ascii=False, indent=2))

# Count no_relation
nr = [
    triple
    for triple in generated_triples
    if str(triple.get('relation', '')).lower() == 'no_relation'
]
print(f'no_relation count: {len(nr)}')


Entities: ['sunlight', 'photosynthesis', 'tree growth', 'water uptake', 'rain']
Generated triples (kept by confidence):
[
  {
    "head": "sunlight",
    "relation": "enables",
    "tail": "photosynthesis",
    "description": "Sunlight provides the energy for trees to perform photosynthesis.",
    "confidence": 0.95
  },
  {
    "head": "photosynthesis",
    "relation": "produces",
    "tail": "tree growth",
    "description": "Photosynthesis produces sugars that are used by trees for growth.",
    "confidence": 0.9
  },
  {
    "head": "tree growth",
    "relation": "increases",
    "tail": "water uptake",
    "description": "Larger trees with more leaves transpire more water.",
    "confidence": 0.85
  },
  {
    "head": "water uptake",
    "relation": "produces",
    "tail": "rain",
    "description": "Increased transpiration releases water vapor into the atmosphere, contributing to cloud formation and rainfall.",
    "confidence": 0.75
  }
]
no_relation count: 0


In [11]:
from triple_ranker import rank_triples

# Stage 2: score every generated triple against the question.
ranked = rank_triples(
    gen,
    question,
    num_variations=NUM_VARIATIONS,
    backend=BACKEND,
    top_m=TOP_M,
)
print('Ranked triples (by Top-M average score):')
# Show at most the first ten entries, tab-separated: score, triple, confidence.
for entry in ranked[:10]:
    print(f"{entry['avg_score']:.3f}\t{entry['triple']}\tconf={entry.get('confidence')}")


Ranked triples (by Top-M average score):
0.268	('sunlight', 'enables', 'photosynthesis')	conf=0.95
0.211	('photosynthesis', 'produces', 'tree growth')	conf=0.9
0.176	('water uptake', 'produces', 'rain')	conf=0.75
0.157	('tree growth', 'increases', 'water uptake')	conf=0.85


In [12]:
from triple_selector import select_triples

# Stage 3: keep a fraction of the ranked triples using a weighted mix of
# the ranking score and the generator's confidence.
selected = select_triples(
    ranked,
    keep_fraction=KEEP_FRACTION,
    weight_avg=0.7,
    weight_confidence=0.3,
)
print('Selected triples (combined score):')
for chosen in selected:
    print(
        f"{chosen['combined_score']:.3f}\t{chosen['triple']}"
        f"\tavg={chosen['avg_score']:.3f}\tconf={chosen.get('confidence')}"
    )


Selected triples (combined score):
0.473	('sunlight', 'enables', 'photosynthesis')	avg=0.268	conf=0.95
0.417	('photosynthesis', 'produces', 'tree growth')	avg=0.211	conf=0.9


In [13]:
from effect_decider import decide_effect

# Stage 4: turn the scored triples into a more / less / no_effect decision.
# NOTE(review): the decider receives the full `ranked` list rather than
# `selected` — confirm this is intentional.
decision = decide_effect(
    question,
    ranked,
    target=None,
    weight_avg=0.7,
    weight_confidence=0.3,
)
print(f"Decision: {decision['decision']}")
print(json.dumps(decision, ensure_ascii=False, indent=2))


Decision: more
{
  "decision": "more",
  "pos_score": 1.6031803656712351,
  "neg_score": 0.0,
  "no_rel_score": 0.0,
  "used_target": null,
  "supports": {
    "positive": [
      {
        "triple": [
          "sunlight",
          "enables",
          "photosynthesis"
        ],
        "avg_score": 0.2681228570620558,
        "confidence": 0.95,
        "combined_score": 0.472685999943439
      },
      {
        "triple": [
          "photosynthesis",
          "produces",
          "tree growth"
        ],
        "avg_score": 0.21055919281697708,
        "confidence": 0.9,
        "combined_score": 0.41739143497188397
      },
      {
        "triple": [
          "tree growth",
          "increases",
          "water uptake"
        ],
        "avg_score": 0.1565451060690678,
        "confidence": 0.85,
        "combined_score": 0.3645815742483475
      }
    ],
    "negative": [],
    "neutral": []
  }
}


In [14]:
# Build a progression view of triples across stages
def triple_key(t):
    """Return a tuple of the string form of each element of `t`, usable as a dict key."""
    return tuple(str(part) for part in t)

# Index each stage's output by its (head, relation, tail) key.
gen_map = {
    triple_key((t.get('head'), t.get('relation'), t.get('tail'))): t
    for t in gen.get('triples', [])
}
rank_map = {triple_key(item['triple']): item for item in ranked}
sel_map = {triple_key(item['triple']): item for item in selected}

# One row per generated triple, carrying whatever each later stage recorded for it.
rows = [
    {
        'triple': key,
        'gen_confidence': generated.get('confidence'),
        'rank_avg_score': rank_map[key].get('avg_score') if key in rank_map else None,
        'selected': key in sel_map,
        'combined_score': sel_map[key].get('combined_score') if key in sel_map else None,
    }
    for key, generated in gen_map.items()
]


def _progression_order(row):
    """Sort key: selected rows first, then by descending combined/avg score."""
    score = row['combined_score'] or row['rank_avg_score'] or 0.0
    return (not row['selected'], -score)


# Sort by selected first, then combined_score / avg_score
rows.sort(key=_progression_order)
print(json.dumps(rows[:20], ensure_ascii=False, indent=2))


[
  {
    "triple": [
      "sunlight",
      "enables",
      "photosynthesis"
    ],
    "gen_confidence": 0.95,
    "rank_avg_score": 0.2681228570620558,
    "selected": true,
    "combined_score": 0.472685999943439
  },
  {
    "triple": [
      "photosynthesis",
      "produces",
      "tree growth"
    ],
    "gen_confidence": 0.9,
    "rank_avg_score": 0.21055919281697708,
    "selected": true,
    "combined_score": 0.41739143497188397
  },
  {
    "triple": [
      "water uptake",
      "produces",
      "rain"
    ],
    "gen_confidence": 0.75,
    "rank_avg_score": 0.17645908072509223,
    "selected": false,
    "combined_score": null
  },
  {
    "triple": [
      "tree growth",
      "increases",
      "water uptake"
    ],
    "gen_confidence": 0.85,
    "rank_avg_score": 0.1565451060690678,
    "selected": false,
    "combined_score": null
  }
]


In [15]:
# Baseline: Direct LLM Effect Prediction (Without Triple Generation)
# Asks the same model for the label directly, so the answer can be compared
# with the gold label without going through the triple pipeline.
print("\n" + "="*80)
print("BASELINE: Direct LLM Effect Prediction (No Triple Pipeline)")
print("="*80 + "\n")

baseline_prompt = f"""Based on the following question, answer question

Question: {question}

Return ONLY the label in one of these formats:
- more
- less
- no_effect
"""

response = ollama.generate(model=MODEL, prompt=baseline_prompt)
# `or ""` guards against a response field that is present but empty/None.
baseline_response = (response.get("response") or "").strip().lower()

# Parse and normalize the baseline prediction
baseline_label = normalize_label(baseline_response)
if baseline_label is None:
    # The reply was not exactly one of the labels. Fall back to whole-word
    # matching: punctuation/underscores become spaces so "no_effect." and
    # "No-effect" both read as the words "no effect", while words such as
    # "moreover" or "regardless" no longer count as "more" / "less".
    baseline_words = "".join(
        ch if ch.isalnum() else " " for ch in baseline_response
    ).split()
    baseline_phrase = " " + " ".join(baseline_words) + " "
    baseline_directions = {w for w in baseline_words if w in ("more", "less")}

    if " no effect " in baseline_phrase:
        # Checked first so "no effect, neither more nor less" is not read as "more".
        baseline_label = "no_effect"
    elif len(baseline_directions) == 1:
        baseline_label = baseline_directions.pop()
    else:
        # Nothing recognisable, or both directions mentioned: do not guess.
        baseline_label = "uncertain"

print(f"Baseline Prediction: {baseline_label}")
print(f"Gold Label:          {gold}")
print(f"Match: {'✓ CORRECT' if baseline_label == gold else '✗ INCORRECT'}")


BASELINE: Direct LLM Effect Prediction (No Triple Pipeline)

Baseline Prediction: less
Gold Label:          no_effect
Match: ✗ INCORRECT
