# Reflective Evaluation: WIQA (Meta + Self-Reflection)

This notebook runs reflective LLM predictors that:
- Parse question structure (meta or direct).
- Build a small causal context.
- Stage A: draft analysis (entity_effect + label_guess + rationale).
- Stage B: reflection with explicit meta inversion rules and evidence.
- Programmatic guardrail applies the meta inversion rule to enforce consistency.

It uses quiet predictors to keep logs clean during batch runs.

In [4]:
import os, json, random
import pandas as pd
from datetime import datetime

import importlib
import ollama

import question_parser, ego_expansion_builder
importlib.reload(question_parser)
importlib.reload(ego_expansion_builder)
from question_parser import QuestionParser
from ego_expansion_builder import EgoExpansionCausalBuilder

from llm_predictors_quiet import (
    predict_meta_informed_llm_reflective,
    predict_combined_context_llm_reflective,
)

# Model name and expansion limits used to configure the parser and builder.
MODEL = 'gemma2:27b'
MAX_EXPANSION_DEPTH = 2
MAX_NEIGHBORS_PER_SEED = 5
MAX_RELATIONS_PER_ENTITY = 5

# Both helpers are built with verbose=False (disables their log output).
PARSER = QuestionParser(verbose=False, model_name=MODEL)
BUILDER = EgoExpansionCausalBuilder(
    verbose=False,
    model_name=MODEL,
    max_expansion_depth=MAX_EXPANSION_DEPTH,
    max_neighbors_per_seed=MAX_NEIGHBORS_PER_SEED,
    max_relations_per_entity=MAX_RELATIONS_PER_ENTITY,
)
print('Config ready: quiet reflective predictors will be used.')

Config ready: quiet reflective predictors will be used.


In [5]:
# Load a local WIQA subset from wiqa_train_data.json (NDJSON).
def load_wiqa_local(path='wiqa_train_data.json', limit=10, seed=42):
    """Load, shuffle and truncate WIQA examples from an NDJSON file.

    Each line of *path* is parsed as one JSON document. A record is kept
    when it is a JSON object with a non-empty question (``question_stem``
    or ``question``) and a non-empty string label (``answer_label`` or
    ``label``). Labels are stripped and lower-cased. Lines that are blank,
    malformed, not JSON objects, or that lack a usable question/label are
    skipped.

    Args:
        path: Path of the UTF-8 encoded NDJSON file.
        limit: Maximum number of examples to return (applied after the
            shuffle).
        seed: Seed of the private ``random.Random`` used for shuffling, so
            the same file and seed always yield the same subset.

    Returns:
        A list of ``{'question': ..., 'gold': ...}`` dicts.
    """
    items = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Blank or malformed lines are expected in hand-edited
                # NDJSON files; skip them instead of aborting the load.
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object (list, number, ...) has
                # no fields to read.
                continue
            q = obj.get('question_stem') or obj.get('question') or ''
            lbl = obj.get('answer_label') or obj.get('label') or ''
            if not isinstance(lbl, str):
                # A non-string label cannot be normalised; drop the record.
                continue
            lbl = lbl.strip().lower()
            if q and lbl:
                items.append({'question': q, 'gold': lbl})
    # A dedicated Random instance keeps the shuffle reproducible without
    # touching the global random state.
    random.Random(seed).shuffle(items)
    return items[:limit]

# Draw a 20-example subset (default path and seed) and show how many were loaded.
SAMPLES = load_wiqa_local(limit=20)
len(SAMPLES)


20

In [None]:
def norm(lbl):
    """Map a raw WIQA label onto its canonical form.

    Matching ignores case and surrounding whitespace. ``'no effect'`` and
    ``'no_effect'`` both become ``'no_effect'``; ``'more'`` and ``'less'``
    map to themselves.

    Args:
        lbl: Raw label value, e.g. a dataset label or a predictor answer.

    Returns:
        ``'more'``, ``'less'`` or ``'no_effect'``, or ``None`` when *lbl* is
        empty, not a string, or not one of the recognised labels.
    """
    if not isinstance(lbl, str):
        # None, numbers, lists, ... cannot be labels; returning None here
        # avoids an AttributeError from calling .strip() on them.
        return None
    m = {
        'no effect': 'no_effect',
        'no_effect': 'no_effect',
        'more': 'more',
        'less': 'less',
    }
    return m.get(lbl.strip().lower())

# Create detailed log file
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = f'reflective_detailed_log_{ts}.txt'


def _text(value, default='N/A'):
    """Return *value* as a string, or *default* when it is None."""
    return default if value is None else str(value)


def _raw_part(result, key):
    """Return the raw response stored at result['raw'][key], or 'N/A'."""
    raw = result.get('raw')
    if not isinstance(raw, dict):
        return 'N/A'
    return _text(raw.get(key))


def _print_variant(title, result, pred, gold):
    """Print the concise console summary for one reflective predictor."""
    print(f"\n[{title}]")
    print(f"  Draft Label: {result.get('draft_label')}")
    # _text() guards against a rationale that is present but None, which
    # would otherwise fail on the slice.
    print(f"  Draft Rationale: {_text(result.get('rationale'))[:100]}...")
    print(f"  → Computed Label: {result.get('computed_label')}")
    print(f"  → Final Answer: {pred}")
    if result.get('corrected'):
        print(f"  ✓ CORRECTED (reason: {result.get('correction_source')})")
    print(f"  Result: {'✓ CORRECT' if pred == gold else '✗ WRONG'}")


def _log_variant(log, title, result, pred, gold):
    """Write the full per-predictor record, including raw responses, to *log*."""
    log.write(f"----- {title} -----\n")
    log.write(f"Draft Label: {result.get('draft_label')}\n")
    log.write(f"Draft Rationale: {result.get('rationale')}\n")
    log.write(f"Computed Label: {result.get('computed_label')}\n")
    log.write(f"Final Answer: {pred}\n")
    log.write(f"Corrected: {result.get('corrected')} ({result.get('correction_source')})\n")
    log.write(f"Correct: {pred == gold}\n\n")
    log.write("Raw Analysis Response:\n")
    log.write(_raw_part(result, 'analysis') + "\n\n")
    log.write("Raw Reflection Response:\n")
    log.write(_raw_part(result, 'reflection') + "\n\n")


rows = []
with open(log_file, 'w', encoding='utf-8') as log:
    log.write("="*80 + "\n")
    log.write("REFLECTIVE EVALUATION - DETAILED LOG\n")
    log.write("="*80 + "\n\n")

    for i, ex in enumerate(SAMPLES):
        q = ex['question']
        gold = norm(ex['gold'])
        if not gold:
            # Samples whose gold label is not more/less/no_effect cannot be
            # scored, so they are left out of the run entirely.
            continue

        print(f"\n{'='*80}")
        print(f"Sample {i+1}/{len(SAMPLES)}")
        print(f"{'='*80}")
        print(f"Question: {q}")
        print(f"Gold Label: {gold}")

        # Run both reflective variants quietly
        r1 = predict_meta_informed_llm_reflective(q, PARSER, BUILDER, MODEL)
        r2 = predict_combined_context_llm_reflective(q, PARSER, BUILDER, MODEL)

        pred1 = norm(r1.get('final_answer'))
        pred2 = norm(r2.get('final_answer'))

        # Print concise reflection summary to console
        _print_variant("Meta-Informed Reflective", r1, pred1, gold)
        _print_variant("Combined-Context Reflective", r2, pred2, gold)

        # Write detailed information to log file
        log.write(f"\n{'='*80}\n")
        log.write(f"Sample {i+1} - Index: {i}\n")
        log.write(f"{'='*80}\n")
        log.write(f"Question: {q}\n")
        log.write(f"Gold Label: {gold}\n\n")
        _log_variant(log, "Meta-Informed Reflective", r1, pred1, gold)
        _log_variant(log, "Combined-Context Reflective", r2, pred2, gold)

        rows.append({
            'index': i,
            'question': q,
            'gold': gold,
            'meta_reflective': pred1,
            'combined_reflective': pred2,
            'meta_correct': pred1 == gold,
            'combined_correct': pred2 == gold,
            'meta_corrected': r1.get('corrected'),
            'combined_corrected': r2.get('corrected'),
            'meta_correction_source': r1.get('correction_source'),
            'combined_correction_source': r2.get('correction_source'),
        })

df = pd.DataFrame(rows)
meta_acc = (df['meta_correct'].sum() / len(df)) * 100 if len(df) else 0.0
comb_acc = (df['combined_correct'].sum() / len(df)) * 100 if len(df) else 0.0

# Count corrections from `rows` rather than from `df`: an empty DataFrame has
# no columns, so indexing df["meta_corrected"] would raise KeyError when no
# sample was processed.
meta_fixes = sum(1 for row in rows if row['meta_corrected'])
comb_fixes = sum(1 for row in rows if row['combined_corrected'])

print(f"\n{'='*80}")
print("FINAL SUMMARY")
print(f"{'='*80}")
print(f'Processed {len(df)} samples.')
print(f'Meta-Reflective Accuracy:     {meta_acc:.2f}%')
print(f'Combined-Reflective Accuracy: {comb_acc:.2f}%')
print(f'\nMeta corrections made: {meta_fixes} / {len(df)}')
print(f'Combined corrections made: {comb_fixes} / {len(df)}')
print(f'\nDetailed log saved to: {log_file}')



Sample 1/20
Question: suppose getting a storm over the coast from the ocean happens, how will it affect MORE erosion by the ocean.
Gold Label: more


In [None]:
# Write the evaluation table to a freshly timestamped CSV, then preview up to
# five randomly chosen rows.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
out = 'reflective_results_{}.csv'.format(ts)
df.to_csv(out, index=False)
print('Saved:', out)
preview_size = min(len(df), 5)
df.sample(preview_size)


In [None]:
# Ad-hoc tests: three custom cases provided by user.
# Each case carries the question text, the expected (gold) label and a short
# human-readable description that is only used for display.
test_cases = [
    {
        'question': 'suppose the seedling is not eaten happens, how will it affect LESS trees?',
        'ground_truth': 'less',
        'description': 'Meta-level LESS question'
    },
    {
        'question': 'suppose less oil delivered happens, how will it affect more paper available?',
        'ground_truth': 'no_effect',
        'description': 'no_effect causal question'
    },
    {
        # The question previously began with the typo "ssuppose", which did
        # not match the "suppose ... happens" template of the other cases.
        # NOTE(review): the description says MORE although the outcome is
        # phrased as "less oxygen" — confirm the intended wording.
        'question': 'suppose you inhale more air from the outside happens, how will it affect there will be less oxygen in your blood?',
        'ground_truth': 'less',
        'description': 'Meta-level MORE question'
    }
]

def norm(lbl):
    """Map a raw WIQA label onto its canonical form.

    Matching ignores case and surrounding whitespace. ``'no effect'`` and
    ``'no_effect'`` both become ``'no_effect'``; ``'more'`` and ``'less'``
    map to themselves.

    Args:
        lbl: Raw label value, e.g. a ground-truth label or a predictor answer.

    Returns:
        ``'more'``, ``'less'`` or ``'no_effect'``, or ``None`` when *lbl* is
        empty, not a string, or not one of the recognised labels.
    """
    if not isinstance(lbl, str):
        # None, numbers, lists, ... cannot be labels; returning None here
        # avoids an AttributeError from calling .strip() on them.
        return None
    m = {
        'no effect': 'no_effect',
        'no_effect': 'no_effect',
        'more': 'more',
        'less': 'less',
    }
    return m.get(lbl.strip().lower())

def _evaluate_case(idx, case):
    """Run both reflective predictors on one ad-hoc case and build its row."""
    question = case['question']
    expected = norm(case['ground_truth'])
    # Meta-informed predictor first, then combined-context, as one call each.
    meta_result = predict_meta_informed_llm_reflective(question, PARSER, BUILDER, MODEL)
    combined_result = predict_combined_context_llm_reflective(question, PARSER, BUILDER, MODEL)
    meta_pred = norm(meta_result.get('final_answer'))
    combined_pred = norm(combined_result.get('final_answer'))
    return {
        'idx': idx,
        'description': case.get('description'),
        'question': question,
        'gold': expected,
        'meta_reflective': meta_pred,
        'combined_reflective': combined_pred,
        'meta_correct': meta_pred == expected,
        'combined_correct': combined_pred == expected,
        'meta_corrected': meta_result.get('corrected'),
        'combined_corrected': combined_result.get('corrected'),
    }


rows = []
for idx, case in enumerate(test_cases):
    rows.append(_evaluate_case(idx, case))

import pandas as pd
df_tc = pd.DataFrame(rows)
# Print a compact view without the question text, then display the full table.
summary_columns = [
    'idx',
    'description',
    'gold',
    'meta_reflective',
    'combined_reflective',
    'meta_correct',
    'combined_correct',
]
print(df_tc[summary_columns])
df_tc
