In [2]:
import sys, os, importlib, json
from tqdm import tqdm
import pandas as pd
from datetime import datetime

sys.path.append(os.path.abspath('01'))

from datasets import load_dataset
import ollama

import semantic_ranker, triple_ranker, triple_selector, effect_decider, ego_expansion_builder
importlib.reload(semantic_ranker)
importlib.reload(triple_ranker)
importlib.reload(triple_selector)
importlib.reload(effect_decider)
importlib.reload(ego_expansion_builder)
from ego_expansion_builder import EgoExpansionCausalBuilder

# Configuration
MODEL = 'gemma2:27b'
CONFIDENCE_THRESHOLD = 0.1
SPLIT = 'validation'
NUM_VARIATIONS = 10
TOP_M = 5
KEEP_FRACTION = 0.5
BACKEND = 'auto'

# EgoExpansionCausalBuilder settings
MAX_EXPANSION_DEPTH = 3
MAX_NEIGHBORS_PER_SEED = 6
MAX_RELATIONS_PER_ENTITY = 6

# Instantiate EgoExpansionCausalBuilder
BUILDER = EgoExpansionCausalBuilder(
    model_name=MODEL,
    max_neighbors_per_seed=MAX_NEIGHBORS_PER_SEED,
    max_expansion_depth=MAX_EXPANSION_DEPTH,
    max_relations_per_entity=MAX_RELATIONS_PER_ENTITY,
    ascii_logs=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load dataset
ds = load_dataset('allenai/wiqa', split=SPLIT, trust_remote_code=True)
ex = ds[1]

def get_question(ex):
    for key in ['question', 'question_stem', 'query', 'what_if', 'question_text']:
        if key in ex and ex[key]:
            q = ex[key]
            if isinstance(q, dict) and 'stem' in q:
                q = q['stem']
            return str(q)
    return ''

question = get_question(ex)

In [4]:
# Helper functions

def get_label(ex):
    for key in ['answer_label', 'label', 'effect_label']:
        if key in ex and ex[key] is not None:
            return str(ex[key]).strip().lower()
    return None

def normalize_label(lbl):
    if lbl is None:
        return None
    mapping = {
        'no effect': 'no_effect',
        'no_effect': 'no_effect',
        'more': 'more',
        'less': 'less'
    }
    return mapping.get(str(lbl).strip().lower(), None)

print("Helper functions defined.")

Helper functions defined.


In [5]:
# Method 1: Baseline - Direct LLM prediction
def predict_baseline(question, model=MODEL):
    """Direct LLM prediction without causal triple pipeline."""
    try:
        baseline_prompt = f"""Based on the following question, directly predict whether the effect is MORE, LESS, or NO_EFFECT.

Question: {question}

Consider:
1. What is the initial state/change mentioned?
2. What is the outcome/effect being asked about?
3. Does logic or common sense suggest the outcome increases (MORE), decreases (LESS), or stays the same (NO_EFFECT)?

Return ONLY the label in one of these formats:
- more
- less
- no_effect

Do NOT provide explanation, just the label:"""
        
        response = ollama.generate(model=model, prompt=baseline_prompt)
        baseline_response = response.get("response", "").strip().lower()
        
        # Parse and normalize
        baseline_label = normalize_label(baseline_response)
        if baseline_label is None:
            # Fallback pattern matching
            if "more" in baseline_response:
                baseline_label = "more"
            elif "less" in baseline_response:
                baseline_label = "less"
            elif "no" in baseline_response and "effect" in baseline_response:
                baseline_label = "no_effect"
            else:
                baseline_label = "uncertain"
        
        return baseline_label
    except Exception as e:
        print(f"Baseline prediction error: {e}")
        return "error"

print("Baseline prediction function defined.")

Baseline prediction function defined.


In [6]:
question

'suppose the female is sterile happens, how will it affect LESS rabbits.'

In [25]:
builder_result = BUILDER.build_causal_chain(question)

[13:36:24] 
[13:36:24] STEP 1: Extract Seed Entities
[13:36:27] 
LLM Response:
female
rabbits
[13:36:27] 
 Found 2 seeds: {'rabbits', 'female'}
[13:36:27] 
[13:36:27] EXPANSION DEPTH 1/3
[13:36:27] Expanding relations for: rabbits
[13:36:27] Context entities: rabbits, female
[13:36:36] LLM Response:
[
  {
    "head": "female",
    "relation": "produces",
    "tail": "eggs",
    "description": "Female rabbits produce eggs as part of their reproductive cycle.",
    "confidence": 0.95
  },
  {
    "head": "rabbits",
    "relation": "triggers",
    "tail": "mating_behavior",
    "description": "The presence of rabbits, particularly during mating season, triggers specific behaviors aimed at reproduction.",
    "confidence": 0.87
  },
  {
    "head": "mating_behavior",
    "relation": "increases"...
[13:36:36] Generated 3 relations (novel_entities=3)
[13:36:36]    Added: female --[produces]--> eggs
[13:36:36]      Description: Female rabbits produce eggs as part of their reproductive cycle.


In [26]:
BUILDER.extract_seeds(question)

[13:39:53] 
[13:39:53] STEP 1: Extract Seed Entities
[13:39:54] 
LLM Response:
female
rabbits
[13:39:54] 
 Found 2 seeds: {'rabbits', 'female'}


{'female', 'rabbits'}

In [27]:
BUILDER.expand_causal_relations(BUILDER.extract_seeds(question),BUILDER.extract_seeds(question),[])

[13:39:56] 
[13:39:56] STEP 1: Extract Seed Entities
[13:39:56] 
LLM Response:
female
rabbits
[13:39:56] 
 Found 2 seeds: {'rabbits', 'female'}
[13:39:56] 
[13:39:56] STEP 1: Extract Seed Entities
[13:39:57] 
LLM Response:
female
rabbits
[13:39:57] 
 Found 2 seeds: {'rabbits', 'female'}
[13:39:57] Expanding relations for: {'rabbits', 'female'}
[13:39:57] Context entities: rabbits, female
[13:40:03] LLM Response:
[
  {
    "head": "{'rabbits', 'female'}",
    "relation": "produces",
    "tail": "estrogen", 
    "description": "Female rabbits produce the hormone estrogen.",
    "confidence": 0.95
  },
  {
    "head": "estrogen",
    "relation": "increases",
    "tail": "uterine lining thickness",
    "description": "Estrogen promotes the growth and thickening of the uterine lining in preparation for pregnancy.",
    "confidence": 0.88
  },
  {
    "head": "{'rabbits', 'female'}",
    "relation": "triggers...
[13:40:12] Fallback LLM Response:
```json
[
  {
    "head": "{rabbits, female}",

[]

In [28]:
builder_result

{'seeds': {'female', 'rabbits'},
 'entities': {'available_energy',
  'body_weight',
  'clutch_size',
  'copulation_frequency',
  'courtship_displays',
  'egg_activation_signal',
  'egg_development_rate',
  'eggs',
  'energy_expenditure_during_mating',
  'energy_stores',
  'female',
  'female-hormone-release',
  'female-receptivity',
  'female-receptivity_hormone_levels',
  'female_receptivity',
  'female_receptivity_hormone_levels',
  'fertilization_rate',
  'follicle development rate',
  'follicle_development_rate_increase_signal',
  'follicle_stimulating_hormone_release',
  'food_consumption',
  'hormones',
  'increased_muscle_mass',
  'increased_uterine_blood_flow',
  'libido',
  'male hormone levels',
  'male-courtship-intensity',
  'male_courtship_intensity',
  'mating_behavior',
  'mating_behavior_frequency',
  'muscle_protein_synthesis_rate',
  'offspring_survival_rate',
  'pH_change_in_female_tract',
  'pair-bonding-strength',
  'pair_bond_formation_rate',
  'prostate_fluid_vol

In [7]:
# Import the question parser
import question_parser
importlib.reload(question_parser)
from question_parser import QuestionParser

# Initialize parser
parser = QuestionParser(model_name=MODEL)

# Parse the question structure
question_structure = parser.parse_question_structure(question)
print("Question Structure Analysis:")
print(json.dumps(question_structure, indent=2))

Question Structure Analysis:
{
  "is_meta_level": true,
  "intervention": "female is sterile happens",
  "target_phrase": "LESS rabbits",
  "target_direction": "less",
  "target_entity": "rabbits",
  "question_type": "meta"
}


In [9]:
# Complete pipeline with meta-level reasoning

def predict_with_meta_reasoning(question, builder, parser, model=MODEL):
    """
    Complete pipeline with meta-level reasoning.

    Steps:
    1. Parse question structure (detect meta-level)
    2. Build causal graph
    3. Decide effect from causal triples (simplified)
    4. If meta-level, map/invert according to rules
    """
    # Step 1: Parse Question Structure
    print("="*60)
    print("STEP 1: Parse Question Structure")
    print("="*60)
    question_structure = parser.parse_question_structure(question)
    print(f"Question type: {question_structure['question_type']}")
    print(f"Is meta-level: {question_structure['is_meta_level']}")
    if question_structure['is_meta_level']:
        print(f"Target phrase: {question_structure['target_phrase']}")
        print(f"Target direction: {question_structure['target_direction']}")
    print()

    # Step 2: Build Causal Graph
    print("="*60)
    print("STEP 2: Build Causal Graph")
    print("="*60)
    builder_result = builder.build_causal_chain(question)
    print()

    # Step 3: Get triples and decide (simplified; use your ranking + deciding pipeline in practice)
    # For demo, we infer directly from the causal chain

    # Use semantic ranking and effect deciding
    triples = builder.get_all_triples(builder_result, format="structured")

    # Here we should call triple_ranker and effect_decider
    # For demo, we simplify to directly analyze causal relations
    print("="*60)
    print("STEP 3: Analyze Causal Relations")
    print("="*60)

    # Simplified: count positive/negative relations
    from effect_decider import POSITIVE_RELS, NEGATIVE_RELS
    pos_count = sum(1 for t in triples if t['triple'][1] in POSITIVE_RELS)
    neg_count = sum(1 for t in triples if t['triple'][1] in NEGATIVE_RELS)

    if pos_count > neg_count:
        causal_decision = "more"
    elif neg_count > pos_count:
        causal_decision = "less"
    else:
        causal_decision = "no_effect"

    print(f"Causal chain decision: {causal_decision}")
    print(f"  Positive relations: {pos_count}")
    print(f"  Negative relations: {neg_count}")
    print()

    # Step 4: Meta-Level Reasoning
    print("="*60)
    print("STEP 4: Meta-Level Reasoning")
    print("="*60)
    should_invert, final_answer = parser.should_invert_answer(
        question_structure, causal_decision
    )

    print(f"Should invert answer: {should_invert}")
    print(f"Final answer: {final_answer}")
    print()

    # Step 5: Explanation
    print("="*60)
    print("STEP 5: Explanation")
    print("="*60)
    explanation = parser.explain_reasoning(
        question_structure, causal_decision, final_answer
    )
    print(explanation)

    return {
        'question_structure': question_structure,
        'causal_decision': causal_decision,
        'should_invert': should_invert,
        'final_answer': final_answer,
        'explanation': explanation,
        'builder_result': builder_result
    }


In [10]:
# Test different question types using full pipeline

test_questions = [
    {
        'question': 'suppose the seedling is not eaten happens, how will it affect LESS trees.',
        'ground_truth': 'less'
    },
    {
        'question': 'suppose less oil delivered happens, how will it affect more paper available.',
        'ground_truth': 'no_effect'
    },
    {
        'question': 'suppose you inhale more air from the outside happens, how will it affect there will be less oxygen in your blood.',
        'ground_truth': 'less'
    }
]

print("=" * 80)
print("TESTING FULL PIPELINE (predict_with_meta_reasoning)")
print("=" * 80)

for i, test in enumerate(test_questions, 1):
    question = test['question']
    expected = test['ground_truth']

    print(f"\n{'='*80}")
    print(f"TEST CASE {i}")
    print(f"{'='*80}")
    print(f"Question: {question}\n")

    result = predict_with_meta_reasoning(question, BUILDER, parser, MODEL)

    print(f"Final answer: {result['final_answer']}")
    print(f"Expected answer: {expected}")
    print("CORRECT" if result['final_answer'] == expected else "WRONG")

print(f"\n{'='*80}")


TESTING FULL PIPELINE (predict_with_meta_reasoning)

TEST CASE 1
Question: suppose the seedling is not eaten happens, how will it affect LESS trees.

STEP 1: Parse Question Structure
Question type: meta
Is meta-level: True
Target phrase: LESS trees
Target direction: less

STEP 2: Build Causal Graph
[16:45:27] 
[16:45:27] STEP 1: Extract Seed Entities
[16:45:27] 
LLM Response:
seedling
trees
[16:45:27] 
 Found 2 seeds: {'trees', 'seedling'}
[16:45:27] 
[16:45:27] EXPANSION DEPTH 1/3
[16:45:27] Expanding relations for: trees
[16:45:27] Context entities: trees, seedling
[16:45:36] LLM Response:
```json
[
  {
    "head": "trees",
    "relation": "produces",
    "tail": "seeds",
    "description": "Mature trees produce seeds through sexual reproduction.",
    "confidence": 0.95
  },
  {
    "head": "seeds",
    "relation": "causes",
    "tail": "seedling",
    "description": "Under favorable conditions, seeds germinate and grow into seedlings.",
    "confidence": 0.88
  },
  {
    "head": "

In [11]:
# Meta inversion fix + debug helpers
import types

def _fixed_should_invert(self, question_structure, causal_decision):
    """Corrected meta-level mapping:
    - direct: return causal_decision
    - meta + target=LESS Y:
        * causal=less -> invert to more (the 'LESS Y' phenomenon increases)
        * causal=more -> final less (the 'LESS Y' phenomenon decreases)
    - meta + target=MORE Y: no inversion; final equals causal
    - any no_effect: return no_effect
    """
    if not question_structure.get('is_meta_level'):
        return False, causal_decision

    tgt = question_structure.get('target_direction')
    if causal_decision == 'no_effect':
        return False, 'no_effect'

    if tgt == 'less':
        if causal_decision == 'less':
            return True, 'more'
        elif causal_decision == 'more':
            return False, 'less'
        else:
            return False, 'no_effect'

    if tgt == 'more':
        if causal_decision in ('more','less'):
            return False, causal_decision
        return False, 'no_effect'

    return False, causal_decision

def debug_meta_mapping(question_structure, causal_decision):
    tgt = question_structure.get('target_direction')
    rule = None
    if not question_structure.get('is_meta_level'):
        rule = 'direct: no inversion'
    elif causal_decision == 'no_effect':
        rule = 'no_effect short-circuit'
    elif tgt == 'less' and causal_decision == 'less':
        rule = 'LESS target + less causal => invert to MORE (phenomenon increases)'
    elif tgt == 'less' and causal_decision == 'more':
        rule = 'LESS target + more causal => final LESS (phenomenon decreases)'
    elif tgt == 'more':
        rule = 'MORE target => no inversion; final = causal'
    else:
        rule = 'fallback'
    inv, ans = _fixed_should_invert(parser, question_structure, causal_decision)
    return {
        'is_meta_level': question_structure.get('is_meta_level'),
        'target_direction': tgt,
        'causal_decision': causal_decision,
        'applied_rule': rule,
        'should_invert': inv,
        'final_answer': ans,
    }

# Monkey-patch the parser instance and class so downstream calls use the corrected logic
parser.should_invert_answer = types.MethodType(_fixed_should_invert, parser)
QuestionParser.should_invert_answer = _fixed_should_invert

print('Meta inversion logic patched: LESS-target special handling activated.')


Meta inversion logic patched: LESS-target special handling activated.


In [12]:
# Method 1: Meta-Informed LLM Decision
# 在 Question Structure Analysis 之后，把 intervention、target_phrase、target_direction 等传给 LLM
def predict_meta_informed_llm(question, parser, builder, model=MODEL):
    """
    Method 1: After parsing question structure, pass all meta information to LLM
    and ask it to directly output the final label (more/less/no_effect) with rationale.
    """
    print("="*60)
    print("METHOD 1: Meta-Informed LLM Decision")
    print("="*60)
    
    # Step 1: Parse question structure
    question_structure = parser.parse_question_structure(question)
    
    print("\nQuestion Structure:")
    print(json.dumps(question_structure, indent=2))
    
    # Step 2: Build causal context (get some triples for context)
    builder_result = builder.build_causal_chain(question)
    triples = builder.get_all_triples(builder_result, format="structured")
    
    # Extract top few triples as context
    top_triples = triples[:5] if len(triples) >= 5 else triples
    triple_context = "\n".join([
        f"  - {t['triple'][0]} {t['triple'][1]} {t['triple'][2]} (confidence: {t.get('confidence', 0):.2f})"
        for t in top_triples
    ])
    
    # Step 3: Construct meta-informed prompt
    meta_prompt = f"""You are analyzing a causal reasoning question. Here is the parsed structure:

Original Question: {question}

Question Analysis:
- Is meta-level question: {question_structure.get('is_meta_level')}
- Intervention: {question_structure.get('intervention')}
- Target phrase: {question_structure.get('target_phrase')}
- Target direction: {question_structure.get('target_direction')}
- Target entity: {question_structure.get('target_entity')}
- Question type: {question_structure.get('question_type')}

Relevant Causal Relations:
{triple_context if triple_context else "  (No causal relations found)"}

Task: Based on the question structure and causal relations, determine the FINAL ANSWER.

Key reasoning steps:
1. If this is a meta-level question (asking about "MORE X" or "LESS X"), you need to reason about the phenomenon itself:
   - "LESS rabbits" means the phenomenon of having fewer rabbits
   - If intervention causes rabbits to decrease, then "LESS rabbits" increases (answer: more)
   - If intervention causes rabbits to increase, then "LESS rabbits" decreases (answer: less)

2. If this is a direct question, just determine the direct causal effect.

3. Consider the causal relations provided as context.

Return your answer in JSON format:
{{
  "final_answer": "more|less|no_effect",
  "rationale": "Brief explanation of your reasoning (2-3 sentences)"
}}

IMPORTANT: Return ONLY valid JSON, nothing else."""

    try:
        response = ollama.generate(model=model, prompt=meta_prompt)
        llm_response = response.get("response", "").strip()
        
        print("\n" + "="*60)
        print("LLM Response:")
        print("="*60)
        print(llm_response)
        
        # Parse JSON response
        # Try to extract JSON from response
        import re
        json_match = re.search(r'\{[^}]+\}', llm_response, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            final_answer = normalize_label(result.get('final_answer', ''))
            rationale = result.get('rationale', '')
        else:
            # Fallback parsing
            final_answer = None
            rationale = llm_response
            for label in ['more', 'less', 'no_effect']:
                if label in llm_response.lower():
                    final_answer = label
                    break
        
        print("\n" + "="*60)
        print(f"Final Answer: {final_answer}")
        print(f"Rationale: {rationale}")
        print("="*60)
        
        return {
            'method': 'meta_informed_llm',
            'question_structure': question_structure,
            'final_answer': final_answer,
            'rationale': rationale,
            'raw_response': llm_response
        }
        
    except Exception as e:
        print(f"Error in meta-informed LLM prediction: {e}")
        return {
            'method': 'meta_informed_llm',
            'question_structure': question_structure,
            'final_answer': 'error',
            'rationale': str(e)
        }


# Method 2: Combined Context LLM Decision
# 直接把原问题和元推理信息合并，让 LLM 一步给出答案
def predict_combined_context_llm(question, parser, builder, model=MODEL):
    """
    Method 2: Merge original question with meta reasoning context,
    let LLM give answer in one shot with full explanation.
    """
    print("="*60)
    print("METHOD 2: Combined Context LLM Decision")
    print("="*60)
    
    # Step 1: Parse question structure
    question_structure = parser.parse_question_structure(question)
    
    # Step 2: Build causal context
    builder_result = builder.build_causal_chain(question)
    triples = builder.get_all_triples(builder_result, format="structured")
    
    # Extract causal chain summary
    from effect_decider import POSITIVE_RELS, NEGATIVE_RELS
    pos_rels = [t for t in triples if t['triple'][1] in POSITIVE_RELS]
    neg_rels = [t for t in triples if t['triple'][1] in NEGATIVE_RELS]
    
    pos_context = "\n".join([
        f"  - {t['triple'][0]} → {t['triple'][2]} ({t['triple'][1]})"
        for t in pos_rels[:3]
    ]) if pos_rels else "  (none)"
    
    neg_context = "\n".join([
        f"  - {t['triple'][0]} → {t['triple'][2]} ({t['triple'][1]})"
        for t in neg_rels[:3]
    ]) if neg_rels else "  (none)"
    
    # Construct combined prompt with full context
    combined_prompt = f"""You are a causal reasoning expert. Analyze the following question and determine the final answer.

ORIGINAL QUESTION:
{question}

CONTEXT INFORMATION:

1. Question Type: {"Meta-level (asking about MORE/LESS phenomenon)" if question_structure.get('is_meta_level') else "Direct causal question"}

2. Key Elements:
   - Intervention/Change: {question_structure.get('intervention', 'N/A')}
   - Target Being Asked About: {question_structure.get('target_phrase', 'N/A')}
   - Direction Mentioned: {question_structure.get('target_direction', 'N/A')}

3. Causal Relations Found:
   
   Positive/Increasing Relations (cause increase):
{pos_context}
   
   Negative/Decreasing Relations (cause decrease):
{neg_context}

REASONING GUIDE:

If this is a META-LEVEL question (asking about "LESS X" or "MORE X"):
- You are reasoning about the PHENOMENON itself, not the entity
- Example: "How will it affect LESS rabbits?"
  * You're asking about the phenomenon of "having fewer rabbits"
  * If intervention causes rabbits to decrease → "LESS rabbits" phenomenon INCREASES → answer: MORE
  * If intervention causes rabbits to increase → "LESS rabbits" phenomenon DECREASES → answer: LESS

If this is a DIRECT question:
- Simply determine if the intervention increases, decreases, or doesn't affect the target

TASK:
Provide your final answer with complete reasoning.

Return JSON format:
{{
  "final_answer": "more|less|no_effect",
  "rationale": "Complete step-by-step explanation of your reasoning (3-5 sentences)",
  "confidence": 0.0-1.0
}}

Return ONLY valid JSON."""

    try:
        response = ollama.generate(model=model, prompt=combined_prompt)
        llm_response = response.get("response", "").strip()
        
        print("\n" + "="*60)
        print("LLM Response:")
        print("="*60)
        print(llm_response)
        
        # Parse JSON response
        import re
        json_match = re.search(r'\{[^}]+\}', llm_response, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            final_answer = normalize_label(result.get('final_answer', ''))
            rationale = result.get('rationale', '')
            confidence = result.get('confidence', 0.0)
        else:
            # Fallback parsing
            final_answer = None
            rationale = llm_response
            confidence = 0.0
            for label in ['more', 'less', 'no_effect']:
                if label in llm_response.lower():
                    final_answer = label
                    break
        
        print("\n" + "="*60)
        print(f"Final Answer: {final_answer}")
        print(f"Confidence: {confidence}")
        print(f"Rationale: {rationale}")
        print("="*60)
        
        return {
            'method': 'combined_context_llm',
            'question_structure': question_structure,
            'final_answer': final_answer,
            'rationale': rationale,
            'confidence': confidence,
            'raw_response': llm_response
        }
        
    except Exception as e:
        print(f"Error in combined context LLM prediction: {e}")
        return {
            'method': 'combined_context_llm',
            'question_structure': question_structure,
            'final_answer': 'error',
            'rationale': str(e),
            'confidence': 0.0
        }


print("Two new LLM-direct methods defined:")
print("  1. predict_meta_informed_llm - Uses parsed meta info")
print("  2. predict_combined_context_llm - Uses full context with reasoning guide")

Two new LLM-direct methods defined:
  1. predict_meta_informed_llm - Uses parsed meta info
  2. predict_combined_context_llm - Uses full context with reasoning guide


In [14]:
# Test and compare all three methods

test_cases = [
    {
        'question': 'suppose the seedling is not eaten happens, how will it affect LESS trees?',
        'ground_truth': 'less',
        'description': 'Meta-level LESS question'
    },
    {
        'question': 'suppose less oil delivered happens, how will it affect more paper available?',
        'ground_truth': 'no_effect',
        'description': 'no_effect causal question'
    },
    {
        'question': 'ssuppose you inhale more air from the outside happens, how will it affect there will be less oxygen in your blood?',
        'ground_truth': 'less',
        'description': 'Meta-level MORE question'
    }
]

print("=" * 80)
print("COMPARING THREE METHODS")
print("=" * 80)

results_comparison = []

for i, test in enumerate(test_cases, 1):
    question = test['question']
    expected = test['ground_truth']
    description = test['description']
    
    print(f"\n{'='*80}")
    print(f"TEST CASE {i}: {description}")
    print(f"{'='*80}")
    print(f"Question: {question}")
    print(f"Expected: {expected}\n")
    
    # Method 1: Meta-Informed LLM
    print("\n" + "▶"*40)
    print("Testing Method 1: Meta-Informed LLM")
    print("▶"*40)
    result1 = predict_meta_informed_llm(question, parser, BUILDER, MODEL)
    
    # Method 2: Combined Context LLM
    print("\n" + "▶"*40)
    print("Testing Method 2: Combined Context LLM")
    print("▶"*40)
    result2 = predict_combined_context_llm(question, parser, BUILDER, MODEL)
    
    # Method 3: Original Pipeline with Meta Reasoning
    print("\n" + "▶"*40)
    print("Testing Method 3: Original Pipeline (for comparison)")
    print("▶"*40)
    result3 = predict_with_meta_reasoning(question, BUILDER, parser, MODEL)
    
    # Compare results
    comparison = {
        'question': question,
        'expected': expected,
        'description': description,
        'method1_answer': result1['final_answer'],
        'method1_correct': result1['final_answer'] == expected,
        'method2_answer': result2['final_answer'],
        'method2_correct': result2['final_answer'] == expected,
        'method3_answer': result3['final_answer'],
        'method3_correct': result3['final_answer'] == expected,
    }
    results_comparison.append(comparison)
    
    print("\n" + "="*80)
    print("COMPARISON SUMMARY")
    print("="*80)
    print(f"Expected Answer: {expected}")
    print(f"Method 1 (Meta-Informed LLM): {result1['final_answer']} {'✓' if comparison['method1_correct'] else '✗'}")
    print(f"Method 2 (Combined Context LLM): {result2['final_answer']} {'✓' if comparison['method2_correct'] else '✗'}")
    print(f"Method 3 (Original Pipeline): {result3['final_answer']} {'✓' if comparison['method3_correct'] else '✗'}")
    print("="*80)

# Final summary
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

df_comparison = pd.DataFrame(results_comparison)
print(df_comparison[['description', 'expected', 'method1_answer', 'method1_correct', 
                      'method2_answer', 'method2_correct', 'method3_answer', 'method3_correct']])

print("\n" + "="*80)
print("ACCURACY BY METHOD")
print("="*80)
print(f"Method 1 (Meta-Informed LLM): {df_comparison['method1_correct'].sum()}/{len(df_comparison)} = {df_comparison['method1_correct'].mean():.2%}")
print(f"Method 2 (Combined Context LLM): {df_comparison['method2_correct'].sum()}/{len(df_comparison)} = {df_comparison['method2_correct'].mean():.2%}")
print(f"Method 3 (Original Pipeline): {df_comparison['method3_correct'].sum()}/{len(df_comparison)} = {df_comparison['method3_correct'].mean():.2%}")
print("="*80)

COMPARING THREE METHODS

TEST CASE 1: Meta-level LESS question
Question: suppose the seedling is not eaten happens, how will it affect LESS trees?
Expected: less


▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶
Testing Method 1: Meta-Informed LLM
▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶
METHOD 1: Meta-Informed LLM Decision

Question Structure:
{
  "is_meta_level": true,
  "intervention": "the seedling is not eaten happens",
  "target_phrase": "LESS trees",
  "target_direction": "less",
  "target_entity": "trees",
  "question_type": "meta"
}
[17:11:11] 
[17:11:11] STEP 1: Extract Seed Entities
[17:11:12] 
LLM Response:
seedling
trees
[17:11:12] 
 Found 2 seeds: {'trees', 'seedling'}
[17:11:12] 
[17:11:12] EXPANSION DEPTH 1/3
[17:11:12] Expanding relations for: trees
[17:11:12] Context entities: trees, seedling
[17:11:20] LLM Response:
```json
[
  {
    "head": "trees",
    "relation": "produces",
    "tail": "seeds",
    "description": "Mature trees produce seeds through reproductive processe

In [None]:
# Quick test of Method 1 on the original example
print("="*80)
print("QUICK TEST: Method 1 - Meta-Informed LLM")
print("="*80)

test_question = 'suppose the female is sterile happens, how will it affect LESS rabbits.'
print(f"\nQuestion: {test_question}")
print(f"Expected: more\n")

result = predict_meta_informed_llm(test_question, parser, BUILDER, MODEL)

In [None]:
# Quick test of Method 2 on the original example
print("="*80)
print("QUICK TEST: Method 2 - Combined Context LLM")
print("="*80)

test_question = 'suppose the female is sterile happens, how will it affect LESS rabbits.'
print(f"\nQuestion: {test_question}")
print(f"Expected: more\n")

result = predict_combined_context_llm(test_question, parser, BUILDER, MODEL)