In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import re
import math
from typing import List, Tuple, Dict
from collections import defaultdict

class BeliefMeasurementPipeline:
    """
    A pipeline to measure belief alignment between a proposition and free-form text.
    Returns a score in [0,1] where 0=strongly disagrees, 0.5=neutral, 1=strongly agrees.
    """
    
    def __init__(self):
        # Agreement indicators with weights
        self.agreement_markers = {
            'strong_agree': {
                'patterns': [r'\b(absolutely|completely|totally|entirely|fully)\s+(agree|support|believe|endorse)',
                           r'\b(strongly|firmly)\s+(agree|support|believe|endorse)',
                           r'\b(without\s+doubt|undoubtedly|certainly|definitely)\b.*\b(true|correct|right)\b',
                           r'\bI\s+(completely|totally|absolutely|strongly)\s+(agree|support|believe)\b'],
                'weight': 0.9
            },
            'agree': {
                'patterns': [r'\b(agree|support|believe|endorse|accept|embrace)\b',
                           r'\b(yes|correct|true|right|accurate|valid)\b',
                           r'\bI\s+(think|believe|feel)\s+.*\b(true|correct|right)\b'],
                'weight': 0.7
            },
            'weak_agree': {
                'patterns': [r'\b(somewhat|partially|largely|mostly|generally)\s+(agree|support|believe)',
                           r'\b(tend\s+to|inclined\s+to)\s+(agree|believe|support)',
                           r'\b(probably|likely)\s+(true|correct|right)\b'],
                'weight': 0.6
            }
        }
        
        # Disagreement indicators
        self.disagreement_markers = {
            'strong_disagree': {
                'patterns': [r'\b(absolutely|completely|totally|entirely|strongly)\s+(disagree|oppose|reject)',
                           r'\b(never|not\s+at\s+all|in\s+no\s+way)\b',
                           r'\b(completely|utterly|totally)\s+(false|wrong|incorrect)',
                           r'\bI\s+(completely|totally|absolutely|strongly)\s+(disagree|oppose|reject)\b'],
                'weight': 0.1
            },
            'disagree': {
                'patterns': [r'\b(disagree|oppose|reject|deny|refute|dispute)\b',
                           r'\b(no|false|wrong|incorrect|invalid|untrue)\b',
                           r'\bI\s+(don\'?t|do\s+not)\s+(agree|believe|support|think)\b'],
                'weight': 0.3
            },
            'weak_disagree': {
                'patterns': [r'\b(somewhat|partially)\s+(disagree|oppose)',
                           r'\b(doubt|question|skeptical|uncertain)\b',
                           r'\b(probably|likely)\s+(false|wrong|incorrect)\b'],
                'weight': 0.4
            }
        }
        
        # Neutral indicators
        self.neutral_markers = {
            'patterns': [r'\b(neutral|undecided|unsure|uncertain|mixed\s+feelings)\b',
                        r'\b(both\s+sides|on\s+one\s+hand.*on\s+the\s+other)\b',
                        r'\b(depends|it\'?s\s+complicated|not\s+sure)\b'],
            'weight': 0.5
        }
        
    def handle_negations(self, text: str) -> str:
        """Handle negations that might flip the meaning."""
        # Simple negation patterns
        negation_patterns = [
            (r'\b(do\s+not|don\'t|does\s+not|doesn\'t|did\s+not|didn\'t)\s+', ''),
            (r'\b(not|never|no)\s+', ''),
            (r'\b(isn\'t|aren\'t|wasn\'t|weren\'t|won\'t|wouldn\'t|can\'t|couldn\'t)\b', ''),
        ]
        
        negation_count = 0
        processed_text = text.lower()
        
        for pattern, replacement in negation_patterns:
            matches = re.findall(pattern, processed_text, re.IGNORECASE)
            negation_count += len(matches)
            processed_text = re.sub(pattern, replacement, processed_text, flags=re.IGNORECASE)
        
        return processed_text, negation_count % 2 == 1  # Odd number of negations = flipped

    def extract_agreement_signals(self, text: str, proposition: str) -> Dict[str, float]:
        """Extract agreement/disagreement signals from text."""
        text_lower = text.lower()
        prop_lower = proposition.lower()
        
        # Handle negations
        processed_text, is_negated = self.handle_negations(text)
        
        # Combine text and proposition for context-aware matching
        combined_text = f"{prop_lower} {processed_text}"
        
        signals = defaultdict(list)
        
        # Check agreement markers
        for level, markers in self.agreement_markers.items():
            for pattern in markers['patterns']:
                matches = re.findall(pattern, combined_text, re.IGNORECASE)
                if matches:
                    weight = markers['weight']
                    if is_negated:
                        weight = 1.0 - weight  # Flip due to negation
                    signals['agreement'].extend([weight] * len(matches))
        
        # Check disagreement markers
        for level, markers in self.disagreement_markers.items():
            for pattern in markers['patterns']:
                matches = re.findall(pattern, combined_text, re.IGNORECASE)
                if matches:
                    weight = markers['weight']
                    if is_negated:
                        weight = 1.0 - weight  # Flip due to negation
                    signals['disagreement'].extend([weight] * len(matches))
        
        # Check neutral markers
        for pattern in self.neutral_markers['patterns']:
            matches = re.findall(pattern, combined_text, re.IGNORECASE)
            if matches:
                signals['neutral'].extend([self.neutral_markers['weight']] * len(matches))
        
        return dict(signals)
    
    def calculate_semantic_similarity(self, text: str, proposition: str) -> float:
        """
        Simple semantic similarity based on word overlap and key terms.
        In a production system, this would use embeddings.
        """
        # Tokenize and clean
        def tokenize(s):
            return set(re.findall(r'\b\w+\b', s.lower()))
        
        text_tokens = tokenize(text)
        prop_tokens = tokenize(proposition)
        
        if not prop_tokens:
            return 0.5
        
        # Calculate overlap
        intersection = text_tokens.intersection(prop_tokens)
        union = text_tokens.union(prop_tokens)
        
        if not union:
            return 0.5
        
        jaccard_similarity = len(intersection) / len(union)
        
        # Boost for exact proposition mentions
        prop_words = proposition.lower().split()
        text_lower = text.lower()
        
        exact_mentions = sum(1 for word in prop_words if word in text_lower)
        mention_boost = min(exact_mentions / len(prop_words), 0.3)
        
        return min(jaccard_similarity + mention_boost, 1.0)
    
    def analyze_context_sentiment(self, text: str, proposition: str) -> float:
        """
        Analyze sentiment in context of the proposition.
        Simple implementation - in production would use proper sentiment analysis.
        """
        text_lower = text.lower()
        
        # Positive sentiment words
        positive_words = ['good', 'great', 'excellent', 'wonderful', 'amazing', 'fantastic',
                         'beneficial', 'helpful', 'valuable', 'important', 'necessary',
                         'essential', 'vital', 'crucial', 'significant']
        
        # Negative sentiment words
        negative_words = ['bad', 'terrible', 'awful', 'horrible', 'wrong', 'harmful',
                         'dangerous', 'problematic', 'concerning', 'worrying', 'alarming',
                         'unnecessary', 'useless', 'pointless', 'wasteful']
        
        positive_count = sum(1 for word in positive_words if word in text_lower)
        negative_count = sum(1 for word in negative_words if word in text_lower)
        
        if positive_count == 0 and negative_count == 0:
            return 0.5
        
        total = positive_count + negative_count
        return positive_count / total
    
    def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
        """
        Main method to measure belief alignment.
        Returns belief score and detailed breakdown.
        """
        if not proposition.strip() or not text.strip():
            return 0.5, {'error': 'Empty proposition or text'}
        
        # Extract signals
        signals = self.extract_agreement_signals(text, proposition)
        
        # Calculate components
        semantic_sim = self.calculate_semantic_similarity(text, proposition)
        context_sentiment = self.analyze_context_sentiment(text, proposition)
        
        # Aggregate agreement/disagreement signals
        agreement_scores = signals.get('agreement', [])
        disagreement_scores = signals.get('disagreement', [])
        neutral_scores = signals.get('neutral', [])
        
        # Calculate weighted averages
        if agreement_scores:
            avg_agreement = sum(agreement_scores) / len(agreement_scores)
        else:
            avg_agreement = None
            
        if disagreement_scores:
            avg_disagreement = sum(disagreement_scores) / len(disagreement_scores)
        else:
            avg_disagreement = None
        
        # Combine signals with weights
        components = []
        weights = []
        
        # Primary signals (explicit agreement/disagreement)
        if avg_agreement is not None:
            components.append(avg_agreement)
            weights.append(0.5)  # High weight for explicit agreement
        
        if avg_disagreement is not None:
            components.append(avg_disagreement)
            weights.append(0.5)  # High weight for explicit disagreement
        
        # Secondary signals
        if semantic_sim > 0:
            components.append(0.5 + (semantic_sim - 0.5) * 0.3)  # Moderate influence
            weights.append(0.2)
        
        if context_sentiment != 0.5:
            components.append(context_sentiment)
            weights.append(0.2)
        
        # Handle neutral signals
        if neutral_scores:
            components.append(0.5)
            weights.append(0.3)
        
        # Default to neutral if no signals
        if not components:
            final_score = 0.5
        else:
            # Weighted average
            final_score = sum(c * w for c, w in zip(components, weights)) / sum(weights)
        
        # Ensure score is in [0, 1]
        final_score = max(0.0, min(1.0, final_score))
        
        # Prepare detailed breakdown
        breakdown = {
            'final_score': final_score,
            'semantic_similarity': semantic_sim,
            'context_sentiment': context_sentiment,
            'agreement_signals': len(agreement_scores),
            'disagreement_signals': len(disagreement_scores),
            'neutral_signals': len(neutral_scores),
            'avg_agreement': avg_agreement,
            'avg_disagreement': avg_disagreement,
            'components_used': len(components)
        }
        
        return final_score, breakdown


# Test the pipeline with diverse cases
def test_pipeline():
    pipeline = BeliefMeasurementPipeline()
    
    test_cases = [
        {
            'proposition': 'Climate change is a serious threat',
            'text': 'I absolutely agree that climate change poses a significant danger to our planet.',
            'expected_range': (0.8, 1.0)
        },
        {
            'proposition': 'Climate change is a serious threat',
            'text': 'Climate change is completely overblown and not a real problem at all.',
            'expected_range': (0.0, 0.3)
        },
        {
            'proposition': 'Exercise is beneficial for health',
            'text': 'I think exercise can be helpful but it depends on the person and situation.',
            'expected_range': (0.5, 0.7)
        },
        {
            'proposition': 'Cats are better than dogs',
            'text': 'I love both cats and dogs equally. Both make wonderful pets.',
            'expected_range': (0.4, 0.6)
        },
        {
            'proposition': 'Remote work increases productivity',
            'text': 'In my experience, working from home has made me much more efficient and productive.',
            'expected_range': (0.7, 0.9)
        }
    ]
    
    print("Testing Belief Measurement Pipeline")
    print("=" * 50)
    
    for i, case in enumerate(test_cases, 1):
        score, breakdown = pipeline.measure_belief(case['proposition'], case['text'])
        
        print(f"\nTest Case {i}:")
        print(f"Proposition: {case['proposition']}")
        print(f"Text: {case['text']}")
        print(f"Score: {score:.3f}")
        print(f"Expected Range: {case['expected_range']}")
        
        # Check if score is in expected range
        in_range = case['expected_range'][0] <= score <= case['expected_range'][1]
        print(f"✓ In Expected Range: {in_range}")
        
        print(f"Breakdown: {breakdown}")
        print("-" * 30)

# Additional test cases for more comprehensive evaluation
def extended_test():
    pipeline = BeliefMeasurementPipeline()
    
    extended_cases = [
        {
            'name': 'Strong Agreement with Reasoning',
            'proposition': 'Education is important for society',
            'text': 'I firmly believe education is crucial. It empowers individuals, drives innovation, and creates a more informed citizenry.',
            'expected_range': (0.8, 1.0)
        },
        {
            'name': 'Subtle Disagreement',
            'proposition': 'Social media is beneficial for mental health',
            'text': 'While social media connects people, I question whether it truly helps mental health. Studies suggest it might actually increase anxiety.',
            'expected_range': (0.2, 0.5)
        },
        {
            'name': 'Neutral with Complexity',
            'proposition': 'Artificial intelligence will replace most jobs',
            'text': 'AI will certainly change the job market. Some roles will disappear, others will emerge. The net effect is hard to predict.',
            'expected_range': (0.4, 0.6)
        },
        {
            'name': 'Strong Disagreement',
            'proposition': 'Vaccines are dangerous',
            'text': 'I completely disagree. Vaccines are safe and effective. This claim is utterly false and contradicts scientific evidence.',
            'expected_range': (0.0, 0.2)
        },
        {
            'name': 'No Clear Opinion',
            'proposition': 'Pizza is the best food',
            'text': 'I went to the store yesterday and bought some groceries. The weather was nice.',
            'expected_range': (0.4, 0.6)
        }
    ]
    
    print("\nExtended Test Cases")
    print("=" * 50)
    
    for case in extended_cases:
        score, breakdown = pipeline.measure_belief(case['proposition'], case['text'])
        
        print(f"\nCase: {case['name']}")
        print(f"Proposition: {case['proposition']}")
        print(f"Text: {case['text']}")
        print(f"Score: {score:.3f}")
        print(f"Expected: {case['expected_range']}")
        
        in_range = case['expected_range'][0] <= score <= case['expected_range'][1]
        print(f"✓ Result: {'PASS' if in_range else 'REVIEW'}")

# Simple CLI interface for interactive testing
def interactive_mode():
    """Interactive mode for testing the pipeline."""
    pipeline = BeliefMeasurementPipeline()
    print("Belief Measurement Pipeline - Interactive Mode")
    print("=" * 50)
    print("Enter 'quit' to exit\n")
    
    while True:
        proposition = input("Enter proposition: ").strip()
        if proposition.lower() == 'quit':
            break
            
        text = input("Enter text to analyze: ").strip()
        if text.lower() == 'quit':
            break
        
        if not proposition or not text:
            print("Both proposition and text are required.\n")
            continue
        
        score, breakdown = pipeline.measure_belief(proposition, text)
        
        print(f"\n--- Results ---")
        print(f"Belief Score: {score:.3f}")
        
        # Interpret score
        if score >= 0.8:
            interpretation = "Strong Agreement"
        elif score >= 0.6:
            interpretation = "Agreement"
        elif score >= 0.4:
            interpretation = "Neutral/Mixed"
        elif score >= 0.2:
            interpretation = "Disagreement"
        else:
            interpretation = "Strong Disagreement"
        
        print(f"Interpretation: {interpretation}")
        print(f"Agreement signals: {breakdown['agreement_signals']}")
        print(f"Disagreement signals: {breakdown['disagreement_signals']}")
        print(f"Semantic similarity: {breakdown['semantic_similarity']:.3f}")
        print("-" * 30)

if __name__ == "__main__":
    import sys
    
    if len(sys.argv) > 1 and sys.argv[1] == 'interactive':
        interactive_mode()
    else:
        test_pipeline()
        extended_test()

Testing Belief Measurement Pipeline

Test Case 1:
Proposition: Climate change is a serious threat
Text: I absolutely agree that climate change poses a significant danger to our planet.
Score: 0.795
Expected Range: (0.8, 1.0)
✓ In Expected Range: False
Breakdown: {'final_score': 0.7954629629629631, 'semantic_similarity': 0.4875, 'context_sentiment': 1.0, 'agreement_signals': 3, 'disagreement_signals': 0, 'neutral_signals': 0, 'avg_agreement': 0.8333333333333334, 'avg_disagreement': None, 'components_used': 3}
------------------------------

Test Case 2:
Proposition: Climate change is a serious threat
Text: Climate change is completely overblown and not a real problem at all.
Score: 0.526
Expected Range: (0.0, 0.3)
✓ In Expected Range: False
Breakdown: {'final_score': 0.5257142857142857, 'semantic_similarity': 0.5857142857142856, 'context_sentiment': 0.5, 'agreement_signals': 0, 'disagreement_signals': 0, 'neutral_signals': 0, 'avg_agreement': None, 'avg_disagreement': None, 'components_

In [4]:
# import re
# import math
# import json
# from typing import List, Tuple, Dict, Optional
# from collections import defaultdict
# from abc import ABC, abstractmethod

# # Abstract base class for different pipeline approaches
# class BeliefMeasurementStrategy(ABC):
#     @abstractmethod
#     def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
#         pass

# class PromptingBasedStrategy(BeliefMeasurementStrategy):
#     """
#     Prompting-based approach that uses structured prompts to evaluate belief alignment.
#     Simulates LLM reasoning through rule-based prompt interpretation.
#     """
    
#     def __init__(self):
#         self.prompt_templates = {
#             'direct_assessment': """
#             Analyze the following text to determine how much it agrees with the given proposition.
            
#             Proposition: "{proposition}"
#             Text: "{text}"
            
#             Consider:
#             1. Does the text explicitly agree or disagree with the proposition?
#             2. What is the strength of the agreement/disagreement?
#             3. Are there qualifying statements that moderate the position?
#             4. Is the text neutral or irrelevant to the proposition?
            
#             Rate agreement on scale 0-10 where:
#             0-1: Strong disagreement
#             2-3: Disagreement  
#             4-6: Neutral/Mixed
#             7-8: Agreement
#             9-10: Strong agreement
#             """,
            
#             'comparative_assessment': """
#             Compare how much the text supports vs opposes the proposition:
            
#             Proposition: "{proposition}"
#             Text: "{text}"
            
#             Evidence FOR the proposition in the text:
#             Evidence AGAINST the proposition in the text:
            
#             Overall assessment (0-10 scale):
#             """,
            
#             'reasoning_chain': """
#             Step-by-step analysis:
            
#             Proposition: "{proposition}"
#             Text: "{text}"
            
#             Step 1: What is the main claim in the text?
#             Step 2: How does this claim relate to the proposition?
#             Step 3: What supporting evidence is provided?
#             Step 4: Are there any contradictions or qualifications?
#             Step 5: Overall belief alignment (0-10):
#             """
#         }
    
#     def _simulate_llm_response(self, prompt: str, proposition: str, text: str) -> Dict:
#         """
#         Simulate LLM response using rule-based reasoning.
#         In production, this would call an actual LLM API.
#         """
#         text_lower = text.lower()
#         prop_lower = proposition.lower()
        
#         # Extract key phrases from proposition
#         prop_keywords = set(re.findall(r'\b\w+\b', prop_lower))
#         text_keywords = set(re.findall(r'\b\w+\b', text_lower))
        
#         keyword_overlap = len(prop_keywords.intersection(text_keywords)) / max(len(prop_keywords), 1)
        
#         # Analyze explicit stances
#         strong_positive = any(phrase in text_lower for phrase in [
#             'strongly agree', 'completely agree', 'absolutely', 'definitely true',
#             'without doubt', 'certainly', 'undoubtedly'
#         ])
        
#         positive = any(phrase in text_lower for phrase in [
#             'agree', 'support', 'believe', 'yes', 'correct', 'true', 'right'
#         ])
        
#         strong_negative = any(phrase in text_lower for phrase in [
#             'strongly disagree', 'completely wrong', 'absolutely not', 'definitely false',
#             'never', 'utterly false', 'completely disagree'
#         ])
        
#         negative = any(phrase in text_lower for phrase in [
#             'disagree', 'oppose', 'no', 'false', 'wrong', 'incorrect', 'dispute'
#         ])
        
#         neutral = any(phrase in text_lower for phrase in [
#             'neutral', 'mixed', 'depends', 'complicated', 'unsure', 'both sides'
#         ])
        
#         # Simulate reasoning
#         reasoning = {
#             'keyword_overlap': keyword_overlap,
#             'explicit_stance': None,
#             'confidence': 0.5,
#             'evidence_for': [],
#             'evidence_against': [],
#             'qualifications': []
#         }
        
#         # Determine stance
#         if strong_positive:
#             reasoning['explicit_stance'] = 'strong_agree'
#             reasoning['confidence'] = 0.9
#             reasoning['score'] = 9
#         elif positive:
#             reasoning['explicit_stance'] = 'agree'
#             reasoning['confidence'] = 0.7
#             reasoning['score'] = 7
#         elif strong_negative:
#             reasoning['explicit_stance'] = 'strong_disagree'
#             reasoning['confidence'] = 0.9
#             reasoning['score'] = 1
#         elif negative:
#             reasoning['explicit_stance'] = 'disagree'
#             reasoning['confidence'] = 0.7
#             reasoning['score'] = 3
#         elif neutral:
#             reasoning['explicit_stance'] = 'neutral'
#             reasoning['confidence'] = 0.6
#             reasoning['score'] = 5
#         else:
#             # Use keyword overlap and context
#             if keyword_overlap > 0.5:
#                 reasoning['score'] = 5 + int(keyword_overlap * 3)
#             else:
#                 reasoning['score'] = 5  # Default neutral
        
#         # Adjust based on keyword overlap
#         if reasoning.get('score'):
#             if keyword_overlap < 0.2:
#                 reasoning['score'] = max(4, min(6, reasoning['score']))  # Force toward neutral
        
#         return reasoning
    
#     def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
#         """Measure belief using prompting-based approach."""
#         if not proposition.strip() or not text.strip():
#             return 0.5, {'error': 'Empty proposition or text', 'method': 'prompting'}
        
#         # Use multiple prompt templates for robust assessment
#         results = []
#         detailed_analysis = {}
        
#         for template_name, template in self.prompt_templates.items():
#             prompt = template.format(proposition=proposition, text=text)
#             response = self._simulate_llm_response(prompt, proposition, text)
            
#             if 'score' in response:
#                 # Convert 0-10 scale to 0-1
#                 normalized_score = response['score'] / 10.0
#                 results.append(normalized_score)
#                 detailed_analysis[template_name] = response
        
#         # Ensemble the results
#         if results:
#             final_score = sum(results) / len(results)
#         else:
#             final_score = 0.5
        
#         # Ensure score is in [0, 1]
#         final_score = max(0.0, min(1.0, final_score))
        
#         breakdown = {
#             'final_score': final_score,
#             'method': 'prompting',
#             'individual_scores': results,
#             'detailed_analysis': detailed_analysis,
#             'num_prompts_used': len(results)
#         }
        
#         return final_score, breakdown


# class LogProbBasedStrategy(BeliefMeasurementStrategy):
#     """
#     LogProb-based approach that estimates belief by comparing log probabilities
#     of agreement vs disagreement continuations.
#     """
    
#     def __init__(self):
#         # Simulated vocabulary with log probabilities
#         # In production, this would use actual model logprobs
#         self.vocab_logprobs = self._build_simulated_vocab()
        
#         # Template continuations to test
#         self.agreement_templates = [
#             "This statement is {token}",
#             "I {token} with this",
#             "This is {token}",
#             "The text {token} the proposition"
#         ]
        
#         self.agreement_tokens = ['true', 'correct', 'right', 'accurate', 'agree', 'support']
#         self.disagreement_tokens = ['false', 'wrong', 'incorrect', 'disagree', 'oppose', 'dispute']
#         self.neutral_tokens = ['unclear', 'mixed', 'neutral', 'uncertain']
    
#     def _build_simulated_vocab(self) -> Dict[str, float]:
#         """Build simulated vocabulary with log probabilities."""
#         # Simulate log probabilities for common words
#         # In production, these would come from actual model
#         vocab = {
#             'true': -1.5, 'false': -1.8, 'correct': -2.1, 'wrong': -1.9,
#             'right': -1.7, 'incorrect': -2.3, 'accurate': -2.8,
#             'agree': -1.6, 'disagree': -2.0, 'support': -2.2, 'oppose': -2.5,
#             'dispute': -3.1, 'unclear': -2.7, 'mixed': -2.9, 'neutral': -2.4,
#             'uncertain': -2.8, 'definitely': -2.3, 'probably': -1.8,
#             'maybe': -2.1, 'never': -2.2, 'always': -2.4
#         }
#         return vocab
    
#     def _calculate_context_logprob(self, context: str, token: str) -> float:
#         """
#         Calculate log probability of token given context.
#         Simulates model behavior based on context analysis.
#         """
#         base_logprob = self.vocab_logprobs.get(token, -4.0)  # Default for unknown tokens
        
#         context_lower = context.lower()
        
#         # Boost probability if context supports the token
#         if token in ['true', 'correct', 'right', 'accurate', 'agree', 'support']:
#             if any(phrase in context_lower for phrase in ['i think', 'believe', 'support', 'agree']):
#                 base_logprob += 1.0
#             elif any(phrase in context_lower for phrase in ['disagree', 'oppose', 'wrong']):
#                 base_logprob -= 1.5
        
#         elif token in ['false', 'wrong', 'incorrect', 'disagree', 'oppose']:
#             if any(phrase in context_lower for phrase in ['disagree', 'oppose', 'wrong']):
#                 base_logprob += 1.0
#             elif any(phrase in context_lower for phrase in ['agree', 'support', 'correct']):
#                 base_logprob -= 1.5
        
#         # Context relevance boost
#         prop_words = set(re.findall(r'\b\w+\b', context_lower))
#         if token in prop_words:
#             base_logprob += 0.5
        
#         return base_logprob
    
#     def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
#         """Measure belief using log probability approach."""
#         if not proposition.strip() or not text.strip():
#             return 0.5, {'error': 'Empty proposition or text', 'method': 'logprob'}
        
#         # Create context by combining proposition and text
#         context = f"Proposition: {proposition}\nText: {text}\n"
        
#         # Calculate log probabilities for different token types
#         agreement_logprobs = []
#         disagreement_logprobs = []
#         neutral_logprobs = []
        
#         for template in self.agreement_templates:
#             for token in self.agreement_tokens:
#                 filled_template = template.format(token=token)
#                 full_context = context + filled_template
#                 logprob = self._calculate_context_logprob(full_context, token)
#                 agreement_logprobs.append(logprob)
            
#             for token in self.disagreement_tokens:
#                 filled_template = template.format(token=token)
#                 full_context = context + filled_template
#                 logprob = self._calculate_context_logprob(full_context, token)
#                 disagreement_logprobs.append(logprob)
            
#             for token in self.neutral_tokens:
#                 filled_template = template.format(token=token)
#                 full_context = context + filled_template
#                 logprob = self._calculate_context_logprob(full_context, token)
#                 neutral_logprobs.append(logprob)
        
#         # Calculate average log probabilities
#         avg_agreement = sum(agreement_logprobs) / len(agreement_logprobs) if agreement_logprobs else -4.0
#         avg_disagreement = sum(disagreement_logprobs) / len(disagreement_logprobs) if disagreement_logprobs else -4.0
#         avg_neutral = sum(neutral_logprobs) / len(neutral_logprobs) if neutral_logprobs else -4.0
        
#         # Convert to probabilities using softmax
#         logprobs = [avg_agreement, avg_disagreement, avg_neutral]
#         max_logprob = max(logprobs)
#         normalized_logprobs = [lp - max_logprob for lp in logprobs]
        
#         exp_probs = [math.exp(lp) for lp in normalized_logprobs]
#         sum_exp = sum(exp_probs)
        
#         prob_agreement = exp_probs[0] / sum_exp
#         prob_disagreement = exp_probs[1] / sum_exp
#         prob_neutral = exp_probs[2] / sum_exp
        
#         # Convert to belief score (0 = disagree, 0.5 = neutral, 1 = agree)
#         belief_score = prob_agreement + 0.5 * prob_neutral
        
#         breakdown = {
#             'final_score': belief_score,
#             'method': 'logprob',
#             'prob_agreement': prob_agreement,
#             'prob_disagreement': prob_disagreement,
#             'prob_neutral': prob_neutral,
#             'avg_agreement_logprob': avg_agreement,
#             'avg_disagreement_logprob': avg_disagreement,
#             'avg_neutral_logprob': avg_neutral,
#             'num_templates': len(self.agreement_templates)
#         }
        
#         return belief_score, breakdown

# class PatternBasedStrategy(BeliefMeasurementStrategy):
#     """
#     Original pattern-based approach using regex patterns and rule-based analysis.
#     A pipeline to measure belief alignment between a proposition and free-form text.
#     Returns a score in [0,1] where 0=strongly disagrees, 0.5=neutral, 1=strongly agrees.
#     """
    
#     def __init__(self):
#         # Agreement indicators with weights
#         self.agreement_markers = {
#             'strong_agree': {
#                 'patterns': [r'\b(absolutely|completely|totally|entirely|fully)\s+(agree|support|believe|endorse)',
#                            r'\b(strongly|firmly)\s+(agree|support|believe|endorse)',
#                            r'\b(without\s+doubt|undoubtedly|certainly|definitely)\b.*\b(true|correct|right)\b',
#                            r'\bI\s+(completely|totally|absolutely|strongly)\s+(agree|support|believe)\b'],
#                 'weight': 0.9
#             },
#             'agree': {
#                 'patterns': [r'\b(agree|support|believe|endorse|accept|embrace)\b',
#                            r'\b(yes|correct|true|right|accurate|valid)\b',
#                            r'\bI\s+(think|believe|feel)\s+.*\b(true|correct|right)\b'],
#                 'weight': 0.7
#             },
#             'weak_agree': {
#                 'patterns': [r'\b(somewhat|partially|largely|mostly|generally)\s+(agree|support|believe)',
#                            r'\b(tend\s+to|inclined\s+to)\s+(agree|believe|support)',
#                            r'\b(probably|likely)\s+(true|correct|right)\b'],
#                 'weight': 0.6
#             }
#         }
        
#         # Disagreement indicators
#         self.disagreement_markers = {
#             'strong_disagree': {
#                 'patterns': [r'\b(absolutely|completely|totally|entirely|strongly)\s+(disagree|oppose|reject)',
#                            r'\b(never|not\s+at\s+all|in\s+no\s+way)\b',
#                            r'\b(completely|utterly|totally)\s+(false|wrong|incorrect)',
#                            r'\bI\s+(completely|totally|absolutely|strongly)\s+(disagree|oppose|reject)\b'],
#                 'weight': 0.1
#             },
#             'disagree': {
#                 'patterns': [r'\b(disagree|oppose|reject|deny|refute|dispute)\b',
#                            r'\b(no|false|wrong|incorrect|invalid|untrue)\b',
#                            r'\bI\s+(don\'?t|do\s+not)\s+(agree|believe|support|think)\b'],
#                 'weight': 0.3
#             },
#             'weak_disagree': {
#                 'patterns': [r'\b(somewhat|partially)\s+(disagree|oppose)',
#                            r'\b(doubt|question|skeptical|uncertain)\b',
#                            r'\b(probably|likely)\s+(false|wrong|incorrect)\b'],
#                 'weight': 0.4
#             }
#         }
        
#         # Neutral indicators
#         self.neutral_markers = {
#             'patterns': [r'\b(neutral|undecided|unsure|uncertain|mixed\s+feelings)\b',
#                         r'\b(both\s+sides|on\s+one\s+hand.*on\s+the\s+other)\b',
#                         r'\b(depends|it\'?s\s+complicated|not\s+sure)\b'],
#             'weight': 0.5
#         }
        
#     def handle_negations(self, text: str) -> Tuple[str, bool]:
#         """Handle negations that might flip the meaning."""
#         # Simple negation patterns
#         negation_patterns = [
#             (r'\b(do\s+not|don\'t|does\s+not|doesn\'t|did\s+not|didn\'t)\s+', ''),
#             (r'\b(not|never|no)\s+', ''),
#             (r'\b(isn\'t|aren\'t|wasn\'t|weren\'t|won\'t|wouldn\'t|can\'t|couldn\'t)\b', ''),
#         ]
        
#         negation_count = 0
#         processed_text = text.lower()
        
#         for pattern, replacement in negation_patterns:
#             matches = re.findall(pattern, processed_text, re.IGNORECASE)
#             negation_count += len(matches)
#             processed_text = re.sub(pattern, replacement, processed_text, flags=re.IGNORECASE)
        
#         return processed_text, negation_count % 2 == 1  # Odd number of negations = flipped

#     def extract_agreement_signals(self, text: str, proposition: str) -> Dict[str, List[float]]:
#         """Extract agreement/disagreement signals from text."""
#         text_lower = text.lower()
#         prop_lower = proposition.lower()
        
#         # Handle negations
#         processed_text, is_negated = self.handle_negations(text)
        
#         # Combine text and proposition for context-aware matching
#         combined_text = f"{prop_lower} {processed_text}"
        
#         signals = defaultdict(list)
        
#         # Check agreement markers
#         for level, markers in self.agreement_markers.items():
#             for pattern in markers['patterns']:
#                 matches = re.findall(pattern, combined_text, re.IGNORECASE)
#                 if matches:
#                     weight = markers['weight']
#                     if is_negated:
#                         weight = 1.0 - weight  # Flip due to negation
#                     signals['agreement'].extend([weight] * len(matches))
        
#         # Check disagreement markers
#         for level, markers in self.disagreement_markers.items():
#             for pattern in markers['patterns']:
#                 matches = re.findall(pattern, combined_text, re.IGNORECASE)
#                 if matches:
#                     weight = markers['weight']
#                     if is_negated:
#                         weight = 1.0 - weight  # Flip due to negation
#                     signals['disagreement'].extend([weight] * len(matches))
        
#         # Check neutral markers
#         for pattern in self.neutral_markers['patterns']:
#             matches = re.findall(pattern, combined_text, re.IGNORECASE)
#             if matches:
#                 signals['neutral'].extend([self.neutral_markers['weight']] * len(matches))
        
#         return signals

#     def calculate_semantic_similarity(self, text: str, proposition: str) -> float:
#         """
#         Simple semantic similarity based on word overlap and key terms.
#         In a production system, this would use embeddings.
#         """
#         # Tokenize and clean
#         def tokenize(s):
#             return set(re.findall(r'\b\w+\b', s.lower()))
        
#         text_tokens = tokenize(text)
#         prop_tokens = tokenize(proposition)
        
#         if not prop_tokens:
#             return 0.5
        
#         # Calculate overlap
#         intersection = text_tokens.intersection(prop_tokens)
#         union = text_tokens.union(prop_tokens)
        
#         if not union:
#             return 0.5
        
#         jaccard_similarity = len(intersection) / len(union)
        
#         # Boost for exact proposition mentions
#         prop_words = proposition.lower().split()
#         text_lower = text.lower()
        
#         exact_mentions = sum(1 for word in prop_words if word in text_lower)
#         mention_boost = min(exact_mentions / len(prop_words), 0.3)
        
#         return min(jaccard_similarity + mention_boost, 1.0)
    
#     def analyze_context_sentiment(self, text: str, proposition: str) -> float:
#         """
#         Analyze sentiment in context of the proposition.
#         Simple implementation - in production would use proper sentiment analysis.
#         """
#         text_lower = text.lower()
        
#         # Positive sentiment words
#         positive_words = ['good', 'great', 'excellent', 'wonderful', 'amazing', 'fantastic',
#                          'beneficial', 'helpful', 'valuable', 'important', 'necessary',
#                          'essential', 'vital', 'crucial', 'significant']
        
#         # Negative sentiment words
#         negative_words = ['bad', 'terrible', 'awful', 'horrible', 'wrong', 'harmful',
#                          'dangerous', 'problematic', 'concerning', 'worrying', 'alarming',
#                          'unnecessary', 'useless', 'pointless', 'wasteful']
        
#         positive_count = sum(1 for word in positive_words if word in text_lower)
#         negative_count = sum(1 for word in negative_words if word in text_lower)
        
#         if positive_count == 0 and negative_count == 0:
#             return 0.5
        
#         total = positive_count + negative_count
#         return positive_count / total
    
#     def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
#         """
#         Main method to measure belief alignment.
#         Returns belief score and detailed breakdown.
#         """
#         if not proposition.strip() or not text.strip():
#             return 0.5, {'error': 'Empty proposition or text', 'method': 'pattern'}
        
#         # Extract signals
#         signals = self.extract_agreement_signals(text, proposition)
        
#         # Calculate components
#         semantic_sim = self.calculate_semantic_similarity(text, proposition)
#         context_sentiment = self.analyze_context_sentiment(text, proposition)
        
#         # Aggregate agreement/disagreement signals
#         agreement_scores = signals.get('agreement', [])
#         disagreement_scores = signals.get('disagreement', [])
#         neutral_scores = signals.get('neutral', [])
        
#         # Calculate weighted averages
#         if agreement_scores:
#             avg_agreement = sum(agreement_scores) / len(agreement_scores)
#         else:
#             avg_agreement = None
            
#         if disagreement_scores:
#             avg_disagreement = sum(disagreement_scores) / len(disagreement_scores)
#         else:
#             avg_disagreement = None
        
#         # Combine signals with weights
#         components = []
#         weights = []
        
#         # Primary signals (explicit agreement/disagreement)
#         if avg_agreement is not None:
#             components.append(avg_agreement)
#             weights.append(0.5)  # High weight for explicit agreement
        
#         if avg_disagreement is not None:
#             components.append(avg_disagreement)
#             weights.append(0.5)  # High weight for explicit disagreement
        
#         # Secondary signals
#         if semantic_sim > 0:
#             components.append(0.5 + (semantic_sim - 0.5) * 0.3)  # Moderate influence
#             weights.append(0.2)
        
#         if context_sentiment != 0.5:
#             components.append(context_sentiment)
#             weights.append(0.2)
        
#         # Handle neutral signals
#         if neutral_scores:
#             components.append(0.5)
#             weights.append(0.3)
        
#         # Default to neutral if no signals
#         if not components:
#             final_score = 0.5
#         else:
#             # Weighted average
#             final_score = sum(c * w for c, w in zip(components, weights)) / sum(weights)
        
#         # Ensure score is in [0, 1]
#         final_score = max(0.0, min(1.0, final_score))
        
#         # Prepare detailed breakdown
#         breakdown = {
#             'final_score': final_score,
#             'method': 'pattern',
#             'semantic_similarity': semantic_sim,
#             'context_sentiment': context_sentiment,
#             'agreement_signals': len(agreement_scores),
#             'disagreement_signals': len(disagreement_scores),
#             'neutral_signals': len(neutral_scores),
#             'avg_agreement': avg_agreement,
#             'avg_disagreement': avg_disagreement,
#             'components_used': len(components)
#         }
        
#         return final_score, breakdown


# class BeliefMeasurementPipeline:
#     """
#     Unified pipeline supporting multiple belief measurement strategies:
#     - Pattern-based (regex + rules)
#     - Prompting-based (structured prompts)  
#     - LogProb-based (probability comparison)
#     - Ensemble (combination of multiple strategies)
#     """
    
#     def __init__(self, strategy: str = 'ensemble'):
#         """
#         Initialize pipeline with specified strategy.
        
#         Args:
#             strategy: 'pattern', 'prompting', 'logprob', or 'ensemble'
#         """
#         self.strategies = {
#             'pattern': PatternBasedStrategy(),
#             'prompting': PromptingBasedStrategy(), 
#             'logprob': LogProbBasedStrategy()
#         }
        
#         self.strategy_name = strategy
#         if strategy == 'ensemble':
#             self.active_strategies = list(self.strategies.values())
#         else:
#             self.active_strategies = [self.strategies[strategy]]
    
#     def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
#         """
#         Measure belief using the configured strategy/strategies.
#         """
#         if not proposition.strip() or not text.strip():
#             return 0.5, {'error': 'Empty proposition or text'}
        
#         if self.strategy_name == 'ensemble':
#             return self._ensemble_measure(proposition, text)
#         else:
#             return self.active_strategies[0].measure_belief(proposition, text)
    
#     def _ensemble_measure(self, proposition: str, text: str) -> Tuple[float, Dict]:
#         """Combine multiple strategies using ensemble approach."""
#         results = []
#         detailed_results = {}
        
#         # Get results from each strategy
#         for name, strategy in self.strategies.items():
#             try:
#                 score, breakdown = strategy.measure_belief(proposition, text)
#                 results.append(score)
#                 detailed_results[name] = {
#                     'score': score,
#                     'breakdown': breakdown
#                 }
#             except Exception as e:
#                 detailed_results[name] = {
#                     'error': str(e)
#                 }
        
#         if not results:
#             return 0.5, {'error': 'All strategies failed'}
        
#         # Weighted ensemble (can be adjusted based on strategy reliability)
#         weights = {
#             'pattern': 0.4,    # Good for explicit statements
#             'prompting': 0.4,  # Good for nuanced reasoning
#             'logprob': 0.2     # Good for implicit signals
#         }
        
#         weighted_score = 0
#         total_weight = 0
        
#         for i, (name, strategy) in enumerate(self.strategies.items()):
#             if name in detailed_results and 'score' in detailed_results[name]:
#                 weight = weights.get(name, 1.0 / len(self.strategies))
#                 weighted_score += detailed_results[name]['score'] * weight
#                 total_weight += weight
        
#         final_score = weighted_score / total_weight if total_weight > 0 else 0.5
        
#         # Calculate agreement between strategies (confidence measure)
#         if len(results) > 1:
#             mean_score = sum(results) / len(results)
#             variance = sum((s - mean_score) ** 2 for s in results) / len(results)
#             agreement = 1.0 - min(variance * 4, 1.0)  # Higher agreement = lower variance
#         else:
#             agreement = 1.0
        
#         breakdown = {
#             'final_score': final_score,
#             'method': 'ensemble',
#             'strategy_results': detailed_results,
#             'individual_scores': results,
#             'agreement_score': agreement,
#             'weights_used': weights
#         }
        
#         return final_score, breakdown


# def interactive_mode():
#     """Interactive mode for testing the pipeline."""
#     print("Belief Measurement Pipeline - Interactive Mode")
#     print("=" * 50)
#     print("Available strategies: pattern, prompting, logprob, ensemble")
#     print("Type 'quit' to exit")
#     print()
    
#     while True:
#         strategy = input("Select strategy (default: ensemble): ").strip()
#         if strategy.lower() == 'quit':
#             break
#         if not strategy:
#             strategy = 'ensemble'
        
#         if strategy not in ['pattern', 'prompting', 'logprob', 'ensemble']:
#             print("Invalid strategy. Please choose from: pattern, prompting, logprob, ensemble")
#             continue
        
#         pipeline = BeliefMeasurementPipeline(strategy=strategy)
        
#         print(f"\nUsing {strategy} strategy")
#         proposition = input("Enter proposition: ").strip()
#         if proposition.lower() == 'quit':
#             break
        
#         text = input("Enter text: ").strip()
#         if text.lower() == 'quit':
#             break
        
#         if proposition and text:
#             score, breakdown = pipeline.measure_belief(proposition, text)
#             print(f"\nBelief Score: {score:.3f}")
#             print(f"Method: {breakdown.get('method', 'unknown')}")
            
#             if strategy == 'ensemble':
#                 print("\nStrategy Breakdown:")
#                 for strat_name, result in breakdown.get('strategy_results', {}).items():
#                     if 'score' in result:
#                         print(f"  {strat_name}: {result['score']:.3f}")
#                 print(f"Agreement Score: {breakdown.get('agreement_score', 0):.3f}")
            
#             print("-" * 30)
#         else:
#             print("Please provide both proposition and text.")


# # Test all strategies
# def comprehensive_test():
#     """Test all strategies on the same test cases."""
#     test_cases = [
#         {
#             'name': 'Strong Agreement',
#             'proposition': 'Climate change is a serious threat',
#             'text': 'I absolutely agree that climate change poses a significant danger to our planet and requires immediate action.',
#             'expected_range': (0.8, 1.0)
#         },
#         {
#             'name': 'Strong Disagreement', 
#             'proposition': 'Vaccines are dangerous',
#             'text': 'I completely disagree. Vaccines are safe, effective, and this claim is utterly false according to scientific evidence.',
#             'expected_range': (0.0, 0.2)
#         },
#         {
#             'name': 'Qualified Agreement',
#             'proposition': 'Remote work increases productivity',
#             'text': 'I think remote work can increase productivity for some people, but it depends on the individual and the type of work.',
#             'expected_range': (0.5, 0.8)
#         },
#         {
#             'name': 'Neutral/Irrelevant',
#             'proposition': 'Pizza is the best food',
#             'text': 'I went to the store yesterday and bought some groceries. The weather was nice and I saw a dog.',
#             'expected_range': (0.4, 0.6)
#         },
#         {
#             'name': 'Subtle Disagreement',
#             'proposition': 'Social media improves mental health',
#             'text': 'While social media connects people, research suggests it might actually increase anxiety and depression rather than helping.',
#             'expected_range': (0.2, 0.5)
#         }
#     ]
    
#     strategies = ['pattern', 'prompting', 'logprob', 'ensemble']
    
#     print("Comprehensive Strategy Comparison")
#     print("=" * 60)
    
#     for case in test_cases:
#         print(f"\nTest Case: {case['name']}")
#         print(f"Proposition: {case['proposition']}")
#         print(f"Text: {case['text']}")
#         print(f"Expected Range: {case['expected_range']}")
#         print("-" * 40)
        
#         for strategy in strategies:
#             pipeline = BeliefMeasurementPipeline(strategy=strategy)
#             score, breakdown = pipeline.measure_belief(case['proposition'], case['text'])
            
#             in_range = case['expected_range'][0] <= score <= case['expected_range'][1]
#             status = "✓ PASS" if in_range else "✗ REVIEW"
            
#             print(f"{strategy:>12}: {score:.3f} {status}")
            
#             # Show key insights for ensemble
#             if strategy == 'ensemble' and 'strategy_results' in breakdown:
#                 individual_scores = []
#                 for strat_name, result in breakdown['strategy_results'].items():
#                     if 'score' in result:
#                         individual_scores.append(f"{strat_name}: {result['score']:.3f}")
#                 print(f"            Individual: {', '.join(individual_scores)}")
#                 print(f"            Agreement: {breakdown.get('agreement_score', 0):.3f}")
        
#         print()


# def strategy_analysis():
#     """Analyze strengths and weaknesses of each strategy."""
#     pipeline_pattern = BeliefMeasurementPipeline('pattern')
#     pipeline_prompting = BeliefMeasurementPipeline('prompting') 
#     pipeline_logprob = BeliefMeasurementPipeline('logprob')
    
#     analysis_cases = [
#         {
#             'name': 'Explicit Agreement',
#             'proposition': 'Education is important',
#             'text': 'I strongly agree that education is crucial for society.',
#             'expected_best': 'pattern'  # Should excel at explicit markers
#         },
#         {
#             'name': 'Implicit Reasoning',
#             'proposition': 'Exercise prevents disease',
#             'text': 'Regular physical activity strengthens the immune system, improves cardiovascular health, and reduces inflammation.',
#             'expected_best': 'prompting'  # Should excel at implicit reasoning
#         },
#         {
#             'name': 'Ambiguous Context',
#             'proposition': 'Technology makes life better',
#             'text': 'Technology has transformed how we work and communicate, though it also creates new challenges.',
#             'expected_best': 'logprob'  # May handle uncertainty well
#         }
#     ]
    
#     print("\nStrategy Analysis")
#     print("=" * 40)
    
#     for case in analysis_cases:
#         print(f"\nCase: {case['name']}")
#         print(f"Expected best strategy: {case['expected_best']}")
        
#         scores = {}
#         scores['pattern'], _ = pipeline_pattern.measure_belief(case['proposition'], case['text'])
#         scores['prompting'], _ = pipeline_prompting.measure_belief(case['proposition'], case['text'])  
#         scores['logprob'], _ = pipeline_logprob.measure_belief(case['proposition'], case['text'])
        
#         # Find best performing strategy
#         best_strategy = max(scores.items(), key=lambda x: abs(x[1] - 0.5))  # Most confident
#         print(f"Results: {scores}")
#         print(f"Most confident: {best_strategy[0]} ({best_strategy[1]:.3f})")
#         print(f"Matches expectation: {best_strategy[0] == case['expected_best']}")


# def demo_examples():
#     """Demonstrate the pipeline with example use cases."""
#     print("Belief Measurement Pipeline - Demo Examples")
#     print("=" * 50)
    
#     examples = [
#         {
#             'proposition': 'Artificial intelligence will benefit humanity',
#             'text': 'AI has tremendous potential to solve complex problems in healthcare, climate change, and education. While there are risks to manage, the benefits far outweigh the concerns when developed responsibly.'
#         },
#         {
#             'proposition': 'Working from home is more productive',
#             'text': 'I completely disagree with remote work being more productive. In my experience, office collaboration and direct supervision lead to much better results and team cohesion.'
#         },
#         {
#             'proposition': 'Electric cars are the future of transportation',
#             'text': 'Electric vehicles are becoming increasingly popular and the technology is improving rapidly. However, there are still challenges with charging infrastructure and battery costs that need to be addressed.'
#         }
#     ]
    
#     pipeline = BeliefMeasurementPipeline('ensemble')
    
#     for i, example in enumerate(examples, 1):
#         print(f"\nExample {i}:")
#         print(f"Proposition: {example['proposition']}")
#         print(f"Text: {example['text']}")
        
#         score, breakdown = pipeline.measure_belief(example['proposition'], example['text'])
        
#         print(f"\nOverall Belief Score: {score:.3f}")
        
#         # Interpret the score
#         if score >= 0.8:
#             interpretation = "Strong Agreement"
#         elif score >= 0.6:
#             interpretation = "Agreement"
#         elif score >= 0.4:
#             interpretation = "Neutral/Mixed"
#         elif score >= 0.2:
#             interpretation = "Disagreement"
#         else:
#             interpretation = "Strong Disagreement"
        
#         print(f"Interpretation: {interpretation}")
        
#         # Show individual strategy scores
#         print("Individual Strategy Scores:")
#         for strategy_name, result in breakdown.get('strategy_results', {}).items():
#             if 'score' in result:
#                 print(f"  {strategy_name.capitalize()}: {result['score']:.3f}")
        
#         print(f"Strategy Agreement: {breakdown.get('agreement_score', 0):.3f}")
#         print("-" * 50)


# def performance_test():
#     """Test pipeline performance with various edge cases."""
#     print("Performance Test - Edge Cases")
#     print("=" * 40)
    
#     edge_cases = [
#         {
#             'name': 'Empty strings',
#             'proposition': '',
#             'text': ''
#         },
#         {
#             'name': 'Very short text',
#             'proposition': 'Short test',
#             'text': 'Yes.'
#         },
#         {
#             'name': 'Long complex text',
#             'proposition': 'Democracy is important',
#             'text': '''Democracy, while not perfect, represents the best system of governance we have developed. 
#                       It allows for peaceful transitions of power, protects minority rights through constitutional frameworks,
#                       and provides mechanisms for citizens to hold leaders accountable. However, it requires active 
#                       participation from informed citizens and strong institutions to function effectively. Recent challenges
#                       to democratic norms in various countries highlight the fragility of these systems and the need for
#                       constant vigilance to protect democratic values and processes.'''
#         },
#         {
#             'name': 'Contradictory statements',
#             'proposition': 'Exercise is good',
#             'text': 'Exercise is definitely beneficial for health. However, I completely disagree that exercise is good because it can cause injuries.'
#         },
#         {
#             'name': 'Multiple negations',
#             'proposition': 'Smoking is harmful',
#             'text': 'I do not think that smoking is not dangerous. It is not true that smoking does not cause health problems.'
#         }
#     ]
    
#     pipeline = BeliefMeasurementPipeline('ensemble')
    
#     for case in edge_cases:
#         print(f"\nTest: {case['name']}")
#         print(f"Proposition: '{case['proposition']}'")
#         print(f"Text: '{case['text'][:100]}{'...' if len(case['text']) > 100 else ''}'")
        
#         try:
#             score, breakdown = pipeline.measure_belief(case['proposition'], case['text'])
#             print(f"Score: {score:.3f}")
#             print(f"Status: {'✓ Success' if 'error' not in breakdown else '✗ Error: ' + breakdown['error']}")
#         except Exception as e:
#             print(f"Status: ✗ Exception: {str(e)}")


# def save_results_to_file(results: Dict, filename: str = 'belief_measurement_results.json'):
#     """Save test results to a JSON file for analysis."""
#     try:
#         with open(filename, 'w') as f:
#             json.dump(results, f, indent=2)
#         print(f"Results saved to {filename}")
#     except Exception as e:
#         print(f"Error saving results: {e}")


# def batch_analysis(test_data: List[Dict]) -> Dict:
#     """Run batch analysis on multiple test cases."""
#     print("Running Batch Analysis...")
    
#     results = {
#         'summary': {},
#         'detailed_results': [],
#         'strategy_performance': {}
#     }
    
#     strategies = ['pattern', 'prompting', 'logprob', 'ensemble']
#     strategy_scores = {strategy: [] for strategy in strategies}
    
#     for i, test_case in enumerate(test_data):
#         case_results = {
#             'case_id': i + 1,
#             'proposition': test_case['proposition'],
#             'text': test_case['text'],
#             'strategy_scores': {}
#         }
        
#         for strategy in strategies:
#             pipeline = BeliefMeasurementPipeline(strategy=strategy)
#             score, breakdown = pipeline.measure_belief(test_case['proposition'], test_case['text'])
            
#             case_results['strategy_scores'][strategy] = {
#                 'score': score,
#                 'method': breakdown.get('method', strategy)
#             }
#             strategy_scores[strategy].append(score)
        
#         results['detailed_results'].append(case_results)
    
#     # Calculate summary statistics
#     for strategy in strategies:
#         scores = strategy_scores[strategy]
#         if scores:
#             results['strategy_performance'][strategy] = {
#                 'mean_score': sum(scores) / len(scores),
#                 'min_score': min(scores),
#                 'max_score': max(scores),
#                 'score_variance': sum((s - sum(scores)/len(scores))**2 for s in scores) / len(scores)
#             }
    
#     results['summary'] = {
#         'total_cases': len(test_data),
#         'strategies_tested': strategies,
#         'most_confident_strategy': min(results['strategy_performance'].items(), 
#                                      key=lambda x: x[1]['score_variance'])[0]
#     }
    
#     return results


# if __name__ == "__main__":
#     import sys
    
#     if len(sys.argv) > 1:
#         command = sys.argv[1].lower()
        
#         if command == 'interactive':
#             interactive_mode()
#         elif command == 'comprehensive':
#             comprehensive_test()
#         elif command == 'analysis':
#             strategy_analysis()
#         elif command == 'demo':
#             demo_examples()
#         elif command == 'performance':
#             performance_test()
#         elif command == 'all':
#             print("Running all tests...\n")
#             comprehensive_test()
#             strategy_analysis()
#             demo_examples()
#             performance_test()
#         else:
#             print("Available commands:")
#             print("  interactive  - Interactive mode for custom testing")
#             print("  comprehensive - Run comprehensive test suite")
#             print("  analysis     - Strategy analysis and comparison")
#             print("  demo         - Demonstrate with example use cases")
#             print("  performance  - Test edge cases and performance")
#             print("  all          - Run all tests")
#             print("\nUsage: python script.py [command]")
#     else:
#         # Default behavior - run comprehensive test and analysis
#         print("Belief Measurement Pipeline")
#         print("=" * 30)
#         print("Running default test suite...\n")
        
#         comprehensive_test()
#         strategy_analysis()
        
#         print("\nFor more options, run: python script.py [command]")
#         print("Use 'python script.py interactive' for interactive testing")
        

Available commands:
  interactive  - Interactive mode for custom testing
  comprehensive - Run comprehensive test suite
  analysis     - Strategy analysis and comparison
  demo         - Demonstrate with example use cases
  performance  - Test edge cases and performance
  all          - Run all tests

Usage: python script.py [command]


# Scaling the project to all 3 dimension (prompting-based,logprob-based, embedding-based)

In [5]:
import re
import math
import json
from typing import List, Tuple, Dict, Optional
from collections import defaultdict
from abc import ABC, abstractmethod

# Abstract base class for different pipeline approaches
class BeliefMeasurementStrategy(ABC):
    @abstractmethod
    def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
        pass

class PromptingBasedStrategy(BeliefMeasurementStrategy):
    """
    Prompting-based approach that uses structured prompts to evaluate belief alignment.
    Simulates LLM reasoning through rule-based prompt interpretation.
    """
    
    def __init__(self):
        self.prompt_templates = {
            'direct_assessment': """
            Analyze the following text to determine how much it agrees with the given proposition.
            
            Proposition: "{proposition}"
            Text: "{text}"
            
            Consider:
            1. Does the text explicitly agree or disagree with the proposition?
            2. What is the strength of the agreement/disagreement?
            3. Are there qualifying statements that moderate the position?
            4. Is the text neutral or irrelevant to the proposition?
            
            Rate agreement on scale 0-10 where:
            0-1: Strong disagreement
            2-3: Disagreement  
            4-6: Neutral/Mixed
            7-8: Agreement
            9-10: Strong agreement
            """,
            
            'comparative_assessment': """
            Compare how much the text supports vs opposes the proposition:
            
            Proposition: "{proposition}"
            Text: "{text}"
            
            Evidence FOR the proposition in the text:
            Evidence AGAINST the proposition in the text:
            
            Overall assessment (0-10 scale):
            """,
            
            'reasoning_chain': """
            Step-by-step analysis:
            
            Proposition: "{proposition}"
            Text: "{text}"
            
            Step 1: What is the main claim in the text?
            Step 2: How does this claim relate to the proposition?
            Step 3: What supporting evidence is provided?
            Step 4: Are there any contradictions or qualifications?
            Step 5: Overall belief alignment (0-10):
            """
        }
    
    def _simulate_llm_response(self, prompt: str, proposition: str, text: str) -> Dict:
        """
        Simulate LLM response using rule-based reasoning.
        In production, this would call an actual LLM API.
        """
        text_lower = text.lower()
        prop_lower = proposition.lower()
        
        # Extract key phrases from proposition
        prop_keywords = set(re.findall(r'\b\w+\b', prop_lower))
        text_keywords = set(re.findall(r'\b\w+\b', text_lower))
        
        keyword_overlap = len(prop_keywords.intersection(text_keywords)) / max(len(prop_keywords), 1)
        
        # Analyze explicit stances
        strong_positive = any(phrase in text_lower for phrase in [
            'strongly agree', 'completely agree', 'absolutely', 'definitely true',
            'without doubt', 'certainly', 'undoubtedly'
        ])
        
        positive = any(phrase in text_lower for phrase in [
            'agree', 'support', 'believe', 'yes', 'correct', 'true', 'right'
        ])
        
        strong_negative = any(phrase in text_lower for phrase in [
            'strongly disagree', 'completely wrong', 'absolutely not', 'definitely false',
            'never', 'utterly false', 'completely disagree'
        ])
        
        negative = any(phrase in text_lower for phrase in [
            'disagree', 'oppose', 'no', 'false', 'wrong', 'incorrect', 'dispute'
        ])
        
        neutral = any(phrase in text_lower for phrase in [
            'neutral', 'mixed', 'depends', 'complicated', 'unsure', 'both sides'
        ])
        
        # Simulate reasoning
        reasoning = {
            'keyword_overlap': keyword_overlap,
            'explicit_stance': None,
            'confidence': 0.5,
            'evidence_for': [],
            'evidence_against': [],
            'qualifications': []
        }
        
        # Determine stance
        if strong_positive:
            reasoning['explicit_stance'] = 'strong_agree'
            reasoning['confidence'] = 0.9
            reasoning['score'] = 9
        elif positive:
            reasoning['explicit_stance'] = 'agree'
            reasoning['confidence'] = 0.7
            reasoning['score'] = 7
        elif strong_negative:
            reasoning['explicit_stance'] = 'strong_disagree'
            reasoning['confidence'] = 0.9
            reasoning['score'] = 1
        elif negative:
            reasoning['explicit_stance'] = 'disagree'
            reasoning['confidence'] = 0.7
            reasoning['score'] = 3
        elif neutral:
            reasoning['explicit_stance'] = 'neutral'
            reasoning['confidence'] = 0.6
            reasoning['score'] = 5
        else:
            # Use keyword overlap and context
            if keyword_overlap > 0.5:
                reasoning['score'] = 5 + int(keyword_overlap * 3)
            else:
                reasoning['score'] = 5  # Default neutral
        
        # Adjust based on keyword overlap
        if reasoning.get('score'):
            if keyword_overlap < 0.2:
                reasoning['score'] = max(4, min(6, reasoning['score']))  # Force toward neutral
        
        return reasoning
    
    def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
        """Measure belief using prompting-based approach."""
        if not proposition.strip() or not text.strip():
            return 0.5, {'error': 'Empty proposition or text', 'method': 'prompting'}
        
        # Use multiple prompt templates for robust assessment
        results = []
        detailed_analysis = {}
        
        for template_name, template in self.prompt_templates.items():
            prompt = template.format(proposition=proposition, text=text)
            response = self._simulate_llm_response(prompt, proposition, text)
            
            if 'score' in response:
                # Convert 0-10 scale to 0-1
                normalized_score = response['score'] / 10.0
                results.append(normalized_score)
                detailed_analysis[template_name] = response
        
        # Ensemble the results
        if results:
            final_score = sum(results) / len(results)
        else:
            final_score = 0.5
        
        # Ensure score is in [0, 1]
        final_score = max(0.0, min(1.0, final_score))
        
        breakdown = {
            'final_score': final_score,
            'method': 'prompting',
            'individual_scores': results,
            'detailed_analysis': detailed_analysis,
            'num_prompts_used': len(results)
        }
        
        return final_score, breakdown


class LogProbBasedStrategy(BeliefMeasurementStrategy):
    """
    LogProb-based approach that estimates belief by comparing log probabilities
    of agreement vs disagreement continuations.
    """
    
    def __init__(self):
        # Simulated vocabulary with log probabilities
        # In production, this would use actual model logprobs
        self.vocab_logprobs = self._build_simulated_vocab()
        
        # Template continuations to test
        self.agreement_templates = [
            "This statement is {token}",
            "I {token} with this",
            "This is {token}",
            "The text {token} the proposition"
        ]
        
        self.agreement_tokens = ['true', 'correct', 'right', 'accurate', 'agree', 'support']
        self.disagreement_tokens = ['false', 'wrong', 'incorrect', 'disagree', 'oppose', 'dispute']
        self.neutral_tokens = ['unclear', 'mixed', 'neutral', 'uncertain']
    
    def _build_simulated_vocab(self) -> Dict[str, float]:
        """Build simulated vocabulary with log probabilities."""
        # Simulate log probabilities for common words
        # In production, these would come from actual model
        vocab = {
            'true': -1.5, 'false': -1.8, 'correct': -2.1, 'wrong': -1.9,
            'right': -1.7, 'incorrect': -2.3, 'accurate': -2.8,
            'agree': -1.6, 'disagree': -2.0, 'support': -2.2, 'oppose': -2.5,
            'dispute': -3.1, 'unclear': -2.7, 'mixed': -2.9, 'neutral': -2.4,
            'uncertain': -2.8, 'definitely': -2.3, 'probably': -1.8,
            'maybe': -2.1, 'never': -2.2, 'always': -2.4
        }
        return vocab
    
    def _calculate_context_logprob(self, context: str, token: str) -> float:
        """
        Calculate log probability of token given context.
        Simulates model behavior based on context analysis.
        """
        base_logprob = self.vocab_logprobs.get(token, -4.0)  # Default for unknown tokens
        
        context_lower = context.lower()
        
        # Boost probability if context supports the token
        if token in ['true', 'correct', 'right', 'accurate', 'agree', 'support']:
            if any(phrase in context_lower for phrase in ['i think', 'believe', 'support', 'agree']):
                base_logprob += 1.0
            elif any(phrase in context_lower for phrase in ['disagree', 'oppose', 'wrong']):
                base_logprob -= 1.5
        
        elif token in ['false', 'wrong', 'incorrect', 'disagree', 'oppose']:
            if any(phrase in context_lower for phrase in ['disagree', 'oppose', 'wrong']):
                base_logprob += 1.0
            elif any(phrase in context_lower for phrase in ['agree', 'support', 'correct']):
                base_logprob -= 1.5
        
        # Context relevance boost
        prop_words = set(re.findall(r'\b\w+\b', context_lower))
        if token in prop_words:
            base_logprob += 0.5
        
        return base_logprob
    
    def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
        """Measure belief using log probability approach."""
        if not proposition.strip() or not text.strip():
            return 0.5, {'error': 'Empty proposition or text', 'method': 'logprob'}
        
        # Create context by combining proposition and text
        context = f"Proposition: {proposition}\nText: {text}\n"
        
        # Calculate log probabilities for different token types
        agreement_logprobs = []
        disagreement_logprobs = []
        neutral_logprobs = []
        
        for template in self.agreement_templates:
            for token in self.agreement_tokens:
                filled_template = template.format(token=token)
                full_context = context + filled_template
                logprob = self._calculate_context_logprob(full_context, token)
                agreement_logprobs.append(logprob)
            
            for token in self.disagreement_tokens:
                filled_template = template.format(token=token)
                full_context = context + filled_template
                logprob = self._calculate_context_logprob(full_context, token)
                disagreement_logprobs.append(logprob)
            
            for token in self.neutral_tokens:
                filled_template = template.format(token=token)
                full_context = context + filled_template
                logprob = self._calculate_context_logprob(full_context, token)
                neutral_logprobs.append(logprob)
        
        # Calculate average log probabilities
        avg_agreement = sum(agreement_logprobs) / len(agreement_logprobs) if agreement_logprobs else -4.0
        avg_disagreement = sum(disagreement_logprobs) / len(disagreement_logprobs) if disagreement_logprobs else -4.0
        avg_neutral = sum(neutral_logprobs) / len(neutral_logprobs) if neutral_logprobs else -4.0
        
        # Convert to probabilities using softmax
        logprobs = [avg_agreement, avg_disagreement, avg_neutral]
        max_logprob = max(logprobs)
        normalized_logprobs = [lp - max_logprob for lp in logprobs]
        
        exp_probs = [math.exp(lp) for lp in normalized_logprobs]
        sum_exp = sum(exp_probs)
        
        prob_agreement = exp_probs[0] / sum_exp
        prob_disagreement = exp_probs[1] / sum_exp
        prob_neutral = exp_probs[2] / sum_exp
        
        # Convert to belief score (0 = disagree, 0.5 = neutral, 1 = agree)
        belief_score = prob_agreement + 0.5 * prob_neutral
        
        breakdown = {
            'final_score': belief_score,
            'method': 'logprob',
            'prob_agreement': prob_agreement,
            'prob_disagreement': prob_disagreement,
            'prob_neutral': prob_neutral,
            'avg_agreement_logprob': avg_agreement,
            'avg_disagreement_logprob': avg_disagreement,
            'avg_neutral_logprob': avg_neutral,
            'num_templates': len(self.agreement_templates)
        }
        
        return belief_score, breakdown

class PatternBasedStrategy(BeliefMeasurementStrategy):
    """
    Original pattern-based approach using regex patterns and rule-based analysis.
    A pipeline to measure belief alignment between a proposition and free-form text.
    Returns a score in [0,1] where 0=strongly disagrees, 0.5=neutral, 1=strongly agrees.
    """
    
    def __init__(self):
        # Agreement indicators with weights
        self.agreement_markers = {
            'strong_agree': {
                'patterns': [r'\b(absolutely|completely|totally|entirely|fully)\s+(agree|support|believe|endorse)',
                           r'\b(strongly|firmly)\s+(agree|support|believe|endorse)',
                           r'\b(without\s+doubt|undoubtedly|certainly|definitely)\b.*\b(true|correct|right)\b',
                           r'\bI\s+(completely|totally|absolutely|strongly)\s+(agree|support|believe)\b'],
                'weight': 0.9
            },
            'agree': {
                'patterns': [r'\b(agree|support|believe|endorse|accept|embrace)\b',
                           r'\b(yes|correct|true|right|accurate|valid)\b',
                           r'\bI\s+(think|believe|feel)\s+.*\b(true|correct|right)\b'],
                'weight': 0.7
            },
            'weak_agree': {
                'patterns': [r'\b(somewhat|partially|largely|mostly|generally)\s+(agree|support|believe)',
                           r'\b(tend\s+to|inclined\s+to)\s+(agree|believe|support)',
                           r'\b(probably|likely)\s+(true|correct|right)\b'],
                'weight': 0.6
            }
        }
        
        # Disagreement indicators
        self.disagreement_markers = {
            'strong_disagree': {
                'patterns': [r'\b(absolutely|completely|totally|entirely|strongly)\s+(disagree|oppose|reject)',
                           r'\b(never|not\s+at\s+all|in\s+no\s+way)\b',
                           r'\b(completely|utterly|totally)\s+(false|wrong|incorrect)',
                           r'\bI\s+(completely|totally|absolutely|strongly)\s+(disagree|oppose|reject)\b'],
                'weight': 0.1
            },
            'disagree': {
                'patterns': [r'\b(disagree|oppose|reject|deny|refute|dispute)\b',
                           r'\b(no|false|wrong|incorrect|invalid|untrue)\b',
                           r'\bI\s+(don\'?t|do\s+not)\s+(agree|believe|support|think)\b'],
                'weight': 0.3
            },
            'weak_disagree': {
                'patterns': [r'\b(somewhat|partially)\s+(disagree|oppose)',
                           r'\b(doubt|question|skeptical|uncertain)\b',
                           r'\b(probably|likely)\s+(false|wrong|incorrect)\b'],
                'weight': 0.4
            }
        }
        
        # Neutral indicators
        self.neutral_markers = {
            'patterns': [r'\b(neutral|undecided|unsure|uncertain|mixed\s+feelings)\b',
                        r'\b(both\s+sides|on\s+one\s+hand.*on\s+the\s+other)\b',
                        r'\b(depends|it\'?s\s+complicated|not\s+sure)\b'],
                'weight': 0.5
        }
        
    def handle_negations(self, text: str) -> Tuple[str, bool]:
        """Handle negations that might flip the meaning."""
        # Simple negation patterns
        negation_patterns = [
            (r'\b(do\s+not|don\'t|does\s+not|doesn\'t|did\s+not|didn\'t)\s+', ''),
            (r'\b(not|never|no)\s+', ''),
            (r'\b(isn\'t|aren\'t|wasn\'t|weren\'t|won\'t|wouldn\'t|can\'t|couldn\'t)\b', ''),
        ]
        
        negation_count = 0
        processed_text = text.lower()
        
        for pattern, replacement in negation_patterns:
            matches = re.findall(pattern, processed_text, re.IGNORECASE)
            negation_count += len(matches)
            processed_text = re.sub(pattern, replacement, processed_text, flags=re.IGNORECASE)
        
        return processed_text, negation_count % 2 == 1  # Odd number of negations = flipped

    def extract_agreement_signals(self, text: str, proposition: str) -> Dict[str, List[float]]:
        """Extract agreement/disagreement signals from text."""
        text_lower = text.lower()
        prop_lower = proposition.lower()
        
        # Handle negations
        processed_text, is_negated = self.handle_negations(text)
        
        # Combine text and proposition for context-aware matching
        combined_text = f"{prop_lower} {processed_text}"
        
        signals = defaultdict(list)
        
        # Check agreement markers
        for level, markers in self.agreement_markers.items():
            for pattern in markers['patterns']:
                matches = re.findall(pattern, combined_text, re.IGNORECASE)
                if matches:
                    weight = markers['weight']
                    if is_negated:
                        weight = 1.0 - weight  # Flip due to negation
                    signals['agreement'].extend([weight] * len(matches))
        
        # Check disagreement markers
        for level, markers in self.disagreement_markers.items():
            for pattern in markers['patterns']:
                matches = re.findall(pattern, combined_text, re.IGNORECASE)
                if matches:
                    weight = markers['weight']
                    if is_negated:
                        weight = 1.0 - weight  # Flip due to negation
                    signals['disagreement'].extend([weight] * len(matches))
        
        # Check neutral markers
        for pattern in self.neutral_markers['patterns']:
            matches = re.findall(pattern, combined_text, re.IGNORECASE)
            if matches:
                signals['neutral'].extend([self.neutral_markers['weight']] * len(matches))
        
        return signals

    def calculate_semantic_similarity(self, text: str, proposition: str) -> float:
        """
        Simple semantic similarity based on word overlap and key terms.
        In a production system, this would use embeddings.
        """
        # Tokenize and clean
        def tokenize(s):
            return set(re.findall(r'\b\w+\b', s.lower()))
        
        text_tokens = tokenize(text)
        prop_tokens = tokenize(proposition)
        
        if not prop_tokens:
            return 0.5
        
        # Calculate overlap
        intersection = text_tokens.intersection(prop_tokens)
        union = text_tokens.union(prop_tokens)
        
        if not union:
            return 0.5
        
        jaccard_similarity = len(intersection) / len(union)
        
        # Boost for exact proposition mentions
        prop_words = proposition.lower().split()
        text_lower = text.lower()
        
        exact_mentions = sum(1 for word in prop_words if word in text_lower)
        mention_boost = min(exact_mentions / len(prop_words), 0.3)
        
        return min(jaccard_similarity + mention_boost, 1.0)
    
    def analyze_context_sentiment(self, text: str, proposition: str) -> float:
        """
        Analyze sentiment in context of the proposition.
        Simple implementation - in production would use proper sentiment analysis.
        """
        text_lower = text.lower()
        
        # Positive sentiment words
        positive_words = ['good', 'great', 'excellent', 'wonderful', 'amazing', 'fantastic',
                         'beneficial', 'helpful', 'valuable', 'important', 'necessary',
                         'essential', 'vital', 'crucial', 'significant']
        
        # Negative sentiment words
        negative_words = ['bad', 'terrible', 'awful', 'horrible', 'wrong', 'harmful',
                         'dangerous', 'problematic', 'concerning', 'worrying', 'alarming',
                         'unnecessary', 'useless', 'pointless', 'wasteful']
        
        positive_count = sum(1 for word in positive_words if word in text_lower)
        negative_count = sum(1 for word in negative_words if word in text_lower)
        
        if positive_count == 0 and negative_count == 0:
            return 0.5
        
        total = positive_count + negative_count
        return positive_count / total
    
    def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
        """
        Main method to measure belief alignment.
        Returns belief score and detailed breakdown.
        """
        if not proposition.strip() or not text.strip():
            return 0.5, {'error': 'Empty proposition or text', 'method': 'pattern'}
        
        # Extract signals
        signals = self.extract_agreement_signals(text, proposition)
        
        # Calculate components
        semantic_sim = self.calculate_semantic_similarity(text, proposition)
        context_sentiment = self.analyze_context_sentiment(text, proposition)
        
        # Aggregate agreement/disagreement signals
        agreement_scores = signals.get('agreement', [])
        disagreement_scores = signals.get('disagreement', [])
        neutral_scores = signals.get('neutral', [])
        
        # Calculate weighted averages
        if agreement_scores:
            avg_agreement = sum(agreement_scores) / len(agreement_scores)
        else:
            avg_agreement = None
            
        if disagreement_scores:
            avg_disagreement = sum(disagreement_scores) / len(disagreement_scores)
        else:
            avg_disagreement = None
        
        # Combine signals with weights
        components = []
        weights = []
        
        # Primary signals (explicit agreement/disagreement)
        if avg_agreement is not None:
            components.append(avg_agreement)
            weights.append(0.5)  # High weight for explicit agreement
        
        if avg_disagreement is not None:
            components.append(avg_disagreement)
            weights.append(0.5)  # High weight for explicit disagreement
        
        # Secondary signals
        if semantic_sim > 0:
            components.append(0.5 + (semantic_sim - 0.5) * 0.3)  # Moderate influence
            weights.append(0.2)
        
        if context_sentiment != 0.5:
            components.append(context_sentiment)
            weights.append(0.2)
        
        # Handle neutral signals
        if neutral_scores:
            components.append(0.5)
            weights.append(0.3)
        
        # Default to neutral if no signals
        if not components:
            final_score = 0.5
        else:
            # Weighted average
            final_score = sum(c * w for c, w in zip(components, weights)) / sum(weights)
        
        # Ensure score is in [0, 1]
        final_score = max(0.0, min(1.0, final_score))
        
        # Prepare detailed breakdown
        breakdown = {
            'final_score': final_score,
            'method': 'pattern',
            'semantic_similarity': semantic_sim,
            'context_sentiment': context_sentiment,
            'agreement_signals': len(agreement_scores),
            'disagreement_signals': len(disagreement_scores),
            'neutral_signals': len(neutral_scores),
            'avg_agreement': avg_agreement,
            'avg_disagreement': avg_disagreement,
            'components_used': len(components)
        }
        
        return final_score, breakdown


class BeliefMeasurementPipeline:
    """
    Unified pipeline supporting multiple belief measurement strategies:
    - Pattern-based (regex + rules)
    - Prompting-based (structured prompts)  
    - LogProb-based (probability comparison)
    - Ensemble (combination of multiple strategies)
    """
    
    def __init__(self, strategy: str = 'ensemble'):
        """
        Initialize pipeline with specified strategy.
        
        Args:
            strategy: 'pattern', 'prompting', 'logprob', or 'ensemble'
        """
        self.strategies = {
            'pattern': PatternBasedStrategy(),
            'prompting': PromptingBasedStrategy(), 
            'logprob': LogProbBasedStrategy()
        }
        
        self.strategy_name = strategy
        if strategy == 'ensemble':
            self.active_strategies = list(self.strategies.values())
        else:
            self.active_strategies = [self.strategies[strategy]]
    
    def measure_belief(self, proposition: str, text: str) -> Tuple[float, Dict]:
        """
        Measure belief using the configured strategy/strategies.
        """
        if not proposition.strip() or not text.strip():
            return 0.5, {'error': 'Empty proposition or text'}
        
        if self.strategy_name == 'ensemble':
            return self._ensemble_measure(proposition, text)
        else:
            return self.active_strategies[0].measure_belief(proposition, text)
    
    def _ensemble_measure(self, proposition: str, text: str) -> Tuple[float, Dict]:
        """Combine multiple strategies using ensemble approach."""
        results = []
        detailed_results = {}
        
        # Get results from each strategy
        for name, strategy in self.strategies.items():
            try:
                score, breakdown = strategy.measure_belief(proposition, text)
                results.append(score)
                detailed_results[name] = {
                    'score': score,
                    'breakdown': breakdown
                }
            except Exception as e:
                detailed_results[name] = {
                    'error': str(e)
                }
        
        if not results:
            return 0.5, {'error': 'All strategies failed'}
        
        # Weighted ensemble (can be adjusted based on strategy reliability)
        weights = {
            'pattern': 0.4,    # Good for explicit statements
            'prompting': 0.4,  # Good for nuanced reasoning
            'logprob': 0.2     # Good for implicit signals
        }
        
        weighted_score = 0
        total_weight = 0
        
        for i, (name, strategy) in enumerate(self.strategies.items()):
            if name in detailed_results and 'score' in detailed_results[name]:
                weight = weights.get(name, 1.0 / len(self.strategies))
                weighted_score += detailed_results[name]['score'] * weight
                total_weight += weight
        
        final_score = weighted_score / total_weight if total_weight > 0 else 0.5
        
        # Calculate agreement between strategies (confidence measure)
        if len(results) > 1:
            mean_score = sum(results) / len(results)
            variance = sum((s - mean_score) ** 2 for s in results) / len(results)
            agreement = 1.0 - min(variance * 4, 1.0)  # Higher agreement = lower variance
        else:
            agreement = 1.0
        
        breakdown = {
            'final_score': final_score,
            'method': 'ensemble',
            'strategy_results': detailed_results,
            'individual_scores': results,
            'agreement_score': agreement,
            'weights_used': weights
        }
        
        return final_score, breakdown


def main():
    """Main function to run the belief measurement pipeline."""
    print("🚀 Belief Measurement Pipeline")
    print("=" * 50)
    
    # Test cases
    test_cases = [
        {
            'name': 'Strong Agreement',
            'proposition': 'Climate change is a serious threat',
            'text': 'I absolutely agree that climate change poses a significant danger to our planet and requires immediate action.',
            'expected_range': (0.8, 1.0)
        },
        {
            'name': 'Strong Disagreement', 
            'proposition': 'Vaccines are dangerous',
            'text': 'I completely disagree. Vaccines are safe, effective, and this claim is utterly false according to scientific evidence.',
            'expected_range': (0.0, 0.2)
        },
        {
            'name': 'Qualified Agreement',
            'proposition': 'Remote work increases productivity',
            'text': 'I think remote work can increase productivity for some people, but it depends on the individual and the type of work.',
            'expected_range': (0.5, 0.8)
        },
        {
            'name': 'Neutral/Irrelevant',
            'proposition': 'Pizza is the best food',
            'text': 'I went to the store yesterday and bought some groceries. The weather was nice and I saw a dog.',
            'expected_range': (0.4, 0.6)
        },
        {
            'name': 'Subtle Disagreement',
            'proposition': 'Social media improves mental health',
            'text': 'While social media connects people, research suggests it might actually increase anxiety and depression rather than helping.',
            'expected_range': (0.2, 0.5)
        }
    ]
    
    strategies = ['pattern', 'prompting', 'logprob', 'ensemble']
    
    print("📊 Running Comprehensive Tests")
    print("=" * 50)
    
    for case in test_cases:
        print(f"\n✨ Test Case: {case['name']}")
        print(f"📝 Proposition: {case['proposition']}")
        print(f"💬 Text: {case['text']}")
        print(f"🎯 Expected Range: {case['expected_range']}")
        print("-" * 40)
        
        for strategy in strategies:
            pipeline = BeliefMeasurementPipeline(strategy=strategy)
            score, breakdown = pipeline.measure_belief(case['proposition'], case['text'])
            
            in_range = case['expected_range'][0] <= score <= case['expected_range'][1]
            status = "✅ PASS" if in_range else "⚠️ REVIEW"
            
            print(f"{strategy:>12}: {score:.3f} {status}")
            
            # Show detailed breakdown for ensemble
            if strategy == 'ensemble' and 'strategy_results' in breakdown:
                individual_scores = []
                for strat_name, result in breakdown['strategy_results'].items():
                    if 'score' in result:
                        individual_scores.append(f"{strat_name}: {result['score']:.3f}")
                print(f"{'':>12}  └─ {', '.join(individual_scores)}")
                print(f"{'':>12}  └─ Agreement: {breakdown.get('agreement_score', 0):.3f}")
        
        print()

    # Interactive Demo
    print("\n🎮 Interactive Demo")
    print("=" * 30)
    
    demo_cases = [
        {
            'proposition': 'Artificial intelligence will benefit humanity',
            'text': 'AI has tremendous potential to solve complex problems in healthcare, climate change, and education. While there are risks to manage, the benefits far outweigh the concerns when developed responsibly.'
        },
        {
            'proposition': 'Working from home is more productive',
            'text': 'I completely disagree with remote work being more productive. In my experience, office collaboration and direct supervision lead to much better results.'
        }
    ]
    
    pipeline = BeliefMeasurementPipeline('ensemble')
    
    for i, demo in enumerate(demo_cases, 1):
        print(f"\n📋 Demo {i}:")
        print(f"Proposition: {demo['proposition']}")
        print(f"Text: {demo['text']}")
        
        score, breakdown = pipeline.measure_belief(demo['proposition'], demo['text'])
        
        # Interpret the score
        if score >= 0.8:
            interpretation = "🔥 Strong Agreement"
        elif score >= 0.6:
            interpretation = "✅ Agreement"
        elif score >= 0.4:
            interpretation = "🤔 Neutral/Mixed"
        elif score >= 0.2:
            interpretation = "❌ Disagreement"
        else:
            interpretation = "🚫 Strong Disagreement"
        
        print(f"\n📊 Final Score: {score:.3f}")
        print(f"🎯 Interpretation: {interpretation}")
        
        print("🔍 Strategy Breakdown:")
        for strategy_name, result in breakdown.get('strategy_results', {}).items():
            if 'score' in result:
                print(f"  • {strategy_name.capitalize()}: {result['score']:.3f}")
        
        print(f"  • Strategy Agreement: {breakdown.get('agreement_score', 0):.3f}")
        print("-" * 30)

    print("\n✨ Pipeline execution completed!")
    print("💡 Use this code to analyze belief alignment in your own text data.")


if __name__ == "__main__":
    main()

🚀 Belief Measurement Pipeline
📊 Running Comprehensive Tests

✨ Test Case: Strong Agreement
📝 Proposition: Climate change is a serious threat
💬 Text: I absolutely agree that climate change poses a significant danger to our planet and requires immediate action.
🎯 Expected Range: (0.8, 1.0)
----------------------------------------
     pattern: 0.793 ⚠️ REVIEW
   prompting: 0.900 ✅ PASS
     logprob: 0.748 ⚠️ REVIEW
    ensemble: 0.827 ✅ PASS
              └─ pattern: 0.793, prompting: 0.900, logprob: 0.748
              └─ Agreement: 0.984


✨ Test Case: Strong Disagreement
📝 Proposition: Vaccines are dangerous
💬 Text: I completely disagree. Vaccines are safe, effective, and this claim is utterly false according to scientific evidence.
🎯 Expected Range: (0.0, 0.2)
----------------------------------------
     pattern: 0.264 ⚠️ REVIEW
   prompting: 0.700 ⚠️ REVIEW
     logprob: 0.600 ⚠️ REVIEW
    ensemble: 0.505 ⚠️ REVIEW
              └─ pattern: 0.264, prompting: 0.700, logprob: 0.600
