# Lab 3.4.2: Self-Consistency Implementation - SOLUTIONS

This notebook contains complete solutions to all exercises from Lab 3.4.2.

In [None]:
import ollama
import json
import re
import time
from collections import Counter
from typing import List, Dict, Optional, Tuple, Any
from dataclasses import dataclass

MODEL = "llama3.1:8b"  # Adjust as needed

## Solution: Advanced Answer Extraction

Complete solution for extracting answers from diverse formats:

In [None]:
class AdvancedAnswerExtractor:
    """
    Solution: Robust answer extraction for multiple formats.

    Handles:
    - Numerical answers (optional sign, currency symbol, thousands separators)
    - Multiple choice (A, B, C, D)
    - Yes/No/True/False
    - Fractions ("3/4", "3 out of 4"), returned as a decimal

    Free-form answers (named entities, lists of items) are not extracted.
    """

    # A number must start with a digit so that stray punctuation such as a
    # lone comma can never be captured as a "number".
    _NUMBER = r"-?\d[\d,]*(?:\.\d+)?"

    def __init__(self):
        number = self._NUMBER
        self.patterns = {
            'numerical': [
                r"[Tt]he (?:final )?answer is[:\s]+\$?(" + number + r")",
                r"[Aa]nswer[:\s]+\$?(" + number + r")",
                r"=\s*\$?(" + number + r")\s*(?:$|\.|\n)",
                r"(?:result|total|sum)[:\s]+\$?(" + number + r")",
            ],
            # The trailing \b keeps the choice letter a standalone token.
            # These patterns are applied with re.IGNORECASE, so without it
            # "the answer is definitely ..." would yield "D".
            'multiple_choice': [
                r"[Tt]he (?:correct )?answer is[:\s]+\(?([A-D])\b",
                r"[Aa]nswer[:\s]+\(?([A-D])\b",
                r"\b([A-D])\)\s+is correct",
                r"Option\s+([A-D])\b",
            ],
            'boolean': [
                r"[Tt]he answer is[:\s]+(yes|no|true|false)",
                r"\b(yes|no|true|false)[.,]?\s*$",
            ],
            'fraction': [
                r"([\d]+)/([\d]+)",
                r"([\d]+)\s+out of\s+([\d]+)",
            ]
        }

    @staticmethod
    def _to_float(token: str) -> Optional[float]:
        """Parse a captured number (commas allowed); None if it is not numeric."""
        try:
            return float(token.replace(',', ''))
        except ValueError:
            return None

    def extract(self, response: str, answer_type: str = 'auto') -> Any:
        """
        Extract the answer from a model response.

        Args:
            response: Raw model output.
            answer_type: 'numerical', 'multiple_choice', 'boolean',
                'fraction', or 'auto' to detect the type from the text.
                Any other value is treated as 'numerical'.

        Returns:
            float for numerical/fraction answers, an upper-case letter for
            multiple choice, bool for boolean answers, or None when nothing
            could be extracted.
        """
        if answer_type == 'auto':
            answer_type = self._detect_type(response)

        if answer_type == 'multiple_choice':
            return self._extract_choice(response)
        if answer_type == 'boolean':
            return self._extract_boolean(response.lower())
        if answer_type == 'fraction':
            return self._extract_fraction(response)
        # 'numerical' and any unknown type
        return self._extract_numerical(response)

    def _detect_type(self, response: str) -> str:
        """
        Auto-detect the answer type.

        An explicit "answer is X" statement is the strongest signal, so the
        last such statement is classified first. Only when none is present
        (or X is unrecognised) do the whole-text heuristics apply; otherwise
        an incidental "no" or "3/4" in the reasoning would override a clearly
        stated final answer.
        """
        stated = re.findall(r"answer is[:\s]+\(?\$?(\S+)", response, re.IGNORECASE)
        if stated:
            token = stated[-1].rstrip('.,;:!?)%')
            if re.fullmatch(r"\d+/\d+", token):
                return 'fraction'
            if re.fullmatch(self._NUMBER, token):
                return 'numerical'
            if token.lower() in ('yes', 'no', 'true', 'false'):
                return 'boolean'
            # Upper-case only: a lower-case "a" is far more likely an article.
            if re.fullmatch(r"[A-D]", token):
                return 'multiple_choice'

        response_lower = response.lower()

        # Check for multiple choice indicators
        if re.search(r'\b[A-D]\)', response) or 'option' in response_lower:
            return 'multiple_choice'

        # Check for boolean
        if re.search(r'\b(yes|no|true|false)\b', response_lower):
            return 'boolean'

        # Check for fractions
        if re.search(r'\d+/\d+', response) or 'out of' in response_lower:
            return 'fraction'

        # Default to numerical
        return 'numerical'

    def _extract_numerical(self, response: str) -> Optional[float]:
        """
        Extract a numerical answer.

        Patterns are tried in priority order and, within a pattern, the last
        match wins (the final statement in a chain of thought). If no pattern
        matches, the last number anywhere in the text is returned.
        """
        for pattern in self.patterns['numerical']:
            for token in reversed(re.findall(pattern, response)):
                value = self._to_float(token)
                if value is not None:
                    return value

        # Fallback: last parseable number in the text
        for token in reversed(re.findall(self._NUMBER, response)):
            value = self._to_float(token)
            if value is not None:
                return value
        return None

    def _extract_choice(self, response: str) -> Optional[str]:
        """Extract a multiple choice answer as an upper-case letter A-D."""
        for pattern in self.patterns['multiple_choice']:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                return match.group(1).upper()
        return None

    def _extract_boolean(self, response: str) -> Optional[bool]:
        """Extract a boolean answer: True for yes/true, False for no/false."""
        for pattern in self.patterns['boolean']:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                return match.group(1).lower() in ('yes', 'true')
        return None

    def _extract_fraction(self, response: str) -> Optional[float]:
        """
        Extract the first fraction in the text and convert it to a decimal.

        Returns None when no fraction is found or the denominator is zero.
        """
        for pattern in self.patterns['fraction']:
            match = re.search(pattern, response)
            if match:
                # Both groups are \d+, so the conversions cannot fail.
                num = float(match.group(1))
                den = float(match.group(2))
                return num / den if den != 0 else None
        return None


# Test the extractor
extractor = AdvancedAnswerExtractor()

# Each case pairs a response with the answer type auto-detection should report.
test_cases = [
    ("After calculation, the answer is $150.50", 'numerical'),
    ("Based on my analysis, option B is correct.", 'multiple_choice'),
    ("Is the sky blue? Yes.", 'boolean'),
    ("The probability is 3/4 or 75%.", 'fraction'),
]

print("Testing Advanced Answer Extractor:")
print("=" * 50)

for response, expected_type in test_cases:
    result = extractor.extract(response, 'auto')
    detected = extractor._detect_type(response)
    # Compare against the expected type so a detection regression is visible.
    status = "OK" if detected == expected_type else "MISMATCH"
    # Only add an ellipsis when the response was actually truncated.
    preview = response if len(response) <= 40 else response[:40] + "..."
    print(f"Response: {preview}")
    print(f"  Detected type: {detected} (expected: {expected_type}) [{status}]")
    print(f"  Extracted: {result}")
    print()

## Solution: Weighted Voting Self-Consistency

In [None]:
@dataclass
class SampledResponse:
    """Container for one sampled chain-of-thought response and its metadata."""
    # Raw text returned by the model.
    response: str
    # Answer extracted from the text; None when extraction found nothing.
    answer: Any
    # Heuristic confidence estimated from wording in the response text.
    confidence: float
    # Number of explicit reasoning steps counted in the response.
    reasoning_steps: int
    # Wall-clock seconds spent in the model call.
    latency: float


class WeightedSelfConsistency:
    """
    Solution: Self-consistency with weighted voting based on response quality.

    Supported weighting schemes (see `weighted_vote`):
    - 'uniform':    every sample counts equally (plain majority vote)
    - 'confidence': weight by confidence markers found in the text
    - 'steps':      weight by the number of explicit reasoning steps
    - 'combined':   confidence scaled up by the number of reasoning steps
    """

    # Schemes that run() always reports in 'all_results'.
    WEIGHT_SCHEMES = ('uniform', 'confidence', 'steps', 'combined')

    def __init__(self, model: str = MODEL, n_samples: int = 5, answer_type: str = 'auto'):
        """
        Args:
            model: Ollama model name.
            n_samples: Number of responses sampled per question.
            answer_type: Passed to the extractor for every sample. 'auto'
                detects the type per response; pin it (e.g. 'numerical')
                when the expected answer format is known.
        """
        self.model = model
        self.n_samples = n_samples
        self.answer_type = answer_type
        self.extractor = AdvancedAnswerExtractor()

    def sample_with_cot(self, question: str, temperature: float = 0.7) -> SampledResponse:
        """Generate one chain-of-thought response together with its metadata."""
        prompt = f"{question}\n\nLet's think step by step:"

        start = time.time()
        response = ollama.chat(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            options={"temperature": temperature, "num_predict": 1024}
        )
        latency = time.time() - start

        text = response['message']['content']
        return SampledResponse(
            response=text,
            answer=self.extractor.extract(text, self.answer_type),
            confidence=self._estimate_confidence(text),
            reasoning_steps=self._count_reasoning_steps(text),
            latency=latency
        )

    def _estimate_confidence(self, response: str) -> float:
        """
        Estimate confidence from wording in the response.

        Starts at 0.5, adds 0.1 per distinct high-confidence marker present,
        subtracts 0.1 per distinct low-confidence marker, adds 0.15 when a
        verification phrase appears, and clamps the result to [0.1, 1.0].
        """
        confidence = 0.5  # Base confidence

        high_conf_markers = ['definitely', 'certainly', 'clearly', 'obviously', 'therefore']
        low_conf_markers = ['maybe', 'perhaps', 'might', 'not sure', 'uncertain']

        response_lower = response.lower()

        for marker in high_conf_markers:
            if marker in response_lower:
                confidence += 0.1

        for marker in low_conf_markers:
            if marker in response_lower:
                confidence -= 0.1

        # Boost for verification steps
        if 'let me verify' in response_lower or 'double check' in response_lower:
            confidence += 0.15

        return max(0.1, min(1.0, confidence))

    def _count_reasoning_steps(self, response: str) -> int:
        """
        Count lines that look like explicit reasoning steps.

        A line counts at most once, and the result is never below 1 so that
        step-based weights stay positive.
        """
        patterns = [
            r'^\d+[.)\s]',  # Numbered steps
            r'^[-*]\s',     # Bullet points
            r'^Step\s+\d+',  # "Step N"
            r'^First,|^Second,|^Third,|^Finally,',  # Ordinal markers
        ]

        count = 0
        for line in response.split('\n'):
            stripped = line.strip()
            if any(re.match(pattern, stripped, re.IGNORECASE) for pattern in patterns):
                count += 1

        return max(1, count)

    def weighted_vote(self, samples: List[SampledResponse], weight_type: str = 'confidence') -> Tuple[Any, float]:
        """
        Perform weighted voting on samples.

        Args:
            samples: Sampled responses; those whose answer is None are ignored.
            weight_type: 'confidence', 'steps' or 'combined'. Any other value
                (including 'uniform') gives every sample a weight of 1.

        Returns:
            (best_answer, confidence), where confidence is the winning
            answer's share of the total weight, or (None, 0.0) when no
            sample produced an answer.
        """
        valid_samples = [s for s in samples if s.answer is not None]

        if not valid_samples:
            return None, 0.0

        weights = {}
        for s in valid_samples:
            if weight_type == 'confidence':
                w = s.confidence
            elif weight_type == 'steps':
                w = s.reasoning_steps / 10.0  # Normalize
            elif weight_type == 'combined':
                w = s.confidence * (1 + s.reasoning_steps / 10.0)
            else:
                w = 1.0  # Uniform

            # Round numerical answers so near-identical floats share a bucket
            key = round(s.answer, 2) if isinstance(s.answer, float) else s.answer
            weights[key] = weights.get(key, 0) + w

        best_answer = max(weights, key=weights.get)
        total_weight = sum(weights.values())
        confidence = weights[best_answer] / total_weight if total_weight > 0 else 0

        return best_answer, confidence

    def run(self, question: str, temperature: float = 0.7, weight_type: str = 'combined') -> Dict:
        """
        Run weighted self-consistency.

        Returns:
            Dict with 'answer' and 'confidence' for the requested scheme,
            the raw 'samples', and 'all_results' holding the vote under every
            scheme in WEIGHT_SCHEMES (plus the requested one if it is not
            among them).
        """
        print(f"Generating {self.n_samples} samples...")

        samples = []
        for i in range(self.n_samples):
            sample = self.sample_with_cot(question, temperature)
            samples.append(sample)
            print(f"  Sample {i+1}: answer={sample.answer}, conf={sample.confidence:.2f}, steps={sample.reasoning_steps}")

        # Vote with different weighting schemes
        results = {}
        for wt in self.WEIGHT_SCHEMES:
            answer, conf = self.weighted_vote(samples, wt)
            results[wt] = {'answer': answer, 'confidence': conf}

        # weighted_vote accepts any scheme name (unknown names vote uniformly),
        # so an unlisted weight_type must not fail with a KeyError here.
        if weight_type not in results:
            answer, conf = self.weighted_vote(samples, weight_type)
            results[weight_type] = {'answer': answer, 'confidence': conf}

        chosen = results[weight_type]

        return {
            'answer': chosen['answer'],
            'confidence': chosen['confidence'],
            'samples': samples,
            'all_results': results
        }


# Test weighted self-consistency
# NOTE: this cell calls the model once per sample (5 calls).
wsc = WeightedSelfConsistency(n_samples=5)

# Worked answer: 150 -> 105 after selling 30%, then 105 -> 78.75 after selling 25% of the rest.
question = "A store has 150 apples. They sell 30% in the morning and 25% of the remaining in the afternoon. How many apples are left?"

print(f"Question: {question}\n")
# run() defaults to the 'combined' weighting scheme for the final answer.
result = wsc.run(question)

# 'confidence' is the winning answer's share of the total vote weight.
print(f"\nFinal Answer: {result['answer']}")
print(f"Confidence: {result['confidence']:.2%}")
# Show how the winner changes (or not) under each weighting scheme.
print(f"\nResults by weighting scheme:")
for scheme, data in result['all_results'].items():
    print(f"  {scheme}: {data['answer']} (conf={data['confidence']:.2%})")

## Solution: Adaptive Self-Consistency

In [None]:
class AdaptiveSelfConsistency:
    """
    Solution: Dynamically adjust number of samples based on agreement.

    Strategy:
    - Start with min_samples
    - If agreement is high (>= threshold), stop early
    - If agreement is low, continue sampling up to max_samples
    - Saves compute on easy problems, invests more on hard ones
    """

    def __init__(
        self,
        model: str = MODEL,
        min_samples: int = 3,
        max_samples: int = 10,
        agreement_threshold: float = 0.8,
        check_interval: int = 2,
        answer_type: str = 'auto'
    ):
        """
        Args:
            model: Ollama model name.
            min_samples: Samples drawn before the first agreement check.
            max_samples: Hard cap on the number of samples.
            agreement_threshold: Agreement rate (0-1) that triggers early stop.
            check_interval: Samples drawn between agreement checks after the
                first one; values below 1 are treated as 1.
            answer_type: Passed to the extractor for every sample ('auto'
                detects the type per response).
        """
        self.model = model
        self.min_samples = min_samples
        self.max_samples = max_samples
        self.agreement_threshold = agreement_threshold
        self.check_interval = check_interval
        self.answer_type = answer_type
        self.extractor = AdvancedAnswerExtractor()

    def _get_agreement(self, answers: List[Any]) -> Tuple[Any, float]:
        """
        Return (most common answer, agreement rate) for the answers so far.

        None answers (failed extractions) are ignored and floats are rounded
        to 2 decimals before counting. The rate is the top answer's share of
        the non-None answers; (None, 0.0) when there are none.
        """
        normalized = [
            round(a, 2) if isinstance(a, float) else a
            for a in answers
            if a is not None
        ]

        if not normalized:
            return None, 0.0

        most_common, count = Counter(normalized).most_common(1)[0]
        return most_common, count / len(normalized)

    def sample_one(self, question: str, temperature: float = 0.7) -> Tuple[str, Any]:
        """Generate one sample; returns (response text, extracted answer)."""
        prompt = f"{question}\n\nLet's think step by step:"

        response = ollama.chat(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            options={"temperature": temperature, "num_predict": 1024}
        )

        text = response['message']['content']
        answer = self.extractor.extract(text, self.answer_type)
        return text, answer

    def run(self, question: str, temperature: float = 0.7) -> Dict:
        """
        Run adaptive self-consistency.

        Agreement is checked at min_samples and every check_interval samples
        after that. Sampling stops early once agreement reaches the threshold
        and at least min_samples answers were actually extracted.

        Returns:
            Dict with 'answer', 'agreement', 'num_samples', 'all_answers'
            and 'early_stopped' (True when fewer than max_samples were used).
        """
        answers = []

        print(f"Adaptive Self-Consistency (min={self.min_samples}, max={self.max_samples}, threshold={self.agreement_threshold})")

        # Guard against a zero interval (modulo by zero) or a negative one.
        interval = max(1, self.check_interval)

        for i in range(self.max_samples):
            _, answer = self.sample_one(question, temperature)
            answers.append(answer)

            print(f"  Sample {i+1}: {answer}")

            # Anchor the schedule at min_samples. A plain
            # (i + 1) % interval test would skip the very first opportunity
            # whenever min_samples is not a multiple of the interval.
            drawn = i + 1
            if drawn >= self.min_samples and (drawn - self.min_samples) % interval == 0:
                _, agreement = self._get_agreement(answers)
                print(f"    -> Agreement check: {agreement:.1%}")

                # Agreement only counts extracted answers, so one lone answer
                # among failed extractions would score 100%. Require enough
                # real votes before trusting it.
                extracted = sum(a is not None for a in answers)
                if agreement >= self.agreement_threshold and extracted >= self.min_samples:
                    print(f"  ✓ Early stopping at {i+1} samples (agreement={agreement:.1%})")
                    break

        final_answer, final_agreement = self._get_agreement(answers)

        return {
            'answer': final_answer,
            'agreement': final_agreement,
            'num_samples': len(answers),
            'all_answers': answers,
            'early_stopped': len(answers) < self.max_samples
        }


# Test adaptive self-consistency
# Sampling stops early once the agreement rate reaches 0.75; at most 8 samples are drawn.
asc = AdaptiveSelfConsistency(
    min_samples=3,
    max_samples=8,
    agreement_threshold=0.75
)

# Easy question (should stop early)
# Worked answer: 0.25 * 200 = 50.
easy_q = "What is 25% of 200?"
print(f"\nEasy question: {easy_q}")
result = asc.run(easy_q)
print(f"Result: {result['answer']} (samples used: {result['num_samples']}, early_stop: {result['early_stopped']})")

# Harder question (may need more samples)
# Worked answer: (60 * 2.5 + 45 * 1.5) / (2.5 + 1.5) = 217.5 / 4 = 54.375 mph.
hard_q = "A train travels at 60mph for 2.5 hours, then 45mph for 1.5 hours. What's the average speed for the whole journey?"
print(f"\nHarder question: {hard_q}")
# NOTE: `result` is reused, so the easy-question result is overwritten here.
result = asc.run(hard_q)
print(f"Result: {result['answer']} (samples used: {result['num_samples']}, early_stop: {result['early_stopped']})")

## Solution: Self-Consistency Evaluation Framework

In [None]:
def comprehensive_sc_evaluation(
    problems: List[Dict],
    n_problems: int = 10,
    samples_to_test: Optional[List[int]] = None
) -> Dict:
    """
    Solution: Comprehensive evaluation of self-consistency.

    Tests:
    - Different numbers of samples
    - Accuracy vs compute tradeoff
    - Agreement as confidence proxy

    Args:
        problems: Dicts with a 'question' and either 'numerical_answer' or
            'answer'. String answers may use the GSM8K "... #### 72" form.
        n_problems: Only the first n_problems entries are evaluated.
        samples_to_test: Sample counts to compare; defaults to [1, 3, 5, 7].

    Returns:
        Dict mapping each sample count to running totals: 'correct', 'total',
        'avg_agreement' (sum of agreement rates) and 'latency' (sum of
        seconds). Divide by 'total' to get averages.
    """
    if samples_to_test is None:
        samples_to_test = [1, 3, 5, 7]

    extractor = AdvancedAnswerExtractor()

    results = {n: {'correct': 0, 'total': 0, 'avg_agreement': 0, 'latency': 0} for n in samples_to_test}

    # Loop-invariant; default=0 keeps an empty samples_to_test from raising.
    max_n = max(samples_to_test, default=0)

    for i, prob in enumerate(problems[:n_problems]):
        question = prob['question']
        expected = prob.get('numerical_answer', prob.get('answer'))

        # GSM8K-style answers hold the full solution text and end with
        # "#### <number>"; keep only the final number (commas removed).
        if isinstance(expected, str):
            expected = expected.split('####')[-1].strip().replace(',', '')
        try:
            expected_value = float(expected)
        except (TypeError, ValueError):
            expected_value = None  # No usable reference: always incorrect

        print(f"\nProblem {i+1}: {question[:50]}...")

        # Generate max samples once; smaller n reuse a prefix of them
        all_samples = []

        start = time.time()
        for _ in range(max_n):
            prompt = f"{question}\n\nLet's think step by step:"
            response = ollama.chat(
                model=MODEL,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": 0.7, "num_predict": 512}
            )
            all_samples.append(extractor.extract(response['message']['content']))
        total_time = time.time() - start

        # Evaluate for each sample count
        for n in samples_to_test:
            valid = [a for a in all_samples[:n] if a is not None]

            if valid:
                counts = Counter(round(a, 2) if isinstance(a, float) else a for a in valid)
                predicted, count = counts.most_common(1)[0]
                agreement = count / len(valid)
            else:
                predicted = None
                agreement = 0

            correct = False
            if predicted is not None and expected_value is not None:
                try:
                    error = abs(float(predicted) - expected_value)
                except (TypeError, ValueError):
                    pass  # Non-numeric prediction (e.g. a choice letter)
                else:
                    # 1% relative tolerance. The small absolute floor lets
                    # an expected value of exactly 0 be matched.
                    correct = error <= max(0.01 * abs(expected_value), 1e-9)

            results[n]['total'] += 1
            results[n]['correct'] += int(correct)
            results[n]['avg_agreement'] += agreement
            results[n]['latency'] += (total_time * n / max_n)  # Proportional

            print(f"  n={n}: pred={predicted}, correct={correct}, agreement={agreement:.1%}")

    # Calculate summaries
    print("\n" + "=" * 60)
    print("SELF-CONSISTENCY EVALUATION RESULTS")
    print("=" * 60)
    print(f"{'Samples':<10} {'Accuracy':<12} {'Avg Agreement':<15} {'Avg Latency':<12}")
    print("-" * 60)

    for n in samples_to_test:
        data = results[n]
        total = data['total']
        accuracy = data['correct'] / total if total > 0 else 0
        avg_agree = data['avg_agreement'] / total if total > 0 else 0
        avg_lat = data['latency'] / total if total > 0 else 0

        print(f"n={n:<7} {accuracy:<12.1%} {avg_agree:<15.1%} {avg_lat:<12.2f}s")

    return results


# Load test problems
try:
    # JSON is UTF-8 by specification; don't depend on the platform default encoding.
    with open("../data/gsm8k_sample.json", encoding="utf-8") as f:
        problems = json.load(f)

    # Run evaluation (uncomment to execute)
    # comprehensive_sc_evaluation(problems, n_problems=5, samples_to_test=[1, 3, 5])
    print("Problems loaded. Uncomment the evaluation call to run.")
except FileNotFoundError:
    # The sample file is optional; `problems` stays undefined in that case.
    print("Sample problems not found. Skipping evaluation.")

## Key Takeaways

1. **Temperature matters**: Use temperature > 0 (e.g., 0.7) for diverse reasoning paths
2. **More samples = higher accuracy**, but with diminishing returns (5–10 samples are usually sufficient)
3. **Weighted voting** can improve over simple majority voting
4. **Adaptive sampling** saves compute on easy problems
5. **Agreement rate** is a good proxy for confidence