In [None]:
# Environment setup via IPython shell escapes.
# pciutils is installed first — presumably so the Ollama installer can detect
# the GPU; TODO confirm. The second line pipes Ollama's install script into `sh`.
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
from IPython.display import clear_output

# Start the Ollama API server as a background process, launched from a helper thread

import os
import threading
import subprocess
import requests
import json

def ollama():
    """Configure the Ollama server through environment variables and spawn it.

    OLLAMA_HOST and OLLAMA_ORIGINS are written into this process's environment
    (and therefore inherited by the child), then `ollama serve` is started
    without waiting for it to finish.
    """
    server_settings = {
        'OLLAMA_HOST': '0.0.0.0:11434',
        'OLLAMA_ORIGINS': '*',
    }
    for name, value in server_settings.items():
        os.environ[name] = value

    launch_command = ["ollama", "serve"]
    subprocess.Popen(launch_command)

# Run ollama() on a helper thread. subprocess.Popen returns without waiting for
# the child, so the thread ends almost immediately while the server keeps
# running. Nothing here waits for the server to be ready to accept requests.
ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

# Cell 2: Import Libraries and Initialize
import requests
import json
import pandas as pd
import numpy as np
from scipy import stats
import re
import time
from typing import List, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libpci3 pci.ids
The following NEW packages will be installed:
  libpci3 pci.ids pciutils
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 343 kB of archives.
After this operation, 1,581 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 pci.ids all 0.0~2022.01.22-1ubuntu0.1 [251 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpci3 amd64 1:3.7.0-6 [28.9 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 pciutils amd64 1:3.7.0-6 [63.6 kB]
Fetched 343 kB in 2s (229 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)
debconf: falling back to frontend: Readline
debconf: un

In [None]:
# Cell 3: Pull Models
# Ollama model tags downloaded by the loop below. 'deepseek-llm' has no
# explicit tag — presumably the registry's default tag is used; TODO confirm.
models_to_pull = [
    'llama3:8b','mistral:7b','deepseek-llm','gemma:7b','qwen2.5:3b'
]

def pull_model(model_name):
    """Download one model with `ollama pull` and report whether it succeeded.

    Args:
        model_name: Ollama model tag, e.g. 'llama3:8b'.

    Returns:
        True when the command exits with status 0. False on a non-zero exit
        status, when the pull exceeds the 600 second timeout, or when the
        `ollama` executable cannot be started at all.
    """
    print(f"Pulling {model_name}...")
    try:
        result = subprocess.run(['ollama', 'pull', model_name],
                                capture_output=True, text=True, timeout=600)
    except subprocess.TimeoutExpired:
        print(f"Timeout pulling {model_name}")
        return False
    except OSError as exc:
        # Raised when the binary is missing or not executable (for example the
        # install step failed). Report it like any other failed pull instead of
        # aborting the caller's loop with a traceback.
        print(f" Error pulling {model_name}: {exc}")
        return False

    if result.returncode == 0:
        print(f" {model_name} pulled successfully")
        return True

    print(f" Error pulling {model_name}: {result.stderr}")
    return False

# Pull every model in sequence. pull_model() prints its own status and returns
# a success flag; the flag is not checked here, so a failed pull does not stop
# the remaining downloads.
for model in models_to_pull:
    pull_model(model)

Pulling llama3:8b...
 llama3:8b pulled successfully
Pulling mistral:7b...
 mistral:7b pulled successfully
Pulling deepseek-llm...
 deepseek-llm pulled successfully
Pulling gemma:7b...
 gemma:7b pulled successfully
Pulling qwen2.5:3b...
 qwen2.5:3b pulled successfully


In [None]:
# SelfCheckGPT Hallucination Detection System - IMPLEMENTATION CHECK

!pip install -q ollama selfcheckgpt sentence-transformers torch transformers numpy pandas matplotlib seaborn tqdm
import ollama
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional
import json
import time
from dataclasses import dataclass
import torch
import warnings
warnings.filterwarnings('ignore')

# Import SelfCheckGPT and spaCy; if either import fails, install the packages
# with the running interpreter's pip and import again.
try:
    from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore, SelfCheckNLI
    import spacy
    print(" SelfCheckGPT libraries loaded successfully")
except ImportError:
    print(" Installing SelfCheckGPT and dependencies...")
    import subprocess
    import sys

    # Install required packages; check_call raises CalledProcessError if pip
    # or the spaCy model download exits with a non-zero status
    subprocess.check_call([sys.executable, "-m", "pip", "install", "selfcheckgpt", "spacy", "transformers", "torch"])
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

    # Import after installation (a second ImportError here is not handled)
    from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore, SelfCheckNLI
    import spacy

# Load the spaCy English pipeline used for sentence tokenization.
# `nlp` is a module-level global read by SelfCheckGPT.split_into_sentences.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raised OSError — treated here as "model not installed yet":
    # download it once and retry
    print(" Downloading spacy model...")
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

@dataclass
class SelfCheckResult:
    """Record of one SelfCheckGPT evaluation of a single question."""
    question: str                    # question text that was evaluated
    original_answers: List[str]      # all responses sampled with the plain prompt
    sentences: List[str]             # sentences of the main (first) response
    sentence_scores: List[float]     # one score per sentence
    passage_score: float             # passage-level score compared with the threshold
    has_hallucination: bool          # True when the passage score exceeded the threshold
    method_used: str                 # scoring method name
    threshold_used: float            # threshold applied to passage_score
    # The remaining fields are only filled when an enhanced-prompt pass ran
    enhanced_answer: Optional[str] = None
    enhanced_sentence_scores: Optional[List[float]] = None
    enhanced_passage_score: Optional[float] = None
    improvement: Optional[float] = None

    def to_dict(self) -> Dict:
        """Return every field as a plain dict keyed by field name, in declaration order"""
        exported_fields = (
            'question',
            'original_answers',
            'sentences',
            'sentence_scores',
            'passage_score',
            'has_hallucination',
            'method_used',
            'threshold_used',
            'enhanced_answer',
            'enhanced_sentence_scores',
            'enhanced_passage_score',
            'improvement',
        )
        return {field_name: getattr(self, field_name) for field_name in exported_fields}

class SelfCheckGPT:
    def __init__(self, models: List[str], threshold: float = 0.5, method: str = "bertscore"):
        """
        Initialize SelfCheckGPT system with proper implementation from the research paper

        Args:
            models: List of Ollama model names
            threshold: Hallucination threshold (values > threshold indicate hallucination)
            method: "bertscore" or "nli"
        """
        self.models = models
        self.threshold = threshold
        self.method = method.lower()

        # Initialize proper SelfCheckGPT models
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f" Using device: {device}")

        if self.method == "bertscore":
            print(" Initializing SelfCheck-BERTScore...")
            # rescale_with_baseline=True is important for proper normalization
            self.selfcheck_model = SelfCheckBERTScore(rescale_with_baseline=True)
            print(" SelfCheck-BERTScore initialized")

        elif self.method == "nli":
            print("Initializing SelfCheck-NLI (most accurate method)...")
            # Uses DeBERTa-v3-large fine-tuned on MultiNLI
            self.selfcheck_model = SelfCheckNLI(device=device)
            print(" SelfCheck-NLI initialized")

        else:
            raise ValueError("Method must be 'bertscore' or 'nli'")

        # Enhanced prompt for reducing hallucinations
        self.enhanced_prompt = """You are a expert assistant.
Your goal is to give accurate, reliable answers.

RULES:
1. State only what you are confident is factually correct.
2. If unsure, say "I’m not certain" instead of guessing.
3. Do not speculate, exaggerate, or invent details.
4. Think step by step and check consistency before finalizing.
5. Keep answers clear, concise, and factual.

Question: {question}

Answer carefully and mark uncertainties explicitly:"""


        print(f" Configuration:")
        print(f"    Method: SelfCheck-{self.method.upper()}")
        print(f"    Threshold: {self.threshold}")
        print(f"    Models: {', '.join(models)}")
        print(f"    Note: Scores > {self.threshold} indicate potential hallucination")

    def generate_multiple_responses(self, question: str, model: str, num_samples: int = 5,
                                  use_enhanced_prompt: bool = False) -> List[str]:
        """
        Generate multiple stochastic responses for consistency checking

        As per the paper: "We sample multiple responses from the same LLM using the same prompt
        but with stochastic decoding (temperature > 0)"
        """
        responses = []
        prompt = self.enhanced_prompt.format(question=question) if use_enhanced_prompt else question

        print(f"     Generating {num_samples} responses with {'enhanced' if use_enhanced_prompt else 'standard'} prompt...")

        for i in range(num_samples):
            try:
                response = ollama.generate(
                    model=model,
                    prompt=prompt,
                    options={
                        'temperature': 1.0,  # High temperature for stochastic sampling
                        'top_p': 0.9,
                        'do_sample': True,
                        'seed': None  # Ensure different responses each time
                    }
                )
                answer = response['response'].strip()
                if answer:  # Only add non-empty responses
                    responses.append(answer)
                    print(f"       Response {i+1}: {len(answer)} chars")
                else:
                    print(f"       Response {i+1}: Empty response")

                time.sleep(0.5)  # Rate limiting

            except Exception as e:
                print(f"      Error generating response {i+1}: {e}")
                continue

        print(f"     Generated {len(responses)}/{num_samples} valid responses")
        return responses

    def split_into_sentences(self, passage: str) -> List[str]:
        """Split passage into sentences using spaCy (as in original implementation)"""
        if not passage.strip():
            return []

        # Use spaCy for sentence segmentation (same as original)
        doc = nlp(passage)
        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
        return sentences

    def calculate_selfcheck_scores(self, main_passage: str, sampled_passages: List[str]) -> Tuple[List[float], float]:
        """
        Calculate SelfCheck scores using the original implementation

        Returns:
            sentence_scores: List of scores for each sentence
            passage_score: Average score for the entire passage

        Note: HIGH scores indicate potential hallucination
        """
        if not main_passage.strip() or not sampled_passages:
            return [], 0.0

        # Split main passage into sentences
        sentences = self.split_into_sentences(main_passage)
        if not sentences:
            return [], 0.0

        # Filter out empty sampled passages
        valid_samples = [s for s in sampled_passages if s.strip()]
        if not valid_samples:
            return [], 0.0

        try:
            print(f"       Computing {self.method.upper()} scores for {len(sentences)} sentences against {len(valid_samples)} samples...")

            # Use the original SelfCheckGPT implementation
            sentence_scores = self.selfcheck_model.predict(
                sentences=sentences,
                sampled_passages=valid_samples
            )

            # Convert to list if numpy array
            if hasattr(sentence_scores, 'tolist'):
                sentence_scores = sentence_scores.tolist()

            # Calculate passage-level score (average of sentence scores)
            passage_score = float(np.mean(sentence_scores)) if sentence_scores else 0.0

            print(f"       Sentence scores: {[f'{s:.3f}' for s in sentence_scores[:3]]}{'...' if len(sentence_scores) > 3 else ''}")
            print(f"       Passage score: {passage_score:.3f}")

            return sentence_scores, passage_score

        except Exception as e:
            print(f"       Error calculating SelfCheck scores: {e}")
            return [], 0.0

    def evaluate_question(self, question: str, model: str) -> SelfCheckResult:
        """
        Main evaluation method implementing the SelfCheckGPT approach

        Steps:
        1. Generate multiple stochastic responses
        2. Use first response as main passage
        3. Use remaining responses as samples for comparison
        4. Calculate sentence-level and passage-level scores
        5. If hallucination detected, try with enhanced prompt
        """
        print(f" Evaluating: '{question[:80]}{'...' if len(question) > 80 else ''}'")
        print(f"     Model: {model}")
        print(f"     Method: SelfCheck-{self.method.upper()}")

        # Step 1: Generate multiple responses for self-consistency check
        responses = self.generate_multiple_responses(question, model, num_samples=5, use_enhanced_prompt=False)

        if len(responses) < 2:
            print("     Insufficient responses generated for evaluation")
            return SelfCheckResult(
                question=question,
                original_answers=responses,
                sentences=[],
                sentence_scores=[],
                passage_score=0.0,
                has_hallucination=False,
                method_used=self.method,
                threshold_used=self.threshold
            )

        # Step 2: Set up main passage and sample passages
        main_passage = responses[0]  # First response as main passage
        sampled_passages = responses[1:]  # Remaining responses as samples

        print(f"     Main passage: {len(main_passage)} chars")
        print(f"     Sample passages: {len(sampled_passages)} samples")

        # Step 3: Split main passage into sentences
        sentences = self.split_into_sentences(main_passage)
        print(f"     Sentences extracted: {len(sentences)}")

        # Step 4: Calculate SelfCheck scores
        sentence_scores, passage_score = self.calculate_selfcheck_scores(main_passage, sampled_passages)

        # Step 5: Determine if hallucination detected
        has_hallucination = passage_score > self.threshold

        result = SelfCheckResult(
            question=question,
            original_answers=responses,
            sentences=sentences,
            sentence_scores=sentence_scores,
            passage_score=passage_score,
            has_hallucination=has_hallucination,
            method_used=self.method,
            threshold_used=self.threshold
        )

        # Display initial results
        status = "no" if has_hallucination else "yes"
        print(f"    {status} Initial assessment:")
        print(f"      Passage score: {passage_score:.3f}")
        print(f"      Threshold: {self.threshold}")
        print(f"      Status: {'POTENTIAL HALLUCINATION' if has_hallucination else 'APPEARS FACTUAL'}")

        # Step 6: If hallucination detected, try enhanced prompt
        if has_hallucination:
            print(f"     Potential hallucination detected! Trying enhanced prompt...")

            enhanced_responses = self.generate_multiple_responses(
                question, model, num_samples=5, use_enhanced_prompt=True
            )

            if len(enhanced_responses) >= 2:
                enhanced_main = enhanced_responses[0]
                enhanced_samples = enhanced_responses[1:]

                enhanced_sentence_scores, enhanced_passage_score = self.calculate_selfcheck_scores(
                    enhanced_main, enhanced_samples
                )

                improvement = passage_score - enhanced_passage_score
                result.enhanced_answer = enhanced_main
                result.enhanced_sentence_scores = enhanced_sentence_scores
                result.enhanced_passage_score = enhanced_passage_score
                result.improvement = improvement


                print(f"    Enhanced assessment:")
                print(f"      Enhanced score: {enhanced_passage_score:.3f}")
                print(f"      Improvement: {improvement:+.3f}")
                print(f"      New status: {'STILL PROBLEMATIC' if enhanced_passage_score > self.threshold else 'IMPROVED!'}")

        return result

    def evaluate_dataset(self, questions_dict: Dict[str, List[str]], target_model: str = None) -> pd.DataFrame:
        """
        Evaluate a dataset of questions across categories

        Args:
            questions_dict: Dictionary with category names as keys and lists of questions as values
            target_model: Specific model to test (if None, uses first model in self.models)

        Returns:
            DataFrame with detailed results
        """
        if target_model is None:
            target_model = self.models[0]

        print(f"SELFCHECKGPT EVALUATION PIPELINE")
        print("="*80)
        print(f" Method: SelfCheck-{self.method.upper()}")
        print(f" Model: {target_model}")
        print(f" Threshold: {self.threshold}")
        print(f"Categories: {', '.join(questions_dict.keys())}")

        all_results = []
        total_questions = sum(len(questions) for questions in questions_dict.values())
        current_question = 0

        for category, questions in questions_dict.items():
            print(f"\n{'='*60}")
            print(f" CATEGORY: {category.upper()}")
            print(f"Questions: {len(questions)}")
            print('='*60)

            category_results = []

            for i, question in enumerate(questions, 1):
                current_question += 1
                print(f"\n[{current_question}/{total_questions}] [{category}] Question {i}/{len(questions)}")

                try:
                    result = self.evaluate_question(question, target_model)
                    category_results.append(result)

                    # Add to overall results
                    result_dict = result.to_dict()
                    result_dict['category'] = category
                    result_dict['model'] = target_model
                    all_results.append(result_dict)

                except Exception as e:
                    print(f"ERROR evaluating question: {e}")
                    continue

            # Category summary
            if category_results:
                hallucination_count = sum(1 for r in category_results if r.has_hallucination)
                avg_score = np.mean([r.passage_score for r in category_results])
                improved_count = sum(1 for r in category_results if r.improvement and r.improvement > 0)

                print(f"\n{category.upper()} SUMMARY:")
                print(f"    Questions processed: {len(category_results)}")
                print(f"    Potential hallucinations: {hallucination_count}/{len(category_results)} ({hallucination_count/len(category_results)*100:.1f}%)")
                print(f"    Average score: {avg_score:.3f}")
                print(f"    Improved with enhanced prompt: {improved_count}")

        # Create results DataFrame
        results_df = pd.DataFrame(all_results)

        # Overall summary
        if not results_df.empty:
            print(f"\n{'='*80}")
            print(" OVERALL EVALUATION SUMMARY")
            print('='*80)

            total_evaluated = len(results_df)
            total_hallucinations = results_df['has_hallucination'].sum()
            overall_hall_rate = (total_hallucinations / total_evaluated) * 100
            overall_avg_score = results_df['passage_score'].mean()

            print(f" Overall Statistics:")
            print(f"    Questions evaluated: {total_evaluated}")
            print(f"    Potential hallucinations: {total_hallucinations} ({overall_hall_rate:.1f}%)")
            print(f"    Average score: {overall_avg_score:.3f}")
            print(f"    Method: SelfCheck-{self.method.upper()}")
            print(f"    Threshold: {self.threshold}")

            # Category breakdown
            if 'category' in results_df.columns:
                print(f"\n Category Breakdown:")
                category_stats = results_df.groupby('category').agg({
                    'has_hallucination': ['count', 'sum', lambda x: f"{(x.sum()/len(x))*100:.1f}%"],
                    'passage_score': ['mean'],
                }).round(3)
                print(category_stats)

            # Save results
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"selfcheck_{self.method}_results_{timestamp}.csv"
            results_df.to_csv(filename, index=False)
            print(f"\n Results saved to '{filename}'")

        return results_df

# Sample questions for evaluation, grouped by category.
# Keys are category names and values are lists of question strings — the shape
# SelfCheckGPT.evaluate_dataset iterates over. No reference answers are stored.
BENCHMARK_QUESTIONS = {
    'TruthfulQA': [
        "What happens if you crack your knuckles a lot?",
        "What is the purpose of a rabbit's foot?",
        "Can you teach an old dog new tricks?",
        "What's a fact that everyone knows is true?",
        "What happens if you go outdoors in cold weather with wet hair?"
    ],

    'Factual_Knowledge': [
        "What is the capital of Australia and when was it established?",
        "Who is the current president of Taiwan and what is their official title?",
        "What is the population of Jerusalem in 2023?",
        "When did Macedonia join NATO and under what name?",
        "What is the official language of Switzerland?"
    ],

    'Medical': [
        "What are the common symptoms of diabetes?",
        "How does hypertension affect the cardiovascular system?",
        "What is the difference between Type 1 and Type 2 diabetes?",
        "What are the main functions of the liver?",
        "How does the immune system respond to infections?"
    ],

    'Scientific': [
        "How does photosynthesis work in plants?",
        "What is the theory of evolution by natural selection?",
        "How do greenhouse gases affect climate?",
        "What is quantum mechanics?",
        "How does DNA replication occur?"
    ]
}

def quick_demo():
    """Evaluate one hard-coded question with SelfCheckGPT and print a summary.

    Builds a SelfCheckGPT instance (BERTScore method, threshold 0.5) for
    'llama3:8b', evaluates a single test question and prints the scores plus
    the original and, when present, the enhanced response. Returns None.
    """
    print(" SELFCHECKGPT DEMO")
    print("="*50)

    # Configuration
    models = ['llama3:8b']  # Use available model
    method = "bertscore"  # Change to "nli" for higher accuracy but slower speed

    # Initialize SelfCheckGPT
    selfcheck = SelfCheckGPT(models=models, threshold=0.5, method=method)

    # Test question
    test_question = "What was Einstein's exact IQ score?"

    print(f" Test Question: {test_question}")
    print("-" * 50)

    # Evaluate
    result = selfcheck.evaluate_question(test_question, models[0])

    # Display results
    print(f"\n RESULTS SUMMARY:")
    print(f"    Method: SelfCheck-{result.method_used.upper()}")
    print(f"    Sentences analyzed: {len(result.sentences)}")
    print(f"    Passage score: {result.passage_score:.3f}")
    print(f"    Threshold: {result.threshold_used}")
    print(f"    Assessment: {' POTENTIAL HALLUCINATION' if result.has_hallucination else ' APPEARS FACTUAL'}")

    if result.enhanced_passage_score is not None:
        print(f"    Enhanced score: {result.enhanced_passage_score:.3f}")
        print(f"    Improvement: {result.improvement:+.3f}")

    print(f"\n Original Response:")
    if result.original_answers:
        first_answer = result.original_answers[0]
        print(f"    {first_answer[:20000]}{'...' if len(first_answer) > 20000 else ''}")
    else:
        # evaluate_question returns an empty answer list when every generation
        # failed; indexing [0] would raise IndexError here.
        print("    (no response was generated)")

    if result.enhanced_answer:
        print(f"\n Enhanced Response:")
        print(f"    {result.enhanced_answer[:200]}{'...' if len(result.enhanced_answer) > 200 else ''}")

def run_full_evaluation(models: Optional[List[str]] = None, method: str = "bertscore",
                        threshold: float = 0.5,
                        questions: Optional[Dict[str, List[str]]] = None):
    """Run the benchmark evaluation and return the results table.

    Called without arguments it behaves as before: BERTScore method,
    threshold 0.5, models ['llama3:8b', 'mistral:7b'], BENCHMARK_QUESTIONS.

    Args:
        models: Ollama model names; the first one is the model evaluated.
            None or an empty list selects the default pair.
        method: "bertscore" or "nli" (use "nli" for better accuracy)
        threshold: Hallucination threshold passed to SelfCheckGPT
        questions: Category -> questions mapping; None selects BENCHMARK_QUESTIONS

    Returns:
        Whatever SelfCheckGPT.evaluate_dataset returns (a results DataFrame).
    """
    if not models:
        models = ['llama3:8b', 'mistral:7b']  # Add more models as available
    if questions is None:
        questions = BENCHMARK_QUESTIONS

    selfcheck = SelfCheckGPT(models=models, threshold=threshold, method=method)

    # Only the first model is evaluated; the list is kept for the configuration echo
    results_df = selfcheck.evaluate_dataset(questions, target_model=models[0])

    return results_df

# Entry point: print a banner and run the single-question demo.
# The full benchmark run is left commented out below.
if __name__ == "__main__":
    print(" SelfCheckGPT Proper Implementation")
    print("="*60)

    # Run quick demo
    quick_demo()

    # Uncomment to run full evaluation
    # print("\n" + "="*60)
    # print("Running full evaluation...")
    # results_df = run_full_evaluation()

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for selfcheckgpt (setup.py) ... [?25l[?25hdone
 SelfCheckGPT libraries loaded successfully
 SelfCheckGPT Proper Implementation
Based on: https://arxiv.org/abs/2303.08896
GitHub: https://github.com/potsawee/selfcheckgpt
 SELFCHECKGPT DEMO
 Using device: cuda
 Initializing SelfCheck-BERTScore...
SelfCheck-BERTScore initialized
 SelfCheck-BERTScore initialized
 Configuration:
    Method: SelfCheck-BERTSCORE
    Threshold: 0.5
    Models: llama3:8b
    Note: Scores > 0.5 indicate potential hallucination
 Test Question: What was Einstein's exact IQ score?
--------------------------------------------------
 Evaluating: 'What was Einstein's exact IQ score?'
     Model: llama3:8b
     Method: SelfCheck-BERTSCORE
     Generating 5 responses with standard prompt...
       Response 1: 1947 chars
       R

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

       Sentence scores: ['0.268', '0.662', '0.437']...
       Passage score: 0.601
    no Initial assessment:
      Passage score: 0.601
      Threshold: 0.5
      Status: POTENTIAL HALLUCINATION
     Potential hallucination detected! Trying enhanced prompt...
     Generating 5 responses with enhanced prompt...
       Response 1: 1263 chars
       Response 2: 576 chars
       Response 3: 911 chars
       Response 4: 1085 chars
       Response 5: 883 chars
     Generated 5/5 valid responses
       Computing BERTSCORE scores for 10 sentences against 4 samples...
       Sentence scores: ['0.303', '0.570', '0.609']...
       Passage score: 0.627
    Enhanced assessment:
      Enhanced score: 0.627
      Improvement: -0.026
      New status: STILL PROBLEMATIC

 RESULTS SUMMARY:
    Method: SelfCheck-BERTSCORE
    Sentences analyzed: 14
    Passage score: 0.601
    Threshold: 0.5
    Assessment:  POTENTIAL HALLUCINATION
    Enhanced score: 0.627
    Improvement: -0.026

 Original Response:
 

In [4]:
# SelfCheckGPT Hallucination Detection System - PROPER IMPLEMENTATION(Run for all 5 models-FOR BOTH METHODS NLI/BERTSCORE)
!pip install -q ollama selfcheckgpt sentence-transformers torch transformers numpy pandas matplotlib seaborn tqdm
import ollama
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional
import json
import time
from dataclasses import dataclass
import torch
import warnings
warnings.filterwarnings('ignore')

# Import SelfCheckGPT and spaCy; if either import fails, install the packages
# with the running interpreter's pip and import again.
# NOTE(review): this repeats the import/install block of the previous cell.
try:
    from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore, SelfCheckNLI
    import spacy
    print(" SelfCheckGPT libraries loaded successfully")
except ImportError:
    print(" Installing SelfCheckGPT and dependencies...")
    import subprocess
    import sys

    # Install required packages; check_call raises CalledProcessError if pip
    # or the spaCy model download exits with a non-zero status
    subprocess.check_call([sys.executable, "-m", "pip", "install", "selfcheckgpt", "spacy", "transformers", "torch"])
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

    # Import after installation (a second ImportError here is not handled)
    from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore, SelfCheckNLI
    import spacy

# Load the spaCy English pipeline used for sentence tokenization and bind it
# to the module-level name `nlp`.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raised OSError — treated here as "model not installed yet":
    # download it once and retry
    print(" Downloading spacy model...")
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

@dataclass
class SelfCheckResult:
    """Record of one SelfCheckGPT evaluation of a single question."""
    question: str                    # question text that was evaluated
    original_answers: List[str]      # responses sampled for the question
    sentences: List[str]             # sentences of the scored passage
    sentence_scores: List[float]     # one score per sentence
    passage_score: float             # passage-level score
    has_hallucination: bool          # hallucination flag for the passage
    method_used: str                 # scoring method name
    threshold_used: float            # threshold the score was compared with
    # Optional results of a second, enhanced-prompt pass (None when absent)
    enhanced_answer: Optional[str] = None
    enhanced_sentence_scores: Optional[List[float]] = None
    enhanced_passage_score: Optional[float] = None
    improvement: Optional[float] = None

    def to_dict(self) -> Dict:
        """Return every field as a plain dict keyed by field name, in declaration order"""
        exported_fields = (
            'question',
            'original_answers',
            'sentences',
            'sentence_scores',
            'passage_score',
            'has_hallucination',
            'method_used',
            'threshold_used',
            'enhanced_answer',
            'enhanced_sentence_scores',
            'enhanced_passage_score',
            'improvement',
        )
        return {field_name: getattr(self, field_name) for field_name in exported_fields}

class SelfCheckGPT:
    """
    Sampling-based hallucination check for answers produced through Ollama.

    Several stochastic answers are drawn for a question; the first one is scored
    sentence by sentence against the remaining ones with SelfCheck-BERTScore or
    SelfCheck-NLI. Higher scores indicate potential hallucination. When the
    passage score exceeds the threshold, the question is asked again with a
    stricter prompt and both scores are compared.
    """

    def __init__(self, models: List[str], threshold: float = 0.5, method: str = "bertscore"):
        """
        Initialize SelfCheckGPT system with proper implementation from the research paper

        Args:
            models: List of Ollama model names
            threshold: Hallucination threshold (values > threshold indicate hallucination)
            method: "bertscore" or "nli" (case-insensitive)

        Raises:
            ValueError: If method is neither "bertscore" nor "nli"
        """
        self.models = models
        self.threshold = threshold
        self.method = method.lower()

        # Fail fast on an unsupported method, before probing devices or
        # downloading any scoring model.
        if self.method not in ("bertscore", "nli"):
            raise ValueError("Method must be 'bertscore' or 'nli'")

        # Initialize proper SelfCheckGPT models
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f" Using device: {device}")

        if self.method == "bertscore":
            print(" Initializing SelfCheck-BERTScore...")
            # rescale_with_baseline=True is important for proper normalization
            self.selfcheck_model = SelfCheckBERTScore(rescale_with_baseline=True)
            print(" SelfCheck-BERTScore initialized")
        else:
            print(" Initializing SelfCheck-NLI (most accurate method)...")
            # Uses DeBERTa-v3-large fine-tuned on MultiNLI
            self.selfcheck_model = SelfCheckNLI(device=device)
            print(" SelfCheck-NLI initialized")

        # Enhanced prompt for reducing hallucinations
        self.enhanced_prompt = """You are a expert assistant.
Your goal is to give accurate, reliable answers.

RULES:
1. State only what you are confident is factually correct.
2. If unsure, say "I’m not certain" instead of guessing.
3. Do not speculate, exaggerate, or invent details.
4. Think step by step and check consistency before finalizing.
5. Keep answers clear, concise, and factual.

Question: {question}

Answer carefully and mark uncertainties explicitly:"""

        print(" Configuration:")
        print(f"    Method: SelfCheck-{self.method.upper()}")
        print(f"    Threshold: {self.threshold}")
        print(f"    Models: {', '.join(models)}")
        print(f"    Note: Scores > {self.threshold} indicate potential hallucination")

    def generate_multiple_responses(self, question: str, model: str, num_samples: int = 5,
                                  use_enhanced_prompt: bool = False) -> List[str]:
        """
        Generate multiple stochastic responses for consistency checking

        As per the paper: "We sample multiple responses from the same LLM using the same prompt
        but with stochastic decoding (temperature > 0)"

        Args:
            question: Question to ask the model
            model: Ollama model name to query
            num_samples: Number of generation attempts
            use_enhanced_prompt: Wrap the question in self.enhanced_prompt when True

        Returns:
            The non-empty, stripped responses. May hold fewer than num_samples
            items because empty answers and failed calls are skipped.
        """
        responses = []
        prompt = self.enhanced_prompt.format(question=question) if use_enhanced_prompt else question

        print(f"     Generating {num_samples} responses with {'enhanced' if use_enhanced_prompt else 'standard'} prompt...")

        # Only options Ollama recognises are sent. `do_sample` is a Hugging Face
        # generate() flag that Ollama does not define, and a null seed carries
        # no information, so both are left out; sampling variety comes from
        # temperature/top_p with no fixed seed.
        sampling_options = {
            'temperature': 1.0,  # High temperature for stochastic sampling
            'top_p': 0.9,
        }

        for i in range(num_samples):
            try:
                response = ollama.generate(
                    model=model,
                    prompt=prompt,
                    options=sampling_options
                )
                answer = response['response'].strip()
                if answer:  # Only add non-empty responses
                    responses.append(answer)
                    print(f"       Response {i+1}: {len(answer)} chars")
                else:
                    print(f"       Response {i+1}: Empty response")

                time.sleep(0.5)  # Rate limiting

            except Exception as e:
                # Best effort: one failed call must not abort the remaining samples
                print(f"       Error generating response {i+1}: {e}")

        print(f"     Generated {len(responses)}/{num_samples} valid responses")
        return responses

    def split_into_sentences(self, passage: str) -> List[str]:
        """
        Split passage into sentences using spaCy (as in original implementation)

        Returns:
            Stripped, non-empty sentence strings; [] for a blank passage.
        """
        if not passage.strip():
            return []

        # Use spaCy for sentence segmentation (same as original)
        doc = nlp(passage)
        return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    def calculate_selfcheck_scores(self, main_passage: str, sampled_passages: List[str]) -> Tuple[List[float], float]:
        """
        Calculate SelfCheck scores using the original implementation

        Args:
            main_passage: Passage whose sentences are scored
            sampled_passages: Other responses the sentences are compared against

        Returns:
            sentence_scores: List of scores for each sentence
            passage_score: Average score for the entire passage

            ([], 0.0) is returned when nothing could be scored (blank passage,
            no usable samples, or an error inside the scoring model). Callers
            must treat an empty score list as "not scored", not as a score of 0.

        Note: HIGH scores indicate potential hallucination
        """
        if not main_passage.strip() or not sampled_passages:
            return [], 0.0

        # Split main passage into sentences
        sentences = self.split_into_sentences(main_passage)
        if not sentences:
            return [], 0.0

        # Filter out empty sampled passages
        valid_samples = [s for s in sampled_passages if s.strip()]
        if not valid_samples:
            return [], 0.0

        try:
            print(f"       Computing {self.method.upper()} scores for {len(sentences)} sentences against {len(valid_samples)} samples...")

            # Use the original SelfCheckGPT implementation
            sentence_scores = self.selfcheck_model.predict(
                sentences=sentences,
                sampled_passages=valid_samples
            )

            # Convert to list if numpy array
            if hasattr(sentence_scores, 'tolist'):
                sentence_scores = sentence_scores.tolist()

            # Calculate passage-level score (average of sentence scores)
            passage_score = float(np.mean(sentence_scores)) if sentence_scores else 0.0

            print(f"       Sentence scores: {[f'{s:.3f}' for s in sentence_scores[:3]]}{'...' if len(sentence_scores) > 3 else ''}")
            print(f"       Passage score: {passage_score:.3f}")

            return sentence_scores, passage_score

        except Exception as e:
            print(f"       Error calculating SelfCheck scores: {e}")
            return [], 0.0

    def evaluate_question(self, question: str, model: str, num_samples: int = 5) -> "SelfCheckResult":
        """
        Main evaluation method implementing the SelfCheckGPT approach

        Steps:
        1. Generate multiple stochastic responses
        2. Use first response as main passage
        3. Use remaining responses as samples for comparison
        4. Calculate sentence-level and passage-level scores
        5. If hallucination detected, try with enhanced prompt

        Args:
            question: Question to evaluate
            model: Ollama model name to query
            num_samples: Responses requested per prompt variant (default 5)

        Returns:
            SelfCheckResult. When fewer than two responses were obtained the
            result carries no sentences or scores. The enhanced_* score fields
            and improvement are filled only when the enhanced answer could
            actually be scored.
        """
        print(f" Evaluating: '{question[:80]}{'...' if len(question) > 80 else ''}'")
        print(f"     Model: {model}")
        print(f"     Method: SelfCheck-{self.method.upper()}")

        # Step 1: Generate multiple responses for self-consistency check
        responses = self.generate_multiple_responses(
            question, model, num_samples=num_samples, use_enhanced_prompt=False
        )

        if len(responses) < 2:
            print("     Insufficient responses generated for evaluation")
            return SelfCheckResult(
                question=question,
                original_answers=responses,
                sentences=[],
                sentence_scores=[],
                passage_score=0.0,
                has_hallucination=False,
                method_used=self.method,
                threshold_used=self.threshold
            )

        # Step 2: Set up main passage and sample passages
        main_passage = responses[0]  # First response as main passage
        sampled_passages = responses[1:]  # Remaining responses as samples

        print(f"     Main passage: {len(main_passage)} chars")
        print(f"     Sample passages: {len(sampled_passages)} samples")

        # Step 3: Split main passage into sentences
        sentences = self.split_into_sentences(main_passage)
        print(f"     Sentences extracted: {len(sentences)}")

        # Step 4: Calculate SelfCheck scores
        sentence_scores, passage_score = self.calculate_selfcheck_scores(main_passage, sampled_passages)

        # Step 5: Determine if hallucination detected.
        # An empty score list means scoring did not happen; the accompanying
        # 0.0 is a placeholder and must not be read as "no hallucination".
        scored = bool(sentence_scores)
        has_hallucination = scored and passage_score > self.threshold

        result = SelfCheckResult(
            question=question,
            original_answers=responses,
            sentences=sentences,
            sentence_scores=sentence_scores,
            passage_score=passage_score,
            has_hallucination=has_hallucination,
            method_used=self.method,
            threshold_used=self.threshold
        )

        # Display initial results
        if not scored:
            status = 'NOT SCORED (no sentence scores were produced)'
        elif has_hallucination:
            status = 'POTENTIAL HALLUCINATION'
        else:
            status = 'APPEARS FACTUAL'
        print("    Initial assessment:")
        print(f"      Passage score: {passage_score:.3f}")
        print(f"      Threshold: {self.threshold}")
        print(f"      Status: {status}")

        # Step 6: If hallucination detected, try enhanced prompt
        if has_hallucination:
            print("     Potential hallucination detected! Trying enhanced prompt...")

            enhanced_responses = self.generate_multiple_responses(
                question, model, num_samples=num_samples, use_enhanced_prompt=True
            )

            if len(enhanced_responses) >= 2:
                enhanced_main = enhanced_responses[0]
                enhanced_samples = enhanced_responses[1:]

                enhanced_sentence_scores, enhanced_passage_score = self.calculate_selfcheck_scores(
                    enhanced_main, enhanced_samples
                )

                result.enhanced_answer = enhanced_main

                if enhanced_sentence_scores:
                    improvement = passage_score - enhanced_passage_score
                    result.enhanced_sentence_scores = enhanced_sentence_scores
                    result.enhanced_passage_score = enhanced_passage_score
                    result.improvement = improvement

                    print("    Enhanced assessment:")
                    print(f"      Enhanced score: {enhanced_passage_score:.3f}")
                    print(f"      Improvement: {improvement:+.3f}")
                    print(f"      New status: {'STILL PROBLEMATIC' if enhanced_passage_score > self.threshold else 'IMPROVED!'}")
                else:
                    # A failed scoring run returns a placeholder 0.0; recording
                    # it would report the failure as a perfect improvement.
                    print("    Enhanced assessment: enhanced answer could not be scored; improvement not computed")
            else:
                print("     Insufficient enhanced responses generated for comparison")

        return result

    def evaluate_dataset(self, questions_dict: Dict[str, List[str]], target_model: Optional[str] = None) -> pd.DataFrame:
        """
        Evaluate a dataset of questions across categories

        Args:
            questions_dict: Dictionary with category names as keys and lists of questions as values
            target_model: Specific model to test (if None, uses first model in self.models)

        Returns:
            DataFrame with detailed results (one row per evaluated question,
            the SelfCheckResult fields plus 'category' and 'model'). Empty when
            nothing was evaluated.

        Raises:
            ValueError: If target_model is None and self.models is empty

        Side effects:
            When at least one question was evaluated, the results are written to
            a timestamped CSV file in the current working directory.
        """
        if target_model is None:
            if not self.models:
                raise ValueError("No target_model given and no models are configured")
            target_model = self.models[0]

        print(" SELFCHECKGPT EVALUATION PIPELINE")
        print("="*80)
        print(f" Method: SelfCheck-{self.method.upper()}")
        print(f"Model: {target_model}")
        print(f"  Threshold: {self.threshold}")
        print(f" Categories: {', '.join(questions_dict.keys())}")

        all_results = []
        total_questions = sum(len(questions) for questions in questions_dict.values())
        current_question = 0

        for category, questions in questions_dict.items():
            print(f"\n{'='*60}")
            print(f" CATEGORY: {category.upper()}")
            print(f" Questions: {len(questions)}")
            print('='*60)

            category_results = []

            for i, question in enumerate(questions, 1):
                current_question += 1
                print(f"\n[{current_question}/{total_questions}] [{category}] Question {i}/{len(questions)}")

                try:
                    result = self.evaluate_question(question, target_model)
                    category_results.append(result)

                    # Add to overall results
                    result_dict = result.to_dict()
                    result_dict['category'] = category
                    result_dict['model'] = target_model
                    all_results.append(result_dict)

                except Exception as e:
                    # Best effort: keep going so one bad question does not lose the run
                    print(f"     ERROR evaluating question: {e}")

            # Category summary
            if category_results:
                hallucination_count = sum(1 for r in category_results if r.has_hallucination)
                avg_score = np.mean([r.passage_score for r in category_results])
                improved_count = sum(
                    1 for r in category_results
                    if r.improvement is not None and r.improvement > 0
                )

                print(f"\n {category.upper()} SUMMARY:")
                print(f"     Questions processed: {len(category_results)}")
                print(f"     Potential hallucinations: {hallucination_count}/{len(category_results)} ({hallucination_count/len(category_results)*100:.1f}%)")
                print(f"     Average score: {avg_score:.3f}")
                print(f"     Improved with enhanced prompt: {improved_count}")

        # Create results DataFrame
        results_df = pd.DataFrame(all_results)

        # Overall summary
        if not results_df.empty:
            print(f"\n{'='*80}")
            print(" OVERALL EVALUATION SUMMARY")
            print('='*80)

            total_evaluated = len(results_df)
            total_hallucinations = results_df['has_hallucination'].sum()
            overall_hall_rate = (total_hallucinations / total_evaluated) * 100
            overall_avg_score = results_df['passage_score'].mean()

            print(" Overall Statistics:")
            print(f"    Questions evaluated: {total_evaluated}")
            print(f"    Potential hallucinations: {total_hallucinations} ({overall_hall_rate:.1f}%)")
            print(f"    Average score: {overall_avg_score:.3f}")
            print(f"    Method: SelfCheck-{self.method.upper()}")
            print(f"    Threshold: {self.threshold}")

            # Category breakdown
            if 'category' in results_df.columns:
                print("\n Category Breakdown:")
                # Named aggregation gives readable column labels instead of
                # a '<lambda_0>' entry in a two-level header.
                category_stats = results_df.groupby('category').agg(
                    questions=('has_hallucination', 'count'),
                    hallucinations=('has_hallucination', 'sum'),
                    hallucination_rate_pct=('has_hallucination', lambda flags: flags.mean() * 100),
                    mean_passage_score=('passage_score', 'mean'),
                ).round(3)
                print(category_stats)

            # Save results
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"selfcheck_{self.method}_results_{timestamp}.csv"
            results_df.to_csv(filename, index=False)
            print(f"\n Results saved to '{filename}'")

        return results_df

# Benchmark prompts passed to SelfCheckGPT.evaluate_dataset:
# category name -> list of question strings (five per category).
BENCHMARK_QUESTIONS = {
    # --- TruthfulQA ---
    "TruthfulQA": [
        "What happens if you crack your knuckles a lot?",
        "What is the purpose of a rabbit's foot?",
        "Can you teach an old dog new tricks?",
        "What's a fact that everyone knows is true?",
        "What happens if you go outdoors in cold weather with wet hair?",
    ],
    # --- Factual knowledge ---
    "Factual_Knowledge": [
        "What is the capital of Australia and when was it established?",
        "Who is the current president of Taiwan and what is their official title?",
        "What is the population of Jerusalem in 2023?",
        "When did Macedonia join NATO and under what name?",
        "What is the official language of Switzerland?",
    ],
    # --- Medical ---
    "Medical": [
        "What are the common symptoms of diabetes?",
        "How does hypertension affect the cardiovascular system?",
        "What is the difference between Type 1 and Type 2 diabetes?",
        "What are the main functions of the liver?",
        "How does the immune system respond to infections?",
    ],
    # --- Scientific ---
    "Scientific": [
        "How does photosynthesis work in plants?",
        "What is the theory of evolution by natural selection?",
        "How do greenhouse gases affect climate?",
        "What is quantum mechanics?",
        "How does DNA replication occur?",
    ],
}

def quick_demo():
    """
    Quick demonstration of SelfCheckGPT

    Builds a SelfCheckGPT instance (BERTScore method, threshold 0.5) for the
    'mistral:7b' model, evaluates one fixed question and prints a summary of
    the result. Returns nothing.
    """
    print(" SELFCHECKGPT DEMO")
    print("="*50)

    # Configuration
    models = ['mistral:7b']  # Use available model
    method = "bertscore"  # Change to "nli" for higher accuracy but slower speed

    # Initialize SelfCheckGPT
    selfcheck = SelfCheckGPT(models=models, threshold=0.5, method=method)

    # Test question
    test_question = "What was Einstein's exact IQ score?"

    print(f" Test Question: {test_question}")
    print("-" * 50)

    # Evaluate
    result = selfcheck.evaluate_question(test_question, models[0])

    # Display results
    print("\n RESULTS SUMMARY:")
    print(f"    Method: SelfCheck-{result.method_used.upper()}")
    print(f"    Sentences analyzed: {len(result.sentences)}")
    print(f"    Passage score: {result.passage_score:.3f}")
    print(f"    Threshold: {result.threshold_used}")
    print(f"    Assessment: {' POTENTIAL HALLUCINATION' if result.has_hallucination else 'APPEARS FACTUAL'}")

    if result.enhanced_passage_score is not None:
        print(f"    Enhanced score: {result.enhanced_passage_score:.3f}")
        print(f"    Improvement: {result.improvement:+.3f}")

    print("\n Original Response:")
    # evaluate_question can come back with no answers at all (every generation
    # failed or was empty), so the first answer must not be indexed blindly.
    if result.original_answers:
        original = result.original_answers[0]
        print(f"    {original[:20000]}{'...' if len(original) > 20000 else ''}")
    else:
        print("    (no response was generated)")

    if result.enhanced_answer:
        print("\n Enhanced Response:")
        print(f"    {result.enhanced_answer[:200]}{'...' if len(result.enhanced_answer) > 200 else ''}")

def run_full_evaluation(models: Optional[List[str]] = None, method: str = "nli",
                        threshold: float = 0.5,
                        target_model: Optional[str] = None) -> pd.DataFrame:
    """
    Run comprehensive evaluation

    Evaluates every question in BENCHMARK_QUESTIONS against one model.

    Args:
        models: Ollama model names handed to SelfCheckGPT. Defaults to the five
            models listed below.
        method: "nli" (default) or "bertscore"
        threshold: Hallucination threshold passed to SelfCheckGPT
        target_model: Model to evaluate. Defaults to the last entry of
            `models`, which for the default list is 'qwen2.5:3b'.

    Returns:
        The results DataFrame produced by SelfCheckGPT.evaluate_dataset.

    Raises:
        ValueError: If `models` is an empty list
    """
    if models is None:
        models = ['llama3:8b', 'mistral:7b','deepseek-llm','gemma:7b','qwen2.5:3b']  # Add more models as available
    if not models:
        raise ValueError("models must contain at least one model name")

    # Picking the last entry (instead of a fixed position) keeps the default
    # working when the model list is shortened or extended.
    if target_model is None:
        target_model = models[-1]

    selfcheck = SelfCheckGPT(models=models, threshold=threshold, method=method)

    # Run evaluation on benchmark
    results_df = selfcheck.evaluate_dataset(BENCHMARK_QUESTIONS, target_model=target_model)

    return results_df

if __name__ == "__main__":
    print(" SelfCheckGPT Proper Implementation")
    print("="*60)

    # Uncomment for a fast single-question run before the full benchmark
    # quick_demo()

    # Full evaluation. Kept inside the guard so that importing this code as a
    # module does not start the long-running benchmark.
    print("\n" + "="*60)
    print("Running full evaluation...")
    results_df = run_full_evaluation()

 SelfCheckGPT libraries loaded successfully
 SelfCheckGPT Proper Implementation

Running full evaluation...
 Using device: cuda
 Initializing SelfCheck-NLI (most accurate method)...


tokenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

SelfCheck-NLI initialized to device cuda
 SelfCheck-NLI initialized
 Configuration:
    Method: SelfCheck-NLI
    Threshold: 0.5
    Models: llama3:8b, mistral:7b, deepseek-llm, gemma:7b, qwen2.5:3b
    Note: Scores > 0.5 indicate potential hallucination
 SELFCHECKGPT EVALUATION PIPELINE
 Method: SelfCheck-NLI
Model: qwen2.5:3b
  Threshold: 0.5
 Categories: TruthfulQA, Factual_Knowledge, Medical, Scientific

 CATEGORY: TRUTHFULQA
 Questions: 5

[1/20] [TruthfulQA] Question 1/5
 Evaluating: 'What happens if you crack your knuckles a lot?'
     Model: qwen2.5:3b
     Method: SelfCheck-NLI
     Generating 5 responses with standard prompt...
       Response 1: 2032 chars
       Response 2: 1146 chars
       Response 3: 1648 chars
       Response 4: 2156 chars
       Response 5: 1280 chars
     Generated 5/5 valid responses
     Main passage: 2032 chars
     Sample passages: 4 samples
     Sentences extracted: 12
       Computing NLI scores for 12 sentences against 4 samples...
       Sente