# Install required libraries

In [1]:
!pip install wikipedia-api sentence-transformers transformers scikit-learn nltk spacy requests beautifulsoup4 -q
!python -m spacy download en_core_web_sm -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Training Siamese Network

## Import

In [2]:
import os
import re
import time
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Config

In [3]:
tf.random.set_seed(42)
np.random.seed(42)

print("TF version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices('GPU'))

# Enable mixed precision for faster GPU training (if GPU supports it)
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
print("Mixed precision policy:", mixed_precision.global_policy())


TF version: 2.19.0
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Mixed precision policy: <DTypePolicy "mixed_float16">


## Data Loading & Preprocessing

In [4]:
CSV_PATH = "questions.csv"   # your file
MAX_FEATURES = 50000
MAX_LEN = 30

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("Loading data...")
df = pd.read_csv(CSV_PATH)
df['question1_clean'] = df['question1'].apply(preprocess_text)
df['question2_clean'] = df['question2'].apply(preprocess_text)
df = df[(df['question1_clean'] != '') & (df['question2_clean'] != '')]

all_qs = pd.concat([df['question1_clean'], df['question2_clean']], ignore_index=True)
tokenizer = Tokenizer(num_words=MAX_FEATURES, lower=True)
tokenizer.fit_on_texts(all_qs)

q1_seq = tokenizer.texts_to_sequences(df['question1_clean'])
q2_seq = tokenizer.texts_to_sequences(df['question2_clean'])
q1_data = pad_sequences(q1_seq, maxlen=MAX_LEN)
q2_data = pad_sequences(q2_seq, maxlen=MAX_LEN)

labels = df['is_duplicate'].values.astype('int32')

X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    q1_data, q2_data, labels, test_size=0.2, random_state=42, stratify=labels
)

print("Train size:", len(X1_train), "Val size:", len(X1_test))
vocab_size = min(MAX_FEATURES, len(tokenizer.word_index) + 1)


Loading data...
Train size: 79991 Val size: 19998


## Data Split

In [5]:
BATCH_SIZE = 64   # increase to 128 if GPU memory allows
AUTOTUNE = tf.data.AUTOTUNE

train_ds = tf.data.Dataset.from_tensor_slices(((X1_train, X2_train), y_train))
train_ds = train_ds.shuffle(buffer_size=10000).batch(BATCH_SIZE).prefetch(AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices(((X1_test, X2_test), y_test))
val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

## Model

In [6]:
def create_base_network(input_shape, vocab_size, embed_dim=100):
    inp = Input(shape=input_shape)
    x = Embedding(vocab_size, embed_dim, input_length=input_shape[0])(inp)
    # Use standard LSTM without recurrent_dropout so CuDNN kernel is used on GPU
    x = LSTM(64, return_sequences=True, dropout=0.2)(x)
    x = LSTM(32, dropout=0.2)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(16, activation='relu')(x)
    return Model(inp, x)

def euclidean_distance(vects):
    x, y = vects
    # Cast to float32 for safe loss computation when using mixed precision
    x = tf.cast(x, tf.float32)
    y = tf.cast(y, tf.float32)
    sum_sq = tf.reduce_sum(tf.square(x - y), axis=1, keepdims=True)
    return tf.sqrt(tf.maximum(sum_sq, tf.keras.backend.epsilon()))

def contrastive_loss(y_true, y_pred, margin=1.0):
    # ensure computations in float32
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    square_pred = tf.square(y_pred)
    margin_sq = tf.square(tf.maximum(margin - y_pred, 0.0))
    return tf.reduce_mean(y_true * square_pred + (1.0 - y_true) * margin_sq)

input_shape = (MAX_LEN,)
base_network = create_base_network(input_shape, vocab_size, embed_dim=100)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=(1,))([processed_a, processed_b])

siamese_model = Model([input_a, input_b], distance)

# Optimizer: with mixed precision, using Adam is fine
opt = Adam(learning_rate=1e-3)
siamese_model.compile(loss=contrastive_loss, optimizer=opt)  # no accuracy metric (not meaningful for distance)

siamese_model.summary()



## Timing and Callback

In [7]:
class TimeCallback(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start = time.time()
    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch} time: {time.time() - self.epoch_start:.2f}s")

time_cb = TimeCallback()
early_stopping = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_siamese_model_gpu.h5', monitor='val_loss', save_best_only=True)

## Train

In [8]:
EPOCHS = 20
history = siamese_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[time_cb, early_stopping, checkpoint],
    verbose=1
)

Epoch 1/20
[1m1248/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ[0m [1m0s[0m 17ms/step - loss: 0.2386



Epoch 0 time: 33.73s
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m34s[0m 19ms/step - loss: 0.2385 - val_loss: 0.2703
Epoch 2/20
[1m1248/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ[0m [1m0s[0m 17ms/step - loss: 0.1670



Epoch 1 time: 23.85s
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m24s[0m 19ms/step - loss: 0.1669 - val_loss: 0.2399
Epoch 3/20
[1m1249/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ[0m [1m0s[0m 16ms/step - loss: 0.1399



Epoch 2 time: 39.88s
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m40s[0m 19ms/step - loss: 0.1399 - val_loss: 0.2179
Epoch 4/20
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.1189



Epoch 3 time: 23.31s
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m23s[0m 19ms/step - loss: 0.1189 - val_loss: 0.2053
Epoch 5/20
[1m1248/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ[0m [1m0s[0m 16ms/step - loss: 0.1027Epoch 4 time: 21.84s
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m22s[0m 17ms/step - loss: 0.1027 - val_loss: 0.2093
Epoch 6/20
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.0907Epoch 5 time: 22.66s
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m23s[0m 18ms/step - loss: 0.0907 - val_loss: 0.2100
Epoch 7/20
[1m1250/1250[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 17ms/step - lo

## Evaluate

In [9]:
import numpy as np
preds = siamese_model.predict([X1_test, X2_test], batch_size=BATCH_SIZE)
# choose threshold by inspection; here 0.5 like before
pred_labels = (preds.flatten() < 0.5).astype(int)
acc = np.mean(pred_labels == y_test)
print("Manual val accuracy (threshold=0.5):", acc)


[1m313/313[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 6ms/step
Manual val accuracy (threshold=0.5): 0.6932193219321933


## Save

In [10]:
siamese_model.save('siamese_question_similarity_model_gpu.h5')
import pickle
with open('tokenizer.pickle', 'wb') as f:
    pickle.dump(tokenizer, f)



# Load Model and Tokenizer

## Install Libraries

In [11]:
# Run this cell first to install all dependencies
import subprocess
import sys

def install_requirements():
    """Install all required packages"""
    packages = [
        'nltk',
        'spacy',
        'sentence-transformers',
        'wikipediaapi',
        'beautifulsoup4',
        'requests',
        'scikit-learn',
        'numpy',
        'pandas'
    ]

    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            print(f"‚úì {package} installed")
        except Exception as e:
            print(f"Warning: Could not install {package}: {e}")

    # Install spacy English model
    try:
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        print("‚úì Spacy English model installed")
    except Exception as e:
        print(f"Warning: Could not install spacy model: {e}")

# Uncomment the line below to install requirements
install_requirements()

# Download NLTK data
import nltk
nltk_downloads = [
    'punkt_tab',
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'omw-1.4',
    'vader_lexicon'
]

for item in nltk_downloads:
    try:
        nltk.download(item, quiet=True)
        print(f"‚úì {item} downloaded")
    except Exception as e:
        print(f"Warning: Could not download {item}: {e}")

‚úì nltk installed
‚úì spacy installed
‚úì sentence-transformers installed
‚úì beautifulsoup4 installed
‚úì requests installed
‚úì scikit-learn installed
‚úì numpy installed
‚úì pandas installed
‚úì Spacy English model installed
‚úì punkt_tab downloaded
‚úì punkt downloaded
‚úì stopwords downloaded
‚úì wordnet downloaded
‚úì averaged_perceptron_tagger downloaded
‚úì maxent_ne_chunker downloaded
‚úì words downloaded
‚úì omw-1.4 downloaded
‚úì vader_lexicon downloaded


# Load Model

In [12]:
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Any
import json
import re
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')

# TensorFlow and Keras imports
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# For Wikipedia search and web scraping
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# For text processing and similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.tree import Tree
import spacy

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

# Load spacy model for NER and POS tagging
nlp = spacy.load("en_core_web_sm")

# Initialize sentence transformer for semantic similarity (backup method)
print("Loading pre-trained sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

class SiameseNetworkLoader:
    """
    Loads and manages the trained Siamese network for similarity computation.
    """

    def __init__(self, model_path='best_siamese_model_gpu.h5', tokenizer_path='tokenizer.pickle', max_len=40):
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.max_len = max_len
        self.model = None
        self.tokenizer = None

        self.load_model_and_tokenizer()

    def load_model_and_tokenizer(self):
        """Load the trained Siamese model and tokenizer"""
        try:
            print(f"Loading Siamese model from {self.model_path}...")

            # Define the contrastive loss function for loading
            def contrastive_loss(y_true, y_pred, margin=1.0):
                y_true = tf.cast(y_true, tf.float32)
                square_pred = tf.square(y_pred)
                margin_square = tf.square(tf.maximum(margin - y_pred, 0))
                return tf.reduce_mean(y_true * square_pred + (1 - y_true) * margin_square)

            # Load model with custom objects
            self.model = load_model(
                self.model_path,
                custom_objects={'contrastive_loss': contrastive_loss},
                compile=False
            )

            print(f"Loading tokenizer from {self.tokenizer_path}...")
            with open(self.tokenizer_path, 'rb') as handle:
                self.tokenizer = pickle.load(handle)

            print("Siamese model and tokenizer loaded successfully!")

        except Exception as e:
            print(f"Error loading Siamese model: {e}")
            print("Falling back to sentence transformer for similarity computation.")
            self.model = None
            self.tokenizer = None

    def preprocess_text(self, text):
        """Preprocess text similar to training preprocessing"""
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def compute_siamese_similarity(self, text1, text2):
        """
        Compute similarity using the trained Siamese network.
        Returns similarity score (0-1) where 1 means very similar.
        """
        if self.model is None or self.tokenizer is None:
            return None

        try:
            # Preprocess texts
            text1_clean = self.preprocess_text(text1)
            text2_clean = self.preprocess_text(text2)

            if not text1_clean or not text2_clean:
                return 0.0

            # Convert to sequences
            seq1 = self.tokenizer.texts_to_sequences([text1_clean])
            seq2 = self.tokenizer.texts_to_sequences([text2_clean])

            # Pad sequences
            padded1 = pad_sequences(seq1, maxlen=self.max_len, padding='post', truncating='post')
            padded2 = pad_sequences(seq2, maxlen=self.max_len, padding='post', truncating='post')

            # Predict using Siamese network
            if hasattr(self.model, 'predict'):
                # For binary classification model (sigmoid output)
                try:
                    similarity_score = self.model.predict([padded1, padded2], verbose=0)[0][0]
                    return float(similarity_score)
                except:
                    # For distance-based model (contrastive loss)
                    distance = self.model.predict([padded1, padded2], verbose=0)[0][0]
                    # Convert distance to similarity (inverse relationship)
                    similarity = 1 / (1 + distance)
                    return float(similarity)
            else:
                return None

        except Exception as e:
            print(f"Error in Siamese similarity computation: {e}")
            return None

# Initialize the Siamese network loader
siamese_loader = SiameseNetworkLoader()

Loading pre-trained sentence transformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading Siamese model from best_siamese_model_gpu.h5...
Error loading Siamese model: Could not locate function 'euclidean_distance'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': 'builtins', 'class_name': 'function', 'config': 'euclidean_distance', 'registered_name': 'function'}
Falling back to sentence transformer for similarity computation.


# Fact Checker

In [13]:
class WikipediaFactChecker:
    """
    Checks facts against Wikipedia without any model training.
    Extracts claims and verifies them against Wikipedia content.
    Target: 90% factual accuracy for non-hallucinated content.
    """

    def __init__(self):
        self.wiki = wikipediaapi.Wikipedia('HallucinationDetector/1.0', 'en')
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def extract_claims_and_entities(self, text: str) -> Dict[str, List]:
        """Extract factual claims, entities, and key information from text"""
        doc = nlp(text)

        # Extract named entities
        entities = []
        for ent in doc.ents:
            entities.append({
                'text': ent.text,
                'label': ent.label_
            })

        # Extract factual claims (sentences with facts)
        claims = []
        sentences = sent_tokenize(text)
        for sent in sentences:
            sent_doc = nlp(sent)
            # Check if sentence contains factual elements
            has_entity = any(ent for ent in sent_doc.ents)
            has_number = any(token.like_num for token in sent_doc)
            has_date = any(ent.label_ == 'DATE' for ent in sent_doc.ents)

            if has_entity or has_number or has_date:
                claims.append(sent)

        # Extract key noun phrases for additional context
        noun_phrases = []
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= 4:  # Limit to reasonable length
                noun_phrases.append(chunk.text)

        return {
            'entities': entities,
            'claims': claims,
            'noun_phrases': list(set(noun_phrases))
        }

    def search_wikipedia_comprehensive(self, query: str) -> Dict[str, Any]:
        """Comprehensive Wikipedia search with fallback mechanisms"""
        results = {'found': False, 'content': '', 'summary': '', 'title': ''}

        try:
            # Method 1: Direct page fetch
            page = self.wiki.page(query)
            if page.exists():
                results = {
                    'found': True,
                    'title': page.title,
                    'summary': page.summary[:3000],
                    'content': page.text[:10000],  # Get more content for verification
                    'url': page.fullurl
                }
                return results
        except:
            pass

        try:
            # Method 2: Wikipedia API search
            search_url = "https://en.wikipedia.org/w/api.php"
            params = {
                'action': 'query',
                'format': 'json',
                'list': 'search',
                'srsearch': query,
                'srlimit': 5
            }

            response = requests.get(search_url, params=params, timeout=5)
            data = response.json()

            if data.get('query', {}).get('search'):
                # Try first few results
                for result in data['query']['search'][:3]:
                    title = result['title']
                    page = self.wiki.page(title)
                    if page.exists():
                        results = {
                            'found': True,
                            'title': page.title,
                            'summary': page.summary[:3000],
                            'content': page.text[:10000],
                            'url': page.fullurl
                        }
                        return results
        except:
            pass

        return results

    def verify_claim_against_wikipedia(self, claim: str, wiki_content: str) -> float:
        """
        Verify a specific claim against Wikipedia content.
        Returns confidence score (0-1) where 1 means fully verified.
        """
        if not wiki_content:
            return 0.0

        # Tokenize and process claim
        claim_lower = claim.lower()
        claim_tokens = word_tokenize(claim_lower)
        claim_lemmas = [self.lemmatizer.lemmatize(token) for token in claim_tokens
                       if token not in self.stop_words and token.isalnum()]

        # Extract key information from claim
        claim_doc = nlp(claim)
        claim_entities = [ent.text.lower() for ent in claim_doc.ents]
        claim_numbers = [token.text for token in claim_doc if token.like_num]

        # Split Wikipedia content into sentences for detailed checking
        wiki_sentences = sent_tokenize(wiki_content.lower())

        max_score = 0.0
        entity_match_score = 0.0
        number_match_score = 0.0

        for wiki_sent in wiki_sentences:
            # Token overlap score
            wiki_tokens = word_tokenize(wiki_sent)
            wiki_lemmas = [self.lemmatizer.lemmatize(token) for token in wiki_tokens
                          if token not in self.stop_words and token.isalnum()]

            if claim_lemmas and wiki_lemmas:
                overlap = len(set(claim_lemmas) & set(wiki_lemmas)) / len(set(claim_lemmas))
            else:
                overlap = 0.0

            # Entity matching
            entity_matches = sum(1 for ent in claim_entities if ent in wiki_sent)
            if claim_entities:
                entity_score = entity_matches / len(claim_entities)
                entity_match_score = max(entity_match_score, entity_score)

            # Number matching
            number_matches = sum(1 for num in claim_numbers if num in wiki_sent)
            if claim_numbers:
                num_score = number_matches / len(claim_numbers)
                number_match_score = max(number_match_score, num_score)

            # Combined score for this sentence
            sent_score = overlap
            if claim_entities:
                sent_score = 0.6 * overlap + 0.4 * entity_score
            if claim_numbers:
                sent_score = 0.5 * sent_score + 0.5 * num_score

            max_score = max(max_score, sent_score)

        return max_score

    def check_facts(self, ai_output: str) -> Tuple[float, Dict[str, Any]]:
        """
        Main method to check facts in AI output against Wikipedia.
        Returns accuracy score (0-1) and detailed results.
        Target: 90% accuracy for non-hallucinated content.
        """
        # Extract claims and entities
        extracted = self.extract_claims_and_entities(ai_output)
        entities = extracted['entities']
        claims = extracted['claims']
        noun_phrases = extracted['noun_phrases']

        if not claims and not entities:
            return 0.5, {'error': 'No factual content to verify'}

        print(f"Extracted {len(claims)} claims and {len(entities)} entities for verification")

        # Search Wikipedia for main topics
        wiki_contents = []
        wiki_pages_found = []

        # Search for entities
        for entity in entities[:5]:  # Limit to top 5 entities
            result = self.search_wikipedia_comprehensive(entity['text'])
            if result['found']:
                wiki_contents.append(result['content'])
                wiki_pages_found.append(result['title'])

        # If no entities found, try noun phrases
        if not wiki_contents and noun_phrases:
            for phrase in noun_phrases[:3]:
                result = self.search_wikipedia_comprehensive(phrase)
                if result['found']:
                    wiki_contents.append(result['content'])
                    wiki_pages_found.append(result['title'])

        # Combine all Wikipedia content
        combined_wiki_content = ' '.join(wiki_contents)

        if not combined_wiki_content:
            # Try searching for the overall topic
            doc = nlp(ai_output)
            main_topics = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']][:5]
            main_topic = ' '.join(main_topics)
            result = self.search_wikipedia_comprehensive(main_topic)
            if result['found']:
                combined_wiki_content = result['content']
                wiki_pages_found.append(result['title'])

        # Verify each claim
        claim_scores = []
        claim_results = []

        for claim in claims:
            score = self.verify_claim_against_wikipedia(claim, combined_wiki_content)
            claim_scores.append(score)
            claim_results.append({
                'claim': claim[:100] + '...' if len(claim) > 100 else claim,
                'verification_score': score,
                'verified': score >= 0.6  # Individual claim threshold
            })

        # Calculate overall factual accuracy
        if claim_scores:
            factual_accuracy = np.mean(claim_scores)
        else:
            factual_accuracy = 0.5  # Neutral if no claims to verify

        # Check if meets 90% accuracy threshold
        is_factually_accurate = factual_accuracy >= 0.9

        details = {
            'entities_found': [e['text'] for e in entities],
            'wikipedia_pages': wiki_pages_found,
            'num_claims_checked': len(claims),
            'claim_results': claim_results,
            'factual_accuracy': factual_accuracy,
            'meets_90_percent_threshold': is_factually_accurate,
            'num_verified_claims': sum(1 for c in claim_results if c['verified'])
        }

        # Return hallucination score (inverse of accuracy)
        hallucination_score = 1.0 - factual_accuracy
        return hallucination_score, details


# Prompt - Output Similarity

In [14]:
class SiamesePromptOutputSimilarityChecker:
    """
    Enhanced similarity checker using the trained Siamese network.
    Falls back to traditional methods if Siamese model is unavailable.
    """

    def __init__(self):
        self.siamese_loader = siamese_loader
        self.sentence_model = sentence_model
        self.tfidf = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))

    def compute_similarities(self, prompt: str, output: str) -> Dict[str, float]:
        """Compute multiple types of similarities between prompt and output"""

        # 1. Siamese Network Similarity (Primary method)
        siamese_similarity = self.siamese_loader.compute_siamese_similarity(prompt, output)

        # 2. TF-IDF Cosine Similarity (Backup method)
        try:
            tfidf_matrix = self.tfidf.fit_transform([prompt, output])
            tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        except:
            tfidf_similarity = 0.0

        # 3. Semantic Similarity using Sentence Transformers (Backup method)
        try:
            prompt_embedding = self.sentence_model.encode([prompt])
            output_embedding = self.sentence_model.encode([output])
            semantic_similarity = cosine_similarity(prompt_embedding, output_embedding)[0][0]
        except:
            semantic_similarity = 0.0

        # 4. Word Overlap Similarity
        prompt_words = set(word_tokenize(prompt.lower()))
        output_words = set(word_tokenize(output.lower()))
        if prompt_words:
            word_overlap = len(prompt_words & output_words) / len(prompt_words)
        else:
            word_overlap = 0.0

        # 5. Entity Overlap
        prompt_doc = nlp(prompt)
        output_doc = nlp(output)
        prompt_entities = set([ent.text.lower() for ent in prompt_doc.ents])
        output_entities = set([ent.text.lower() for ent in output_doc.ents])

        if prompt_entities:
            entity_overlap = len(prompt_entities & output_entities) / len(prompt_entities)
        else:
            entity_overlap = 0.5  # Neutral if no entities in prompt

        return {
            'siamese_similarity': siamese_similarity if siamese_similarity is not None else semantic_similarity,
            'tfidf_similarity': tfidf_similarity,
            'semantic_similarity': semantic_similarity,
            'word_overlap': word_overlap,
            'entity_overlap': entity_overlap,
            'using_siamese': siamese_similarity is not None
        }

    def check_contextual_consistency(self, prompt: str, output: str) -> Tuple[float, Dict[str, Any]]:
        """
        Check if the AI output is contextually consistent with the prompt.
        Uses Siamese network as primary similarity measure.
        Returns consistency score and detailed analysis.
        """
        # Compute all similarities
        similarities = self.compute_similarities(prompt, output)

        # Check for topic drift
        prompt_doc = nlp(prompt)
        output_doc = nlp(output)

        # Extract main topics (nouns and proper nouns)
        prompt_topics = set([token.lemma_.lower() for token in prompt_doc
                           if token.pos_ in ['NOUN', 'PROPN'] and not token.is_stop])
        output_topics = set([token.lemma_.lower() for token in output_doc
                           if token.pos_ in ['NOUN', 'PROPN'] and not token.is_stop])

        # Calculate topic drift
        new_topics = output_topics - prompt_topics
        if output_topics:
            topic_drift_ratio = len(new_topics) / len(output_topics)
        else:
            topic_drift_ratio = 0.0

        # Combined consistency score with adjusted weights
        if similarities['using_siamese']:
            # Prioritize Siamese network when available
            weights = {
                'siamese': 0.5,    # Primary method
                'semantic': 0.2,   # Backup validation
                'tfidf': 0.15,
                'word': 0.1,
                'entity': 0.05
            }
            print(f"Using Siamese network for similarity (score: {similarities['siamese_similarity']:.3f})")
        else:
            # Fall back to traditional methods
            weights = {
                'siamese': 0.0,
                'semantic': 0.4,
                'tfidf': 0.25,
                'word': 0.15,
                'entity': 0.2
            }
            print("Using traditional similarity methods (Siamese model not available)")

        consistency_score = (
            weights['siamese'] * similarities['siamese_similarity'] +
            weights['semantic'] * similarities['semantic_similarity'] +
            weights['tfidf'] * similarities['tfidf_similarity'] +
            weights['word'] * similarities['word_overlap'] +
            weights['entity'] * similarities['entity_overlap']
        )

        # Adjust for topic drift
        consistency_score = consistency_score * (1 - topic_drift_ratio * 0.3)

        # Determine if contextually consistent
        is_consistent = consistency_score > 0.5

        details = {
            **similarities,
            'topic_drift_ratio': topic_drift_ratio,
            'new_topics_introduced': list(new_topics)[:10],
            'prompt_topics': list(prompt_topics)[:10],
            'consistency_score': consistency_score,
            'is_contextually_consistent': is_consistent,
            'weights_used': weights
        }

        # Return hallucination score (inverse of consistency)
        hallucination_score = 1.0 - consistency_score
        return hallucination_score, details

# Paraphrase Consistency Checker

In [15]:
class ParaphraseConsistencyChecker:
    """
    Generates paraphrases and checks consistency using Siamese network.
    Falls back to sentence transformers if Siamese model unavailable.
    """

    def __init__(self):
        self.siamese_loader = siamese_loader
        self.sentence_model = sentence_model
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def get_synonyms(self, word: str, pos: str = None) -> List[str]:
        """Get synonyms for a word using WordNet"""
        synonyms = []
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                if lemma.name() != word and '_' not in lemma.name():
                    synonyms.append(lemma.name())
                    if len(synonyms) >= 3:
                        return synonyms
        return synonyms

    def paraphrase_by_synonym_replacement(self, text: str) -> str:
        """Generate paraphrase by replacing words with synonyms"""
        doc = nlp(text)
        paraphrased_tokens = []

        for token in doc:
            # Skip punctuation, stop words, and proper nouns
            if token.is_punct or token.is_stop or token.pos_ == 'PROPN':
                paraphrased_tokens.append(token.text)
                continue

            # Get WordNet POS tag
            wn_pos = None
            if token.pos_ == 'NOUN':
                wn_pos = wordnet.NOUN
            elif token.pos_ == 'VERB':
                wn_pos = wordnet.VERB
            elif token.pos_ == 'ADJ':
                wn_pos = wordnet.ADJ
            elif token.pos_ == 'ADV':
                wn_pos = wordnet.ADV

            # Try to get synonyms
            if wn_pos:
                synonyms = self.get_synonyms(token.text.lower(), wn_pos)
                if synonyms and np.random.random() > 0.5:  # Replace 50% of eligible words
                    paraphrased_tokens.append(synonyms[0])
                else:
                    paraphrased_tokens.append(token.text)
            else:
                paraphrased_tokens.append(token.text)

        return ' '.join(paraphrased_tokens)

    def paraphrase_by_sentence_restructuring(self, text: str) -> str:
        """Generate paraphrase by restructuring sentences"""
        sentences = sent_tokenize(text)
        paraphrased_sentences = []

        for sent in sentences:
            doc = nlp(sent)

            # Try to identify sentence structure
            subjects = [token for token in doc if token.dep_ == "nsubj"]
            verbs = [token for token in doc if token.pos_ == "VERB"]
            objects = [token for token in doc if token.dep_ in ["dobj", "pobj"]]

            # Simple restructuring based on sentence type
            if len(sent.split()) < 5:
                # Keep short sentences as is
                paraphrased_sentences.append(sent)
            elif subjects and verbs:
                # Try synonym replacement for this sentence
                paraphrased = self.paraphrase_by_synonym_replacement(sent)
                paraphrased_sentences.append(paraphrased)
            else:
                paraphrased_sentences.append(sent)

        # Optionally reorder middle sentences
        if len(paraphrased_sentences) > 3:
            first = paraphrased_sentences[0]
            last = paraphrased_sentences[-1]
            middle = paraphrased_sentences[1:-1]
            np.random.shuffle(middle)
            paraphrased_sentences = [first] + middle + [last]

        return ' '.join(paraphrased_sentences)

    def generate_paraphrases(self, text: str, num_paraphrases: int = 3) -> List[str]:
        """Generate multiple paraphrases using different methods"""
        paraphrases = []

        # Method 1: Synonym replacement
        for i in range(num_paraphrases // 2 + 1):
            para = self.paraphrase_by_synonym_replacement(text)
            if para != text:  # Only add if different
                paraphrases.append(para)

        # Method 2: Sentence restructuring
        for i in range(num_paraphrases // 2):
            para = self.paraphrase_by_sentence_restructuring(text)
            if para != text:
                paraphrases.append(para)

        # Ensure we have enough paraphrases
        while len(paraphrases) < num_paraphrases:
            # Create variation by combining methods
            temp = self.paraphrase_by_synonym_replacement(text)
            para = self.paraphrase_by_sentence_restructuring(temp)
            paraphrases.append(para)

        return paraphrases[:num_paraphrases]

    def compute_similarity_matrix_siamese(self, texts: List[str]) -> np.ndarray:
        """Compute similarity matrix using Siamese network"""
        n = len(texts)
        similarity_matrix = np.zeros((n, n))

        for i in range(n):
            for j in range(n):
                if i == j:
                    similarity_matrix[i][j] = 1.0
                elif i < j:  # Compute only upper triangle
                    sim = self.siamese_loader.compute_siamese_similarity(texts[i], texts[j])
                    if sim is not None:
                        similarity_matrix[i][j] = sim
                        similarity_matrix[j][i] = sim  # Symmetric
                    else:
                        # Fall back to sentence transformer
                        embeddings = self.sentence_model.encode([texts[i], texts[j]])
                        sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
                        similarity_matrix[i][j] = sim
                        similarity_matrix[j][i] = sim

        return similarity_matrix

    def check_paraphrase_consistency(self, ai_output: str) -> Tuple[float, Dict[str, Any]]:
        """
        Generate paraphrases and check consistency using Siamese network.
        High consistency indicates non-hallucinated content.
        """
        # Generate paraphrases
        paraphrases = self.generate_paraphrases(ai_output, num_paraphrases=3)

        # Add original to the list for comparison
        all_versions = [ai_output] + paraphrases

        # Compute similarity matrix using Siamese network
        if self.siamese_loader.model is not None:
            similarity_matrix = self.compute_similarity_matrix_siamese(all_versions)
            using_siamese = True
            print("Using Siamese network for paraphrase consistency check")
        else:
            # Fall back to sentence transformers
            embeddings = self.sentence_model.encode(all_versions)
            similarity_matrix = cosine_similarity(embeddings)
            using_siamese = False
            print("Using sentence transformers for paraphrase consistency check")

        # Extract similarity scores (excluding self-similarity)
        similarities = []
        for i in range(len(all_versions)):
            for j in range(i + 1, len(all_versions)):
                similarities.append(similarity_matrix[i][j])

        # Calculate consistency metrics
        avg_similarity = np.mean(similarities) if similarities else 0.0
        min_similarity = np.min(similarities) if similarities else 0.0
        std_similarity = np.std(similarities) if similarities else 0.0

        # Check factual consistency across paraphrases
        all_facts = []
        for version in all_versions:
            doc = nlp(version)
            facts = {
                'entities': set([ent.text.lower() for ent in doc.ents]),
                'numbers': set([token.text for token in doc if token.like_num]),
                'dates': set([ent.text for ent in doc.ents if ent.label_ == 'DATE'])
            }
            all_facts.append(facts)

        # Calculate fact preservation
        if all_facts:
            # Check entity consistency
            all_entities = [f['entities'] for f in all_facts]
            if all_entities and all_entities[0]:
                common_entities = set.intersection(*all_entities)
                entity_consistency = len(common_entities) / len(all_entities[0])
            else:
                entity_consistency = 1.0

            # Check number consistency
            all_numbers = [f['numbers'] for f in all_facts]
            if all_numbers and all_numbers[0]:
                common_numbers = set.intersection(*all_numbers)
                number_consistency = len(common_numbers) / len(all_numbers[0])
            else:
                number_consistency = 1.0
        else:
            entity_consistency = 1.0
            number_consistency = 1.0

        # Combined consistency score
        weights = {
            'semantic': 0.4,
            'entity': 0.3,
            'number': 0.2,
            'stability': 0.1  # Low std means stable
        }

        stability_score = 1.0 - min(std_similarity, 1.0)  # Convert std to stability

        consistency_score = (
            weights['semantic'] * avg_similarity +
            weights['entity'] * entity_consistency +
            weights['number'] * number_consistency +
            weights['stability'] * stability_score
        )

        # High consistency suggests non-hallucinated content
        is_consistent = consistency_score > 0.7

        details = {
            'num_paraphrases': len(paraphrases),
            'avg_similarity': avg_similarity,
            'min_similarity': min_similarity,
            'std_similarity': std_similarity,
            'entity_consistency': entity_consistency,
            'number_consistency': number_consistency,
            'consistency_score': consistency_score,
            'is_consistent': is_consistent,
            'using_siamese': using_siamese,
            'sample_paraphrase': paraphrases[0][:200] if paraphrases else ''
        }

        # Return hallucination score (inverse of consistency)
        hallucination_score = 1.0 - consistency_score
        return hallucination_score, details

# Hallucination Data Class

In [16]:
@dataclass
class HallucinationResult:
    """Final result of hallucination detection"""
    is_hallucination: bool
    confidence: float
    factual_score: float
    contextual_score: float
    consistency_score: float
    overall_score: float
    details: Dict[str, Any]


# Multilevel Multimodal Hallucination Detection

In [17]:
class MultiLayeredHallucinationDetector:
    """
    Main system combining all three detection methods with Siamese network integration.
    Uses the trained Siamese model for enhanced similarity computations.
    """

    def __init__(self):
        print("Initializing Multi-Layered Hallucination Detector with Siamese Network...")
        self.fact_checker = WikipediaFactChecker()
        self.similarity_checker = SiamesePromptOutputSimilarityChecker()
        self.paraphrase_checker = ParaphraseConsistencyChecker()

        # Thresholds for hallucination detection
        self.thresholds = {
            'factual': 0.1,  # Lower is better (inverse of 90% accuracy)
            'contextual': 0.5,
            'consistency': 0.3,
            'overall': 0.4
        }

        # Check if Siamese model is available
        if siamese_loader.model is not None:
            print("‚úÖ Siamese network loaded successfully - using for enhanced similarity detection!")
        else:
            print("‚ö†Ô∏è Siamese network not available - falling back to traditional methods")

        print("Detector initialized successfully!")

    def detect_hallucination(self, user_prompt: str, ai_output: str) -> HallucinationResult:
        """
        Main detection method combining all three approaches with Siamese network.

        Args:
            user_prompt: The original user input/prompt
            ai_output: The AI model's generated response

        Returns:
            HallucinationResult with detection results and confidence
        """

        print("\n" + "="*60)
        print("STARTING MULTI-LAYERED HALLUCINATION DETECTION")
        print("WITH SIAMESE NETWORK INTEGRATION")
        print("="*60)

        # Step 1: Wikipedia Fact Checking (90% accuracy requirement)
        print("\n[1/3] Wikipedia-based Factual Verification...")
        print("-" * 40)
        factual_hall_score, factual_details = self.fact_checker.check_facts(ai_output)
        print(f"‚úì Factual hallucination score: {factual_hall_score:.3f}")
        print(f"  - Wikipedia pages checked: {len(factual_details.get('wikipedia_pages', []))}")
        print(f"  - Claims verified: {factual_details.get('num_verified_claims', 0)}/{factual_details.get('num_claims_checked', 0)}")
        print(f"  - Meets 90% threshold: {'Yes' if factual_details.get('meets_90_percent_threshold', False) else 'No'}")

        # Step 2: Siamese Network Similarity Check
        print("\n[2/3] Siamese Network Prompt-Output Similarity Analysis...")
        print("-" * 40)
        contextual_hall_score, contextual_details = self.similarity_checker.check_contextual_consistency(
            user_prompt, ai_output
        )
        print(f"‚úì Contextual hallucination score: {contextual_hall_score:.3f}")
        if contextual_details.get('using_siamese', False):
            print(f"  - Siamese similarity: {contextual_details.get('siamese_similarity', 0):.3f}")
        print(f"  - Semantic similarity: {contextual_details.get('semantic_similarity', 0):.3f}")
        print(f"  - TF-IDF similarity: {contextual_details.get('tfidf_similarity', 0):.3f}")
        print(f"  - Topic drift: {contextual_details.get('topic_drift_ratio', 0):.1%}")

        # Step 3: Siamese-Enhanced Paraphrase Consistency Check
        print("\n[3/3] Siamese-Enhanced Paraphrase Consistency Check...")
        print("-" * 40)
        consistency_hall_score, consistency_details = self.paraphrase_checker.check_paraphrase_consistency(
            ai_output
        )
        print(f"‚úì Consistency hallucination score: {consistency_hall_score:.3f}")
        print(f"  - Average paraphrase similarity: {consistency_details.get('avg_similarity', 0):.3f}")
        print(f"  - Entity consistency: {consistency_details.get('entity_consistency', 0):.1%}")
        print(f"  - Number consistency: {consistency_details.get('number_consistency', 0):.1%}")
        print(f"  - Using Siamese network: {'Yes' if consistency_details.get('using_siamese', False) else 'No'}")

        # Combine scores with weights
        weights = {
            'factual': 0.5,  # Higher weight for factual accuracy
            'contextual': 0.25,
            'consistency': 0.25
        }

        overall_hall_score = (
            weights['factual'] * factual_hall_score +
            weights['contextual'] * contextual_hall_score +
            weights['consistency'] * consistency_hall_score
        )

        # Determine if it's a hallucination
        is_hallucination = (
            factual_hall_score > self.thresholds['factual'] or  # Fails 90% accuracy
            contextual_hall_score > self.thresholds['contextual'] or
            consistency_hall_score > self.thresholds['consistency'] or
            overall_hall_score > self.thresholds['overall']
        )

        # Calculate confidence based on agreement between methods
        scores = [factual_hall_score, contextual_hall_score, consistency_hall_score]
        # High agreement = high confidence
        confidence = 1.0 - np.std(scores) * 2  # Scale std deviation
        confidence = max(0.0, min(1.0, confidence))  # Clamp to [0, 1]

        # Boost confidence if Siamese network was used successfully
        siamese_used = (contextual_details.get('using_siamese', False) or
                       consistency_details.get('using_siamese', False))
        if siamese_used:
            confidence = min(1.0, confidence + 0.1)  # Small boost for Siamese usage

        # Compile all details
        all_details = {
            'factual': factual_details,
            'contextual': contextual_details,
            'consistency': consistency_details,
            'weights_used': weights,
            'thresholds_used': self.thresholds,
            'siamese_network_used': siamese_used,
            'individual_scores': {
                'factual_hallucination': factual_hall_score,
                'contextual_hallucination': contextual_hall_score,
                'consistency_hallucination': consistency_hall_score
            }
        }

        # Print final results
        print("\n" + "="*60)
        print("FINAL DETECTION RESULTS")
        print("="*60)
        print(f"Overall Hallucination Score: {overall_hall_score:.3f}")
        print(f"Decision: {'‚ö†Ô∏è HALLUCINATION DETECTED' if is_hallucination else '‚úÖ NO HALLUCINATION'}")
        print(f"Confidence: {confidence:.1%}")
        if siamese_used:
            print("üéØ Enhanced with Siamese Network")
        print("="*60)

        return HallucinationResult(
            is_hallucination=is_hallucination,
            confidence=confidence,
            factual_score=factual_hall_score,
            contextual_score=contextual_hall_score,
            consistency_score=consistency_hall_score,
            overall_score=overall_hall_score,
            details=all_details
        )

    def generate_detailed_report(self, result: HallucinationResult) -> str:
        """Generate a comprehensive report from the detection results"""
        report = []
        report.append("\n" + "="*60)
        report.append("HALLUCINATION DETECTION DETAILED REPORT")
        report.append("WITH SIAMESE NETWORK INTEGRATION")
        report.append("="*60)

        # Executive Summary
        report.append("\nüìä EXECUTIVE SUMMARY")
        report.append("-" * 40)
        report.append(f"Decision: {'‚ö†Ô∏è HALLUCINATION DETECTED' if result.is_hallucination else '‚úÖ NO HALLUCINATION'}")
        report.append(f"Overall Score: {result.overall_score:.3f} (lower is better)")
        report.append(f"Confidence: {result.confidence:.1%}")
        if result.details.get('siamese_network_used', False):
            report.append("üéØ Enhanced with Siamese Network")

        # Individual Module Results
        report.append("\nüìà MODULE-WISE ANALYSIS")
        report.append("-" * 40)

        # Module 1: Factual Verification
        report.append("\n1. WIKIPEDIA FACTUAL VERIFICATION (Weight: 50%)")
        factual = result.details.get('factual', {})
        report.append(f"   Hallucination Score: {result.factual_score:.3f}")
        report.append(f"   Factual Accuracy: {factual.get('factual_accuracy', 0):.1%}")
        report.append(f"   Meets 90% Threshold: {'Yes ‚úì' if factual.get('meets_90_percent_threshold', False) else 'No ‚úó'}")
        report.append(f"   Wikipedia Pages Consulted: {len(factual.get('wikipedia_pages', []))}")
        if factual.get('wikipedia_pages'):
            report.append(f"   Pages: {', '.join(factual['wikipedia_pages'][:3])}")
        report.append(f"   Claims Verified: {factual.get('num_verified_claims', 0)}/{factual.get('num_claims_checked', 0)}")

        # Module 2: Siamese Network Similarity
        report.append("\n2. SIAMESE NETWORK SIMILARITY ANALYSIS (Weight: 25%)")
        contextual = result.details.get('contextual', {})
        report.append(f"   Hallucination Score: {result.contextual_score:.3f}")

        if contextual.get('using_siamese', False):
            report.append(f"   üéØ Siamese Network Similarity: {contextual.get('siamese_similarity', 0):.3f}")
            report.append("   Method: Trained Siamese Network (Primary)")
        else:
            report.append("   Method: Sentence Transformers (Fallback)")

        report.append(f"   Semantic Similarity: {contextual.get('semantic_similarity', 0):.3f}")
        report.append(f"   TF-IDF Similarity: {contextual.get('tfidf_similarity', 0):.3f}")
        report.append(f"   Entity Overlap: {contextual.get('entity_overlap', 0):.1%}")
        report.append(f"   Topic Drift: {contextual.get('topic_drift_ratio', 0):.1%}")
        new_topics = contextual.get('new_topics_introduced', [])
        if new_topics:
            report.append(f"   New Topics: {', '.join(new_topics[:5])}")

        # Module 3: Siamese-Enhanced Paraphrase Consistency
        report.append("\n3. SIAMESE-ENHANCED PARAPHRASE CONSISTENCY (Weight: 25%)")
        consistency = result.details.get('consistency', {})
        report.append(f"   Hallucination Score: {result.consistency_score:.3f}")

        if consistency.get('using_siamese', False):
            report.append("   üéØ Similarity Method: Siamese Network")
        else:
            report.append("   Similarity Method: Sentence Transformers")

        report.append(f"   Paraphrase Similarity: {consistency.get('avg_similarity', 0):.3f}")
        report.append(f"   Entity Consistency: {consistency.get('entity_consistency', 0):.1%}")
        report.append(f"   Number Consistency: {consistency.get('number_consistency', 0):.1%}")
        report.append(f"   Paraphrases Generated: {consistency.get('num_paraphrases', 0)}")

        # Siamese Network Performance
        if result.details.get('siamese_network_used', False):
            report.append("\nüéØ SIAMESE NETWORK PERFORMANCE")
            report.append("-" * 40)
            report.append("‚úÖ Siamese network successfully integrated")
            report.append("‚Ä¢ Enhanced similarity detection accuracy")
            report.append("‚Ä¢ Domain-specific question similarity expertise")
            report.append("‚Ä¢ Improved contextual understanding")
        else:
            report.append("\n‚ö†Ô∏è SIAMESE NETWORK STATUS")
            report.append("-" * 40)
            report.append("‚ùå Siamese network not available")
            report.append("‚Ä¢ Using traditional similarity methods")
            report.append("‚Ä¢ Consider checking model file path")

        # Risk Assessment
        report.append("\n‚ö†Ô∏è RISK ASSESSMENT")
        report.append("-" * 40)

        risk_factors = []
        if result.factual_score > 0.1:
            risk_factors.append("‚Ä¢ High factual inaccuracy detected")
        if result.contextual_score > 0.5:
            risk_factors.append("‚Ä¢ Significant contextual drift from prompt")
        if result.consistency_score > 0.3:
            risk_factors.append("‚Ä¢ Low consistency across paraphrases")

        if risk_factors:
            for factor in risk_factors:
                report.append(factor)
        else:
            report.append("‚Ä¢ No significant risk factors detected")

        # Recommendations
        report.append("\nüí° RECOMMENDATIONS")
        report.append("-" * 40)

        if result.is_hallucination:
            report.append("‚ö†Ô∏è This output appears to contain hallucinations.")
            report.append("Suggested actions:")
            if result.factual_score > 0.1:
                report.append("‚Ä¢ Verify factual claims independently")
            if result.contextual_score > 0.5:
                report.append("‚Ä¢ Review if output addresses the original prompt")
            if result.consistency_score > 0.3:
                report.append("‚Ä¢ Check for internal contradictions")
        else:
            report.append("‚úÖ This output appears to be reliable.")
            report.append("‚Ä¢ Factual claims align with Wikipedia sources")
            report.append("‚Ä¢ Output is contextually relevant to the prompt")
            report.append("‚Ä¢ Content is internally consistent")

        if not result.details.get('siamese_network_used', False):
            report.append("‚Ä¢ Consider loading Siamese model for enhanced detection")

        report.append("\n" + "="*60)

        return '\n'.join(report)



## Hallucination Detection function

In [18]:
def detect_hallucination_interactive_siamese(user_prompt: str, ai_output: str):
    """
    Enhanced interactive hallucination detection with Siamese network integration.

    Args:
        user_prompt: The original user query/prompt
        ai_output: The AI model's response

    Returns:
        Dictionary with detection results
    """
    detector = MultiLayeredHallucinationDetector()
    result = detector.detect_hallucination(user_prompt, ai_output)

    # Generate report
    report = detector.generate_detailed_report(result)
    print(report)

    # Return summary
    return {
        'is_hallucination': result.is_hallucination,
        'confidence': result.confidence,
        'overall_score': result.overall_score,
        'factual_accuracy': 1.0 - result.factual_score,
        'contextual_relevance': 1.0 - result.contextual_score,
        'internal_consistency': 1.0 - result.consistency_score,
        'siamese_network_used': result.details.get('siamese_network_used', False),
        'siamese_similarity': result.details.get('contextual', {}).get('siamese_similarity', None)
    }



## Example usage

In [19]:
# Test the Siamese loader first
test_result = detect_hallucination_interactive_siamese(
    user_prompt="What is the step by step guide to invest in share market?",
    ai_output="How to invest in stock market step by step: Open a trading account, research companies, start with small amounts, diversify portfolio, and monitor regularly."
)

print(f"\nTest Result Summary:")
print(f"- Siamese Network Used: {test_result['siamese_network_used']}")
if test_result['siamese_similarity'] is not None:
    print(f"- Siamese Similarity: {test_result['siamese_similarity']:.3f}")


Initializing Multi-Layered Hallucination Detector with Siamese Network...
‚ö†Ô∏è Siamese network not available - falling back to traditional methods
Detector initialized successfully!

STARTING MULTI-LAYERED HALLUCINATION DETECTION
WITH SIAMESE NETWORK INTEGRATION

[1/3] Wikipedia-based Factual Verification...
----------------------------------------
‚úì Factual hallucination score: 0.500
  - Wikipedia pages checked: 0
  - Claims verified: 0/0
  - Meets 90% threshold: No

[2/3] Siamese Network Prompt-Output Similarity Analysis...
----------------------------------------
Using traditional similarity methods (Siamese model not available)
‚úì Contextual hallucination score: 0.601
  - Semantic similarity: 0.735
  - TF-IDF similarity: 0.204
  - Topic drift: 77.8%

[3/3] Siamese-Enhanced Paraphrase Consistency Check...
----------------------------------------
Using sentence transformers for paraphrase consistency check
‚úì Consistency hallucination score: 0.118
  - Average paraphrase similar