In [1]:
# Notebook 1: Setup and Data Loading
import json
import os
import faiss
import numpy as np
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import pickle
from pathlib import Path
import logging
import PyPDF2
import spacy
import json
from typing import List, Dict, Any
from pathlib import Path
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class FAISSEmbeddingManager:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', base_path: str = 'NEW_faiss'):
        """
        Initialize the FAISS Embedding Manager
        """
        self.model_name = model_name
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
        
        logging.info(f"Loading model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        
        self.index = faiss.IndexFlatL2(self.dimension)
        self.chunk_metadata = {}
        self.current_id = 0
        
        # Add NLP and vectorizer
        self.nlp = spacy.load("en_core_web_sm")
        self.vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words='english',
            min_df=1
        )
    
    def load_chunks(self, json_path: str) -> List[Dict[str, Any]]:
        """
        Load chunks from JSON file with proper UTF-8 encoding
        """
        logging.info(f"Loading chunks from {json_path}")
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            logging.info(f"Successfully loaded {len(data['chunks'])} chunks")
            return data['chunks']
        except UnicodeDecodeError:
            logging.warning("UTF-8 decoding failed, trying with utf-8-sig encoding")
            try:
                with open(json_path, 'r', encoding='utf-8-sig') as f:
                    data = json.load(f)
                logging.info(f"Successfully loaded {len(data['chunks'])} chunks with utf-8-sig encoding")
                return data['chunks']
            except Exception as e:
                logging.error(f"Error loading chunks with utf-8-sig: {str(e)}")
                raise
        except Exception as e:
            logging.error(f"Error loading chunks: {str(e)}")
            raise
    
    def embed_chunk(self, chunk: Dict[str, Any]) -> np.ndarray:
        """
        Create embedding for a single chunk
        """
        text = chunk['content']
        try:
            embedding = self.model.encode([text])[0]
            return embedding
        except Exception as e:
            logging.error(f"Error embedding chunk {chunk['id']}: {str(e)}")
            raise
    
    def add_chunk(self, chunk: Dict[str, Any]):
        """
        Add a single chunk to the FAISS index
        """
        try:
            embedding = self.embed_chunk(chunk)
            self.index.add(embedding.reshape(1, -1))
            
            self.chunk_metadata[self.current_id] = {
                'chunk_id': chunk['id'],
                'content': chunk['content'],
                'topics': chunk.get('topics', []),
                'tokens': chunk.get('tokens', 0),
                'entities': chunk.get('entities', [])
            }
            
            self.current_id += 1
            if self.current_id % 10 == 0:  # Log progress every 10 chunks
                logging.info(f"Processed {self.current_id} chunks")
                
        except Exception as e:
            logging.error(f"Error adding chunk {chunk['id']}: {str(e)}")
            raise
    
    def add_chunks(self, chunks: List[Dict[str, Any]]):
        """Add multiple chunks to the FAISS index"""
        logging.info(f"Adding {len(chunks)} chunks to FAISS index")
        
        # First, fit vectorizer on all chunk contents
        all_contents = [chunk['content'] for chunk in chunks]
        self.vectorizer.fit(all_contents)
        
        # Then add chunks
        for chunk in chunks:
            self.add_chunk(chunk)
        logging.info("Finished adding chunks")
    
    def save(self, index_name: str):
        """
        Save FAISS index and metadata
        """
        index_path = self.base_path / f"{index_name}.index"
        metadata_path = self.base_path / f"{index_name}_metadata.pkl"
        
        logging.info(f"Saving FAISS index to {index_path}")
        faiss.write_index(self.index, str(index_path))
        
        logging.info(f"Saving metadata to {metadata_path}")
        with open(metadata_path, 'wb') as f:
            pickle.dump(self.chunk_metadata, f)
            
        # Also save vectorizer
        vectorizer_path = self.base_path / f"{index_name}_vectorizer.pkl"
        logging.info(f"Saving vectorizer to {vectorizer_path}")
        with open(vectorizer_path, 'wb') as f:
            pickle.dump(self.vectorizer, f)
    
    def load(self, index_name: str):
        """
        Load FAISS index and metadata
        """
        index_path = self.base_path / f"{index_name}.index"
        metadata_path = self.base_path / f"{index_name}_metadata.pkl"
        
        logging.info(f"Loading FAISS index from {index_path}")
        self.index = faiss.read_index(str(index_path))
        
        logging.info(f"Loading metadata from {metadata_path}")
        with open(metadata_path, 'rb') as f:
            self.chunk_metadata = pickle.load(f)
            self.current_id = len(self.chunk_metadata)
            
        # Also load vectorizer
        vectorizer_path = self.base_path / f"{index_name}_vectorizer.pkl"
        logging.info(f"Loading vectorizer from {vectorizer_path}")
        with open(vectorizer_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for similar chunks
        """
        query_embedding = self.model.encode([query])[0]
        distances, indices = self.index.search(query_embedding.reshape(1, -1), k)
        
        results = []
        for i, idx in enumerate(indices[0]):
            if idx != -1:
                result = self.chunk_metadata[idx].copy()
                result['distance'] = float(distances[0][i])
                results.append(result)
        
        return results

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Import necessary libraries
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Import required libraries
from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, Any, Tuple, Optional
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from rank_bm25 import BM25Okapi
import torch
from transformers import AutoTokenizer, AutoModel
import os
import faiss
import json
from sentence_transformers import SentenceTransformer
import pandas as pd

class BaseRetriever(ABC):
    def __init__(self, embedding_manager):
        self.manager = embedding_manager
        self.model = self.manager.model
        self.name = "base"
        self.hierarchical_stats = {
            "queries_processed": 0,
            "hierarchical_improvements": 0
        }
    
    @abstractmethod
    def retrieve(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        pass
    
    def get_stats(self) -> Dict[str, float]:
        """Return retriever stats (to be implemented by retrievers that track metrics)"""
        return {}
    
    def get_hierarchical_stats(self) -> Dict[str, Any]:
        """Return hierarchical exploration statistics"""
        stats = self.hierarchical_stats.copy()
        
        # Calculate improvement rate
        if stats["queries_processed"] > 0:
            stats["improvement_rate"] = stats["hierarchical_improvements"] / stats["queries_processed"]
        else:
            stats["improvement_rate"] = 0.0
            
        return stats
    
    def _apply_hierarchical_exploration(self, results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Apply hierarchical exploration to results"""
        # Track query
        self.hierarchical_stats["queries_processed"] += 1
        
        # Check if there's hierarchical information
        improved = False
        for result in results:
            # Example: Check if this result references a parent or child document
            # that would be valuable to include
            if 'parent_id' in result or 'children_ids' in result:
                result['hierarchical_boost'] = True
                improved = True
            else:
                result['hierarchical_boost'] = False
        
        # Update stats
        if improved:
            self.hierarchical_stats["hierarchical_improvements"] += 1
            
        return results

class ContentRetriever(BaseRetriever):
    """Retriever that uses only content embeddings"""
    
    def __init__(self, embedding_manager):
        super().__init__(embedding_manager)
        self.name = "content"
        self.avg_similarity = 0
        self.query_count = 0
    
    def retrieve(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        start_time = time.time()
        
        # Get query embedding
        query_embedding = self.model.encode([query])[0]
        
        # Search in FAISS index
        distances, indices = self.manager.index.search(query_embedding.reshape(1, -1), k)
        
        # Get results
        results = []
        total_similarity = 0
        
        for i, idx in enumerate(indices[0]):
            if idx != -1:
                result = self.manager.chunk_metadata[str(idx)].copy()
                distance = float(distances[0][i])
                similarity = 1 - distance
                
                # Store scores in a separate dictionary for clarity
                result['scores'] = {
                    'similarity': similarity,
                    'distance': distance
                }
                
                total_similarity += similarity
                results.append(result)
        
        # Update statistics
        if results:
            avg_similarity = total_similarity / len(results)
            self.avg_similarity = ((self.avg_similarity * self.query_count) + avg_similarity) / (self.query_count + 1)
            self.query_count += 1
        
        # Apply hierarchical exploration
        results = self._apply_hierarchical_exploration(results, query)
        
        elapsed = time.time() - start_time
        logging.info(f"Content retrieval took {elapsed:.2f} seconds")
        return results
    
    def get_stats(self) -> Dict[str, float]:
        return {
            "avg_similarity": self.avg_similarity,
            "query_count": self.query_count
        }

class CombinedRetriever(BaseRetriever):
    """Retriever that combines content and metadata with weighted scoring"""
    
    def __init__(self, embedding_manager):
        super().__init__(embedding_manager)
        self.name = "combined"
        self.nlp = spacy.load("en_core_web_sm")
        self.metadata_vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words='english',
            min_df=1
        )
        self._prepare_metadata_vectors()
        self.avg_similarity = 0
        self.avg_metadata_score = 0
        self.query_count = 0
    
    def _prepare_metadata_vectors(self):
        """Prepare metadata vectors for all chunks"""
        metadata_texts = []
        for chunk_id, chunk_data in self.manager.chunk_metadata.items():
            metadata_text = ' '.join(chunk_data.get('topics', []))
            if 'entities' in chunk_data:
                metadata_text += ' ' + ' '.join([e['text'] for e in chunk_data['entities']])
            metadata_texts.append(metadata_text)
        
        self.metadata_vectors = self.metadata_vectorizer.fit_transform(metadata_texts)
    
    def retrieve(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        start_time = time.time()
        
        # Get content-based results
        query_embedding = self.model.encode([query])[0]
        distances, indices = self.manager.index.search(query_embedding.reshape(1, -1), k * 2)
        
        # Extract entities from query
        doc = self.nlp(query)
        query_entities = [ent.text for ent in doc.ents]
        
        # Combine query with entities
        enhanced_query = query + ' ' + ' '.join(query_entities)
        
        # Get query vector
        query_vector = self.metadata_vectorizer.transform([enhanced_query])
        
        # Calculate metadata similarities
        metadata_similarities = (query_vector @ self.metadata_vectors.T).toarray()[0]
        
        # Combine scores
        combined_scores = {}
        
        # Add content-based scores (70% weight)
        max_distance = max(distances[0]) + 1e-6
        for i, idx in enumerate(indices[0]):
            if idx != -1:
                combined_scores[str(idx)] = {
                    'combined': 0.7 * (1 - (distances[0][i] / max_distance)),
                    'content': 1 - (distances[0][i] / max_distance),
                    'metadata': 0.0  # Will be updated if metadata match exists
                }
        
        # Add metadata-based scores (30% weight)
        for idx in range(len(metadata_similarities)):
            metadata_score = metadata_similarities[idx]
            str_idx = str(idx)
            if str_idx in combined_scores:
                combined_scores[str_idx]['metadata'] = metadata_score
                combined_scores[str_idx]['combined'] += 0.3 * metadata_score
            else:
                combined_scores[str_idx] = {
                    'combined': 0.3 * metadata_score,
                    'content': 0.0,  # No content match
                    'metadata': metadata_score
                }
        
        # Sort by combined score and get top k
        top_indices = sorted(combined_scores.items(), key=lambda x: x[1]['combined'], reverse=True)[:k]
        
        # Prepare results
        results = []
        total_combined = 0
        total_metadata = 0
        
        for idx, scores in top_indices:
            result = self.manager.chunk_metadata[idx].copy()
            result['scores'] = scores.copy()
            
            total_combined += scores['combined']
            total_metadata += scores['metadata']
            results.append(result)
        
        # Update statistics
        if results:
            avg_combined = total_combined / len(results)
            avg_metadata = total_metadata / len(results)
            
            self.avg_similarity = ((self.avg_similarity * self.query_count) + avg_combined) / (self.query_count + 1)
            self.avg_metadata_score = ((self.avg_metadata_score * self.query_count) + avg_metadata) / (self.query_count + 1)
            self.query_count += 1
        
        # Apply hierarchical exploration
        results = self._apply_hierarchical_exploration(results, query)
        
        elapsed = time.time() - start_time
        logging.info(f"Combined retrieval took {elapsed:.2f} seconds")
        return results
    
    def get_stats(self) -> Dict[str, float]:
        return {
            "avg_similarity": self.avg_similarity,
            "avg_metadata_score": self.avg_metadata_score,
            "query_count": self.query_count
        }

class HybridRetriever(BaseRetriever):
    """Retriever that applies reranking to combined results"""
    
    def __init__(self, embedding_manager, reranker_model=None):
        super().__init__(embedding_manager)
        self.name = "hybrid"
        self.base_retriever = CombinedRetriever(embedding_manager)
        self.reranker = reranker_model if reranker_model else RerankerModel()
        self.avg_reranker_score = 0
        self.query_count = 0
        self.candidate_pool_size = 100  # Number of candidates to get from base retriever
    
    def retrieve(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        start_time = time.time()
        
        # Determine how many documents to return after reranking
        if k <= 3:
            top_k = 2
        elif k <= 5:
            top_k = 3
        else:
            top_k = 4
        
        # Get a larger candidate pool from base retriever
        candidates = self.base_retriever.retrieve(query, k=self.candidate_pool_size)
        
        # Rerank results
        reranked_results = self.reranker.rerank(query, candidates, top_k)
        
        # Update statistics
        if reranked_results:
            total_score = sum([doc['scores'].get('reranker', 0) for doc in reranked_results])
            avg_score = total_score / len(reranked_results)
            self.avg_reranker_score = ((self.avg_reranker_score * self.query_count) + avg_score) / (self.query_count + 1)
            self.query_count += 1
        
        # Apply hierarchical exploration
        reranked_results = self._apply_hierarchical_exploration(reranked_results, query)
        
        elapsed = time.time() - start_time
        logging.info(f"Hybrid reranked retrieval took {elapsed:.2f} seconds")
        return reranked_results
    
    def get_stats(self) -> Dict[str, float]:
        base_stats = self.base_retriever.get_stats()
        return {
            **base_stats,
            "avg_reranker_score": self.avg_reranker_score,
            "query_count": self.query_count
        }

class RerankerModel:
    """Wrapper for a cross-encoder reranker model"""
    
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        
        # Move model to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
    
    def rerank(self, query: str, documents: List[Dict[str, Any]], top_k: Optional[int] = None) -> List[Dict[str, Any]]:
        """Rerank documents based on relevance to query"""
        if not documents:
            return []
        
        # Prepare document contents
        doc_contents = [doc.get('content', '') for doc in documents]
        
        # Create input pairs
        pairs = []
        for doc_content in doc_contents:
            pairs.append([query, doc_content])
        
        # Tokenize
        with torch.no_grad():
            # Process in batches to avoid OOM
            batch_size = 8
            all_scores = []
            
            for i in range(0, len(pairs), batch_size):
                batch_pairs = pairs[i:i+batch_size]
                features = self.tokenizer(batch_pairs, padding=True, truncation=True, 
                                          return_tensors="pt", max_length=512)
                features = {k: v.to(self.device) for k, v in features.items()}
                
                # Get embeddings
                outputs = self.model(**features)
                embeddings = outputs.last_hidden_state[:, 0, :]  # Use CLS token
                
                # Calculate scores
                scores = torch.nn.functional.cosine_similarity(
                    embeddings[:, None, :], embeddings[None, :, :], dim=-1
                ).cpu().numpy()
                
                # Get diagonal scores (similarity between query and document)
                for j in range(len(batch_pairs)):
                    all_scores.append(float(scores[j, j]))
        
        # Add scores to documents
        for i, doc in enumerate(documents):
            if 'scores' not in doc:
                doc['scores'] = {}
            doc['scores']['reranker'] = all_scores[i]
            # Update combined score with reranker (give it high weight)
            if 'combined' in doc['scores']:
                doc['scores']['final'] = 0.3 * doc['scores']['combined'] + 0.7 * all_scores[i]
            else:
                doc['scores']['final'] = all_scores[i]
        
        # Sort by reranker score
        reranked_docs = sorted(documents, key=lambda x: x['scores'].get('final', 0), reverse=True)
        
        # Return top-k if specified
        if top_k and top_k < len(reranked_docs):
            return reranked_docs[:top_k]
        
        return reranked_docs

class EmbeddingManager:
    """Manager for embeddings and document metadata"""
    
    def __init__(self, model_name="all-MiniLM-L6-v2", base_path=None):
        # Load embedding model
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        
        # Initialize empty index and metadata
        self.index = None
        self.chunk_metadata = {}
        
        # Load from base_path if provided
        if base_path:
            self.load(base_path)
    
    def load(self, base_path):
        """Load index and metadata from base path"""
        index_path = os.path.join(base_path, "s3.index")
        metadata_path = os.path.join(base_path, "s3_metadata.pkl")
        
        # Load FAISS index
        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
        else:
            logging.warning(f"Index file not found at {index_path}, initializing empty index")
            self.index = faiss.IndexFlatL2(self.dimension)
        
        # Load metadata
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                self.chunk_metadata = json.load(f)
        else:
            logging.warning(f"Metadata file not found at {metadata_path}, initializing empty metadata")
            self.chunk_metadata = {}

class RetrieverFactory:
    """Factory class to create retrievers"""
    
    @staticmethod
    def create_retrievers(model_name="all-MiniLM-L6-v2", reranker_model="cross-encoder/ms-marco-MiniLM-L-6-v2", 
                         candidate_pool_size=100, base_path=None):
        """Create all retrievers"""
        # Create embedding manager
        manager = EmbeddingManager(model_name, base_path)
        
        # Create individual retrievers
        content = ContentRetriever(manager)
        combined = CombinedRetriever(manager)
        
        # Create reranker model
        reranker = RerankerModel(reranker_model)
        
        # Create hybrid retriever
        hybrid = HybridRetriever(manager, reranker)
        hybrid.candidate_pool_size = candidate_pool_size
        
        return {
            'content': content,
            'combined': combined,
            'hybrid': hybrid
        }


In [10]:

# Create all three retrievers
retrievers = RetrieverFactory.create_retrievers(
    model_name='all-MiniLM-L6-v2',
    reranker_model='cross-encoder/ms-marco-MiniLM-L-6-v2',
    candidate_pool_size=60,
    base_path='FSAII'
)

# Example query
query = "What are the prerequisites for locking a vault in S3 Glacier, and how does this feature enforce compliance?"

# Function to display results with detailed scores
def display_results(retriever_name, results):
    print(f"\n\n=== {retriever_name.upper()} RETRIEVER RESULTS ===")
    print(f"Top {len(results)} results:")
    
    for i, result in enumerate(results):
        print(f"\n{i+1}. {result.get('id', result.get('chunk_id', 'unknown'))}")
        
        # Print scores
        if 'scores' in result:
            print("   Scores:")
            for score_type, score_value in result['scores'].items():
                print(f"     {score_type}: {score_value:.4f}" if isinstance(score_value, float) 
                      else f"     {score_type}: {score_value}")
        
        # Print hierarchical information
        print(f"   Hierarchical boost: {result.get('hierarchical_boost', False)}")
        
        # Print content snippet
        content = result.get('content', '')
        if content:
            print(f"   Content snippet: {content[:100]}...")


# Try content retriever
content_results = retrievers['content'].retrieve(query, k=5)
display_results('Content', content_results)

# Try combined retriever
combined_results = retrievers['combined'].retrieve(query, k=5)
display_results('Combined', combined_results)

# Try hybrid retriever
hybrid_results = retrievers['hybrid'].retrieve(query, k=5)
display_results('Hybrid', hybrid_results)

# Compare hierarchical improvement stats
print("\n\n=== HIERARCHICAL EXPLORATION STATS ===")
for name, retriever in retrievers.items():
    stats = retriever.get_hierarchical_stats()
    print(f"\n{name.upper()} RETRIEVER:")
    print(f"  Queries processed: {stats['queries_processed']}")
    print(f"  Improvements: {stats['hierarchical_improvements']}")
    print(f"  Improvement rate: {stats['improvement_rate']:.2%}")

2025-03-03 15:50:31,691 - INFO - Use pytorch device_name: cpu
2025-03-03 15:50:31,691 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


ValueError: empty vocabulary; perhaps the documents only contain stop words