# **Implementing a Semantic Search Engine with FAISS for Legal Documents**

### **Setup and Import Dependencies**

In [1]:
# Import libraries
import os
import numpy as np
import pandas as pd
import faiss
import torch
import docx
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import pickle

### **Load and Prepare Documents**


The process begins with document ingestion and preparation:

- **Loading Documents**: The system loads DOCX files containing various legal documents
- **Text Extraction**: Text is extracted from each document using the docx library
- **Text Preprocessing**: Raw text is cleaned by removing extra whitespace
- **Document Organization**: Documents are categorized (Employment Contracts, Privacy Policies, etc.)

In [2]:
# Function to load document data (assuming you have the documents DataFrame from previous work)
def load_document_data(data_path='./model_output/documents.pkl'):
    """Load the documents DataFrame saved from previous steps."""
    try:
        return pd.read_pickle(data_path)
    except FileNotFoundError:
        print(f"Could not find documents file at {data_path}")
        return None

# Function to extract text from DOCX files
def extract_text_from_docx(file_path):
    """Extract text content from a .docx file."""
    try:
        doc = docx.Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            if para.text.strip():  # Skip empty paragraphs
                full_text.append(para.text)
        return '\n'.join(full_text)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return ""

# Function to preprocess text
def preprocess_text(text):
    """Clean and preprocess text."""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase (optional for legal documents, as case might be significant)
    # text = text.lower()
    return text

# Load documents DataFrame
docs_df = load_document_data()

# If the DataFrame doesn't exist or we need to create a new one from raw files
if docs_df is None:
    # Define document categories (same as in your classifier)
    document_categories = {
        "Employment Contracts": ["General Employment Agreement", "Non-Compete Clause", 
                               "Confidentiality Agreement", "Termination Conditions", 
                               "Intellectual Property Rights"],
        "Privacy Policies": ["General Privacy Policy", "Data Handling and Retention Policy", 
                           "Cookie Policy", "Employee Data Protection Agreement", 
                           "Customer Data Consent Form"],
        "Corporate Governance": ["Bylaws Articles of Association", "Board of Directors Responsibilities", 
                               "Shareholders Agreement", "Conflict of Interest Policy", 
                               "Code of Ethics"],
        "Commercial Agreements": ["Vendor Agreement", "Sales Contract", 
                                "Non-Disclosure Agreement", "Service Level Agreement", 
                                "Partnership Agreement"],
        "Health and Safety": ["Workplace Safety Policy", "Emergency Response Plan", 
                            "Employee Health and Safety Acknowledgment Form", "Accident Reporting Procedure", 
                            "Hazardous Materials Handling Policy"]
    }
    
    # Create documents DataFrame
    documents = []
    doc_id = 0
    data_dir = '../data'  # Change to your documents directory
    
    for category, doc_names in document_categories.items():
        for doc_name in doc_names:
            # Construct filename
            safe_name = doc_name.replace(' ', '_')
            file_path = os.path.join(data_dir, f"{safe_name}.docx")
            
            # Skip if file doesn't exist
            if not os.path.exists(file_path):
                print(f"Warning: File not found: {file_path}")
                continue
                
            # Extract and preprocess text
            text = extract_text_from_docx(file_path)
            text = preprocess_text(text)
            
            # Add document info to list
            documents.append({
                'id': doc_id,
                'name': doc_name,
                'category': category,
                'text': text,
                'file_path': file_path
            })
            doc_id += 1
    
    docs_df = pd.DataFrame(documents)
    print(f"Created new documents DataFrame with {len(docs_df)} documents")
else:
    print(f"Loaded existing documents DataFrame with {len(docs_df)} documents")

# Display sample documents
docs_df[['id', 'name', 'category']].head()

Could not find documents file at ./model_output/documents.pkl
Created new documents DataFrame with 25 documents


Unnamed: 0,id,name,category
0,0,General Employment Agreement,Employment Contracts
1,1,Non-Compete Clause,Employment Contracts
2,2,Confidentiality Agreement,Employment Contracts
3,3,Termination Conditions,Employment Contracts
4,4,Intellectual Property Rights,Employment Contracts


### **Create Chunked Document Segments for Better Search**

Long documents are split into manageable chunks:

- Documents are divided into smaller, overlapping segments (~200 words each)
- Each chunk maintains metadata linking back to its source document
- This chunking approach increases search precision by focusing on relevant sections

In [3]:
def chunk_documents(docs_df, chunk_size=200, overlap=50):
    """
    Split documents into smaller overlapping chunks for more precise search results.
    
    Args:
        docs_df: DataFrame with document data
        chunk_size: Approximate number of words per chunk
        overlap: Number of words overlap between chunks
    
    Returns:
        DataFrame with document chunks
    """
    chunks = []
    chunk_id = 0
    
    for _, doc in tqdm(docs_df.iterrows(), total=len(docs_df), desc="Chunking documents"):
        # Split document text into words
        words = doc['text'].split()
        
        # Skip if document is too short
        if len(words) < 20:  # Arbitrary minimum size
            chunks.append({
                'chunk_id': chunk_id,
                'doc_id': doc['id'],
                'doc_name': doc['name'],
                'category': doc['category'],
                'text': doc['text'],
                'start_idx': 0,
                'end_idx': len(words)
            })
            chunk_id += 1
            continue
        
        # Create overlapping chunks
        for i in range(0, len(words), chunk_size - overlap):
            # Get chunk words
            chunk_words = words[i:i + chunk_size]
            
            # Skip if chunk is too small (last chunk might be)
            if len(chunk_words) < min(20, chunk_size // 4):
                continue
                
            # Join words back into text
            chunk_text = ' '.join(chunk_words)
            
            # Add chunk to list
            chunks.append({
                'chunk_id': chunk_id,
                'doc_id': doc['id'],
                'doc_name': doc['name'],
                'category': doc['category'],
                'text': chunk_text,
                'start_idx': i,
                'end_idx': i + len(chunk_words)
            })
            chunk_id += 1
    
    # Create DataFrame of chunks
    chunks_df = pd.DataFrame(chunks)
    print(f"Created {len(chunks_df)} chunks from {len(docs_df)} documents")
    return chunks_df

# Create document chunks
chunks_df = chunk_documents(docs_df)

# Display sample chunks
chunks_df[['chunk_id', 'doc_id', 'doc_name', 'category']].head()

Chunking documents: 100%|██████████| 25/25 [00:00<00:00, 3672.13it/s]

Created 56 chunks from 25 documents





Unnamed: 0,chunk_id,doc_id,doc_name,category
0,0,0,General Employment Agreement,Employment Contracts
1,1,0,General Employment Agreement,Employment Contracts
2,2,1,Non-Compete Clause,Employment Contracts
3,3,1,Non-Compete Clause,Employment Contracts
4,4,2,Confidentiality Agreement,Employment Contracts


### **Generate Embeddings for Document Chunks**

The system converts text into vector representations:

- **Model Selection**: Uses the Sentence Transformers model (`all-MiniLM-L6-v2`)
- **Vectorization**: Each text chunk is transformed into a dense embedding vector
- These embeddings capture semantic meaning, enabling similarity-based search

In [4]:
def generate_embeddings(chunks_df, model_name='all-MiniLM-L6-v2'):
    """
    Generate embeddings for all document chunks using Sentence Transformers.
    
    Args:
        chunks_df: DataFrame with document chunks
        model_name: Name of the sentence-transformers model to use
    
    Returns:
        Tuple of (chunks_df with embeddings column, embeddings matrix)
    """
    # Load sentence transformer model
    print(f"Loading model: {model_name}")
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for all chunks
    print("Generating embeddings...")
    texts = chunks_df['text'].tolist()
    embeddings = model.encode(texts, show_progress_bar=True)
    
    # Convert to float32 for FAISS
    embeddings = embeddings.astype(np.float32)
    
    # Add embeddings to DataFrame
    chunks_df['embedding'] = list(embeddings)
    
    print(f"Generated embeddings with shape: {embeddings.shape}")
    return chunks_df, embeddings

# Generate embeddings for document chunks
chunks_df, embeddings_matrix = generate_embeddings(chunks_df)

# Check embedding dimensions
print(f"Embedding dimension: {len(chunks_df['embedding'].iloc[0])}")

Loading model: all-MiniLM-L6-v2
Generating embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (56, 384)
Embedding dimension: 384


### **Build and Configure FAISS Index**

FAISS (Facebook AI Similarity Search) provides efficient similarity search:

- The embedding vectors are normalized and added to a FAISS index
- Different index types are available `(flat, IVF, HNSW)` based on dataset size
- The index enables fast vector similarity searches

In [6]:
def build_faiss_index(embeddings, index_type='flat'):
    """
    Build a FAISS index for fast similarity search.
    
    Args:
        embeddings: Matrix of document embeddings
        index_type: Type of FAISS index to use ('flat', 'ivf', or 'hnsw')
    
    Returns:
        FAISS index
    """
    # Get embedding dimension
    d = embeddings.shape[1]
    
    # Normalize embeddings for cosine similarity
    normalized_embeddings = normalize(embeddings, axis=1, norm='l2')
    
    if index_type == 'flat':
        # Flat index - exact search, most accurate but slowest for large datasets
        index = faiss.IndexFlatIP(d)  # Inner product for cosine similarity with normalized vectors
    
    elif index_type == 'ivf':
        # IVF index - approximate search, faster than flat
        nlist = min(64, int(embeddings.shape[0] / 10))  # Number of clusters
        quantizer = faiss.IndexFlatIP(d)
        index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
        # Need to train IVF index
        index.train(normalized_embeddings)
    
    elif index_type == 'hnsw':
        # HNSW index - approximate search, very fast and memory efficient
        index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)  # 32 connections per node
    
    else:
        raise ValueError(f"Unknown index type: {index_type}")
    
    # Add vectors to index
    index.add(normalized_embeddings)
    
    print(f"Built {index_type} FAISS index with {index.ntotal} vectors")
    return index

# Build FAISS index
# For small datasets (<10K documents), 'flat' is best
# For larger datasets, consider 'ivf' or 'hnsw'
index_type = 'flat'  # Choose based on your dataset size
faiss_index = build_faiss_index(embeddings_matrix, index_type)

Built flat FAISS index with 56 vectors


### **Create Search Functions**

The search process involves:

- Converting a query into the same vector space using the same embedding model
- Finding the most similar document chunks in the FAISS index
- Deduplicating results to show only the most relevant section from each document
- Displaying results with relevance scores and text snippets

In [7]:
def search_documents(query, index, chunks_df, model, top_k=5, threshold=0.6):
    """
    Search for documents similar to the query.
    
    Args:
        query: Search query text
        index: FAISS index
        chunks_df: DataFrame with document chunks and metadata
        model: SentenceTransformer model for encoding the query
        top_k: Number of results to return
        threshold: Minimum similarity score threshold
    
    Returns:
        DataFrame with search results
    """
    # Encode query
    query_embedding = model.encode([query])[0].astype(np.float32)
    
    # Normalize query vector for cosine similarity
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    
    # Search index
    distances, indices = index.search(np.array([query_embedding]), top_k)
    
    # Extract results
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        if idx != -1 and dist >= threshold:  # Skip invalid results and low scores
            chunk = chunks_df.iloc[idx]
            results.append({
                'rank': i + 1,
                'score': float(dist),
                'chunk_id': int(chunk['chunk_id']),
                'doc_id': int(chunk['doc_id']),
                'doc_name': chunk['doc_name'],
                'category': chunk['category'],
                'text_snippet': chunk['text'][:300] + '...' if len(chunk['text']) > 300 else chunk['text']
            })
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

def search_with_document_frequency(query, index, chunks_df, model, top_k=5, threshold=0.6, initial_results=20):
    """
    Enhanced search function that accounts for document frequency in search results.
    Documents with multiple high-scoring chunks get boosted in ranking.
    
    Args:
        query: Search query text
        index: FAISS index
        chunks_df: DataFrame with document chunks and metadata
        model: SentenceTransformer model
        top_k: Number of final results to return
        threshold: Minimum similarity score threshold
        initial_results: Number of initial chunks to retrieve
        
    Returns:
        DataFrame with search results, with documents having multiple hits prioritized
    """
    # Get more initial results to account for document frequency
    results = search_documents(query, index, chunks_df, model, top_k=initial_results, threshold=threshold)
    
    if len(results) == 0:
        return pd.DataFrame()
    
    # Calculate document frequency and aggregate scores
    doc_scores = {}
    for doc_id in results['doc_id'].unique():
        # Get all chunks from this document
        doc_chunks = results[results['doc_id'] == doc_id]
        
        # Calculate aggregate score using both score and frequency
        # Log frequency prevents documents with many low-relevance chunks from dominating
        frequency_boost = 1 + np.log1p(len(doc_chunks))
        avg_score = doc_chunks['score'].mean()
        max_score = doc_chunks['score'].max()
        
        # Weighted score - emphasizes both high max score and multiple occurrences
        doc_scores[doc_id] = (0.7 * max_score + 0.3 * avg_score) * frequency_boost
    
    # Rank documents by their overall score
    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    
    # Prepare final results - include at least one chunk from each top document
    final_results = []
    used_chunk_ids = set()
    
    # First, add the best chunk from each top document
    for doc_id, doc_score in ranked_docs:
        doc_chunks = results[results['doc_id'] == doc_id].sort_values('score', ascending=False)
        if len(doc_chunks) > 0:
            best_chunk = doc_chunks.iloc[0]
            final_results.append({
                'doc_id': int(best_chunk['doc_id']),
                'chunk_id': int(best_chunk['chunk_id']),
                'doc_name': best_chunk['doc_name'],
                'category': best_chunk['category'],
                'text_snippet': best_chunk['text_snippet'],
                'score': float(best_chunk['score']),
                'doc_frequency': len(doc_chunks),
                'aggregate_score': doc_score,
                'rank': 0  # Will be updated later
            })
            used_chunk_ids.add(best_chunk['chunk_id'])
    
    # Create final DataFrame
    final_df = pd.DataFrame(final_results)
    final_df['rank'] = range(1, len(final_df) + 1)
    
    return final_df

def search_and_deduplicate(query, index, chunks_df, model, top_k=5, threshold=0.6):
    """
    Search for documents and deduplicate by document ID.
    Returns only the highest-scoring chunk for each document.
    """
    # Get raw search results
    results = search_with_document_frequency(query, index, chunks_df, model, top_k=top_k*2, threshold=threshold)
    
    if len(results) == 0:
        return results
    
    # Deduplicate by doc_id, keeping highest score
    results = results.sort_values('score', ascending=False)
    deduped_results = results.drop_duplicates(subset=['doc_id']).head(top_k)
    
    # Reset rank
    deduped_results['rank'] = range(1, len(deduped_results) + 1)
    
    return deduped_results

# Load the model used for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Test search function with a sample query
sample_query = "What are the terms of confidentiality in agreements?"
search_results = search_and_deduplicate(sample_query, faiss_index, chunks_df, model)

# Display search results
if len(search_results) > 0:
    print(f"Found {len(search_results)} results for query: '{sample_query}'")
    display(search_results[['rank', 'score', 'doc_name', 'category']])
else:
    print(f"No results found for query: '{sample_query}'")

Found 2 results for query: 'What are the terms of confidentiality in agreements?'


Unnamed: 0,rank,score,doc_name,category
0,1,0.637336,Non-Disclosure Agreement,Commercial Agreements
1,2,0.634343,Confidentiality Agreement,Employment Contracts


### **Create a Search Evaluation Function**

In [8]:
def evaluate_search(test_queries, index, chunks_df, model, ground_truth=None):
    """
    Evaluate search performance on a set of test queries.
    
    Args:
        test_queries: List of query strings or dict mapping queries to expected doc_ids
        index: FAISS index
        chunks_df: DataFrame with document chunks
        model: SentenceTransformer model
        ground_truth: Optional dict mapping queries to expected doc_ids
    
    Returns:
        Dict with evaluation metrics
    """
    results = {}
    
    # Convert simple list to dict if needed
    if isinstance(test_queries, list) and ground_truth is None:
        test_queries = {q: None for q in test_queries}
    elif ground_truth is not None:
        test_queries = ground_truth
    
    # Run searches
    for query, expected_docs in test_queries.items():
        search_result = search_and_deduplicate(query, index, chunks_df, model, top_k=5)
        
        if len(search_result) > 0:
            results[query] = {
                'found': len(search_result),
                'top_doc': search_result.iloc[0]['doc_name'],
                'top_category': search_result.iloc[0]['category'],
                'top_score': search_result.iloc[0]['score'],
                'found_docs': search_result['doc_name'].tolist(),
                'expected_found': None
            }
            
            # Check if expected documents were found
            if expected_docs is not None:
                expected_found = [doc in search_result['doc_id'].tolist() for doc in expected_docs]
                results[query]['expected_found'] = sum(expected_found) / len(expected_docs)
        else:
            results[query] = {
                'found': 0,
                'top_doc': None,
                'top_category': None,
                'top_score': None,
                'found_docs': [],
                'expected_found': 0 if expected_docs else None
            }
    
    # Calculate overall metrics
    metrics = {
        'total_queries': len(test_queries),
        'queries_with_results': sum(1 for r in results.values() if r['found'] > 0),
        'avg_results_per_query': np.mean([r['found'] for r in results.values()]),
        'avg_top_score': np.mean([r['top_score'] for r in results.values() if r['top_score'] is not None]),
    }
    
    if all(r['expected_found'] is not None for r in results.values()):
        metrics['avg_recall'] = np.mean([r['expected_found'] for r in results.values()])
    
    return results, metrics

# Define some test queries
test_queries = [
    "Who owns the intellectual property created by employees?",
    "What is the company's cookie policy?",
    "How should workplace accidents be reported?",
    "What are the board of directors' responsibilities?"
    
]

# Evaluate search performance
query_results, metrics = evaluate_search(test_queries, faiss_index, chunks_df, model)

# Display evaluation metrics
print("Search Evaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value}")

# Display individual query results
for query, result in query_results.items():
    print(f"\nQuery: {query}")
    print(f"Top document: {result['top_doc']} ({result['top_category']})")
    print(f"Score: {result['top_score']:.4f}")
    print(f"Found documents: {result['found_docs']}")

Search Evaluation Metrics:
total_queries: 4
queries_with_results: 4
avg_results_per_query: 1.5
avg_top_score: 0.6651738286018372

Query: Who owns the intellectual property created by employees?
Top document: Intellectual Property Rights (Employment Contracts)
Score: 0.6086
Found documents: ['Intellectual Property Rights']

Query: What is the company's cookie policy?
Top document: Cookie Policy (Privacy Policies)
Score: 0.6956
Found documents: ['Cookie Policy']

Query: How should workplace accidents be reported?
Top document: Accident Reporting Procedure (Health and Safety)
Score: 0.6160
Found documents: ['Accident Reporting Procedure', 'Workplace Safety Policy']

Query: What are the board of directors' responsibilities?
Top document: Board of Directors Responsibilities (Corporate Governance)
Score: 0.7405
Found documents: ['Board of Directors Responsibilities', 'Bylaws Articles of Association']


### **Create a Search Reranking Function**

In [9]:
def rerank_search_results(query, results_df, chunks_df, model, top_k=5):
    """
    Rerank search results using more sophisticated techniques.
    This is useful for improving precision when basic vector similarity isn't enough.
    
    Args:
        query: Original search query
        results_df: Initial search results
        chunks_df: DataFrame with document chunks
        model: SentenceTransformer model
        top_k: Number of results to return after reranking
        
    Returns:
        DataFrame with reranked results
    """
    if len(results_df) == 0:
        return results_df
    
    # Get document chunks for all results
    chunks = []
    for _, row in results_df.iterrows():
        chunk_id = row['chunk_id']
        chunk_text = chunks_df.loc[chunks_df['chunk_id'] == chunk_id, 'text'].iloc[0]
        chunks.append(chunk_text)
    
    # Use cross-encoder for reranking if available
    try:
        from sentence_transformers import CrossEncoder
        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        
        # Create query-document pairs
        pairs = [[query, chunk] for chunk in chunks]
        
        # Score pairs
        scores = cross_encoder.predict(pairs)
        
        # Add new scores to results
        results_df['rerank_score'] = scores
        
        # Sort by new scores
        reranked_df = results_df.sort_values('rerank_score', ascending=False).head(top_k)
        
        # Reset rank
        reranked_df['rank'] = range(1, len(reranked_df) + 1)
        
        return reranked_df
    
    except ImportError:
        print("CrossEncoder not available, skipping reranking")
        return results_df.head(top_k)

# This is optional - only run if you have the cross-encoder package installed
# Try reranking results for a sample query
try:
    sample_query = "What are the confidentiality terms for vendors?"
    initial_results = search_documents(sample_query, faiss_index, chunks_df, model, top_k=10)
    
    if len(initial_results) > 0:
        reranked_results = rerank_search_results(sample_query, initial_results, chunks_df, model)
        
        print(f"Initial vs Reranked Results for '{sample_query}':")
        print("\nInitial Top 3:")
        display(initial_results[['rank', 'score', 'doc_name']].head(3))
        
        print("\nReranked Top 3:")
        display(reranked_results[['rank', 'rerank_score', 'doc_name']].head(3))
    else:
        print(f"No results found for query: '{sample_query}'")
except Exception as e:
    print(f"Skipping reranking example: {str(e)}")

Initial vs Reranked Results for 'What are the confidentiality terms for vendors?':

Initial Top 3:


Unnamed: 0,rank,score,doc_name
0,1,0.66088,Non-Disclosure Agreement



Reranked Top 3:


Unnamed: 0,rank,rerank_score,doc_name
0,1,-2.502943,Non-Disclosure Agreement


### **Create a Document Context Builder Function for RAG**

The Retrieval-Augmented Generation (RAG) chatbot:

- Takes a user question and converts it to an embedding
- Retrieves the most relevant document chunks from the index
- Builds a context from these retrieved chunks
- Passes this context along with the question to an LLM (GitHub's OpenAI-based service)
- The LLM generates an answer grounded in the retrieved documents

In [10]:
def build_improved_context(query, chunks_df, index, model, max_chunks=8, max_tokens=2000):
    """
    Build a context from relevant document chunks for RAG applications.
    This improved version:
    1. Retrieves more initial chunks
    2. Considers both individual chunk relevance and document frequency
    3. Includes multiple chunks from the same document if they're relevant
    
    Args:
        query: User query
        chunks_df: DataFrame with document chunks
        index: FAISS index
        model: SentenceTransformer model
        max_chunks: Maximum number of chunks to include
        max_tokens: Approximate maximum number of tokens in context
        
    Returns:
        Tuple of (context string, list of source documents)
    """
    # Get a larger initial set of chunks (3x the final desired amount)
    # Use search_documents directly to avoid document-level deduplication
    initial_results = search_documents(query, index, chunks_df, model, top_k=max_chunks*3, threshold=0.5)
    
    if len(initial_results) == 0:
        return "No relevant documents found.", []
    
    # Calculate document frequency
    doc_counts = initial_results['doc_id'].value_counts()
    
    # Add document frequency information to results
    initial_results['doc_frequency'] = initial_results['doc_id'].map(doc_counts)
    
    # Calculate a combined relevance score that considers:
    # 1. Original semantic similarity score (70%)
    # 2. Document frequency boost (30%)
    initial_results['combined_score'] = (
        0.7 * initial_results['score'] + 
        0.3 * np.log1p(initial_results['doc_frequency']) * initial_results['score']
    )
    
    # Sort by combined score to get the most relevant chunks
    # regardless of which document they come from
    ranked_chunks = initial_results.sort_values('combined_score', ascending=False)
    
    # Build context from top chunks
    context_parts = []
    source_docs = set()  # Track unique source documents
    token_count = 0
    word_to_token_ratio = 0.75
    
    # Take chunks based on combined score until we hit our limits
    for _, row in ranked_chunks.iterrows():
        if len(context_parts) >= max_chunks:
            break
            
        chunk_id = row['chunk_id']
        doc_id = row['doc_id']
        doc_name = row['doc_name']
        category = row['category']
        
        # Get full chunk text
        chunk_text = chunks_df.loc[chunks_df['chunk_id'] == chunk_id, 'text'].iloc[0]
        
        # Estimate token count
        words = len(chunk_text.split())
        estimated_tokens = int(words * word_to_token_ratio)
        
        # Skip if adding this would exceed token limit
        if token_count + estimated_tokens > max_tokens:
            continue
            
        # Add chunk to context
        context_parts.append(f"Document: {doc_name} (Category: {category})\n{chunk_text}\n\n")
        token_count += estimated_tokens
        
        # Track source documents for attribution
        source_docs.add(doc_name)
    
    # Combine all parts
    full_context = "".join(context_parts)
    
    return full_context, list(source_docs)

# Test context builder with a sample query
sample_query = "What are my responsibilities regarding workplace safety?"
context , source_docs = build_improved_context(sample_query, chunks_df, faiss_index, model)

print(f"Context for query: '{sample_query}'")
print("-" * 80)
print(context[:1000] + "..." if len(context) > 1000 else context)
print("-" * 80)
print(f"Source documents: {source_docs}")


Context for query: 'What are my responsibilities regarding workplace safety?'
--------------------------------------------------------------------------------
Document: Workplace Safety Policy (Category: Health and Safety)
ABS Company is committed to providing a safe and healthy work environment for all employees in compliance with applicable occupational health and safety laws, including OSHA regulations (or relevant local standards). Our Workplace Safety Policy outlines the general safety standards, employee responsibilities, and reporting procedures to ensure risk is minimized and everyone’s well-being is safeguarded. General Safety Standards All employees must follow established safety procedures and wear appropriate personal protective equipment (PPE) while performing tasks. Work areas must be kept clean, organized, and free of hazards. Safety signs, emergency exits, and equipment must always be accessible and clearly marked. Machinery and electrical equipment must be operated onl

### **Create a Simple Search Interface Function**

In [11]:
def search_interface(query, chunks_df, index, model, show_snippets=True):
    """
    Simple search interface function for displaying search results.
    
    Args:
        query: Search query
        chunks_df: DataFrame with document chunks
        index: FAISS index
        model: SentenceTransformer model
        show_snippets: Whether to show text snippets
        
    Returns:
        DataFrame with search results
    """
    print(f"Searching for: '{query}'")
    print("-" * 80)
    
    # Search for documents
    results = search_and_deduplicate(query, index, chunks_df, model)
    
    if len(results) == 0:
        print("No results found.")
        return pd.DataFrame()
    
    print(f"Found {len(results)} relevant documents:")
    
    # Display results
    for _, row in results.iterrows():
        print(f"{row['rank']}. {row['doc_name']} ({row['category']}) - Score: {row['score']:.4f}")
        
        if show_snippets:
            snippet = row['text_snippet']
            # Highlight query terms (simple approach)
            for term in query.lower().split():
                if len(term) > 3:  # Only highlight meaningful terms
                    pattern = re.compile(re.escape(term), re.IGNORECASE)
                    snippet = pattern.sub(f"**{term.upper()}**", snippet)
            
            print(f"   Snippet: {snippet}")
            print()
    
    return results

# Test search interface with a few queries
test_queries = [
    "What are the intellectual property rights for employee created work?",
    "How do we handle cookie consent for website visitors?",
    "What steps should be taken after a workplace accident?",
]

for query in test_queries:
    results = search_interface(query, chunks_df, faiss_index, model)
    print("\n" + "=" * 80 + "\n")

Searching for: 'What are the intellectual property rights for employee created work?'
--------------------------------------------------------------------------------
Found 1 relevant documents:
1. Intellectual Property Rights (Employment Contracts) - Score: 0.6344
   Snippet: This **INTELLECTUAL** **PROPERTY** **RIGHTS** Clause (“Clause”) forms an integral part of the employment agreement between ABS Company (“Employer”) and Michael Green (“**EMPLOYEE**”), collectively referred to as the “Parties.” 1. Ownership of **INTELLECTUAL** **PROPERTY** Michael Green acknowledges and agrees that all inve...



Searching for: 'How do we handle cookie consent for website visitors?'
--------------------------------------------------------------------------------
Found 1 relevant documents:
1. Cookie Policy (Privacy Policies) - Score: 0.6395
   Snippet: This **COOKIE** Policy explains how ABS Company (“we”, “us”, or “our”) uses **COOKIE**s and similar tracking technologies on our **WEBSITE** to enh

### **Save and Load the Search Engine**

In [12]:
def save_search_engine(index, chunks_df, output_dir='../search_engine'):
    """
    Save the search engine components to disk.
    
    Args:
        index: FAISS index
        chunks_df: DataFrame with document chunks
        output_dir: Directory to save files
    """
    import os
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Save FAISS index
    faiss.write_index(index, os.path.join(output_dir, 'document_index.faiss'))
    
    # Save chunks DataFrame (without embeddings to save space)
    save_df = chunks_df.copy()
    if 'embedding' in save_df.columns:
        save_df.drop('embedding', axis=1, inplace=True)
    save_df.to_pickle(os.path.join(output_dir, 'document_chunks.pkl'))
    
    # Save embeddings separately
    embeddings = np.vstack(chunks_df['embedding'].to_numpy())
    np.save(os.path.join(output_dir, 'embeddings.npy'), embeddings)
    
    print(f"Search engine saved to {output_dir}")

def load_search_engine(input_dir='../search_engine'):
    """
    Load the search engine components from disk.
    
    Args:
        input_dir: Directory with saved files
        
    Returns:
        Tuple of (FAISS index, chunks DataFrame)
    """
    import os
    
    # Check if directory exists
    if not os.path.exists(input_dir):
        raise FileNotFoundError(f"Directory not found: {input_dir}")
    
    # Load FAISS index
    index = faiss.read_index(os.path.join(input_dir, 'document_index.faiss'))
    
    # Load chunks DataFrame
    chunks_df = pd.read_pickle(os.path.join(input_dir, 'document_chunks.pkl'))
    
    # Load embeddings
    embeddings = np.load(os.path.join(input_dir, 'embeddings.npy'))
    
    # Add embeddings back to DataFrame
    chunks_df['embedding'] = list(embeddings)
    
    print(f"Loaded search engine with {len(chunks_df)} chunks and {index.ntotal} vectors")
    return index, chunks_df

# Save the search engine
save_search_engine(faiss_index, chunks_df)

# Test loading the search engine (optional)
try:
    loaded_index, loaded_chunks_df = load_search_engine()
    print("Successfully loaded search engine!")
except Exception as e:
    print(f"Error loading search engine: {e}")

Search engine saved to ../search_engine
Loaded search engine with 56 chunks and 56 vectors
Successfully loaded search engine!


### **Integration with LLM for RAG Chatbot (Using OpenAI)**

In [None]:
def improved_rag_chatbot(query, chunks_df, index, model, github_token=None):
    """
    Enhanced RAG-based chatbot that:
    1. Uses improved context builder to get most relevant chunks
    2. Provides source attribution for answers
    3. Returns both the answer and sources used
    
    Args:
        query: User query
        chunks_df: DataFrame with document chunks
        index: FAISS index
        model: SentenceTransformer model
        github_token: GitHub token (if None, will use environment variable)
        
    Returns:
        Dictionary with keys: 'answer', 'sources'
    """
    import os
    
    # Get GitHub token from environment variable if not provided
    if github_token is None:
        github_token = os.environ.get('GITHUB_TOKEN')
    
    if github_token is None:
        return {
            "answer": "GitHub token not provided. Please set GITHUB_TOKEN environment variable or pass token as parameter.",
            "sources": []
        }
    
    try:
        from azure.ai.inference import ChatCompletionsClient
        from azure.ai.inference.models import SystemMessage, UserMessage
        from azure.core.credentials import AzureKeyCredential
        
        # Build context using improved method
        context, source_docs = build_improved_context(query, chunks_df, index, model)
        
        # Create Azure AI Inference client
        client = ChatCompletionsClient(
            endpoint="https://models.github.ai/inference",
            credential=AzureKeyCredential(github_token),
        )
        
        # Enhanced system prompt that encourages citation
        system_prompt = f"""You are a legal assistant for ABS Company. Answer the user's question based ONLY on the context provided below.
        If the answer is not in the context, say "I don't have enough information to answer this question." Do not make up information.
        
        When answering, refer to specific documents by name when you use information from them.
        
        Context:
        {context}"""
        
        # Call GitHub's LLM API
        response = client.complete(
            messages=[
                SystemMessage(system_prompt),
                UserMessage(query)
            ],
            model="openai/gpt-4o-mini",  # Using GitHub's model
            temperature=0.3,
            max_tokens=500,
            top_p=1
        )
        
        answer = response.choices[0].message.content
        
        # Return both the answer and the sources used
        return {
            "answer": answer,
            "sources": source_docs
        }
    
    except ImportError:
        return {
            "answer": "Azure AI Inference package not installed. Use 'pip install azure-ai-inference' to install it.",
            "sources": []
        }
    except Exception as e:
        return {
            "answer": f"Error using GitHub LLM API: {str(e)}",
            "sources": []
        }

In [None]:
# Set your GitHub token in environment variables
import os
# Use your GitHub personal access token here
os.environ['GITHUB_TOKEN'] = ' '  # GitHub token

def demonstrate_improved_rag():
    test_questions = [
        "What are the intellectual property rights for employee created work?",
        "How do we handle cookie consent for website visitors?",
        "Who owns the intellectual property for work I create during my employment?"
    ]
    
    print("Testing Improved RAG Implementation\n")
    
    for question in test_questions:
        print(f"Question: {question}")
        print("-" * 80)
        
        # Get response from improved RAG chatbot
        response = improved_rag_chatbot(question, chunks_df, faiss_index, model)
        
        # Display the answer
        print(f"Answer: {response['answer']}\n")
        
        # Display source documents used
        print(f"Sources ({len(response['sources'])}):")
        for i, source in enumerate(response['sources'], 1):
            print(f"{i}. {source}")
        
        print("\n" + "-" * 80 + "\n")

# Run the demonstration
demonstrate_improved_rag()

Testing Improved RAG Implementation

Question: What are the intellectual property rights for employee created work?
--------------------------------------------------------------------------------
Answer: According to the "Intellectual Property Rights" document, all inventions, discoveries, developments, improvements, processes, designs, works of authorship, trade secrets, patents, copyrights, trademarks, and any other intellectual property (IP) conceived, created, developed, or reduced to practice by Michael Green during his employment with ABS Company, and within the scope of his work or using ABS Company resources, shall be the exclusive property of ABS Company. 

Michael Green is required to assign all rights to this IP to ABS Company immediately upon its creation. He must also disclose any IP created during his employment that relates to ABS Company’s business and assist the company in securing and enforcing IP rights. However, IP created entirely on his own time, without using AB

### ***