# Implementing Document Upload for Your Search Engine in a New Notebook



This notebook provides a complete implementation that does the following:
1. Imports your existing search engine
2. Adds functionality to upload and process new DOCX files
3. Updates the index with new documents
4. Provides search capabilities on the expanded document set





In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import faiss
import re
import docx
import time
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pickle
import logging
import json

# Logging setup: INFO level, "<timestamp> - <level> - <message>" records,
# and one named logger shared by every function in this notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('document_upload_system')

# Add the path to your original implementation if needed
# sys.path.append('../path/to/original/implementation')

# Step 1: Load the existing search engine
def load_search_engine(input_dir='../optimized_search_engine'):
    """
    Load a complete search engine from disk.

    Reads ``model_name.txt``, ``chunks_df.pkl``, the optional
    ``embeddings.npy`` and ``optimized_index.faiss`` from ``input_dir``.

    Args:
        input_dir: Directory containing the search engine

    Returns:
        Tuple ``(engine, engine_components)``: the initialized
        SemanticSearchEngine instance and a dict with the keys ``chunks_df``,
        ``index``, ``model``, ``model_name`` and ``engine_dir``.
        ``(None, None)`` is returned on every failure (missing directory or
        any error while loading), so callers can always unpack two values.
    """
    if not os.path.exists(input_dir):
        logger.error(f"Directory not found: {input_dir}")
        # Must be a pair: callers unpack the result into two names.
        return None, None

    try:
        # Load model name and initialize
        with open(os.path.join(input_dir, 'model_name.txt'), 'r') as f:
            model_name = f.read().strip()

        model = SentenceTransformer(model_name)

        # Load chunks
        # NOTE(review): read_pickle executes arbitrary code embedded in the
        # file - only point input_dir at directories you trust.
        chunks_path = os.path.join(input_dir, 'chunks_df.pkl')
        chunks_df = pd.read_pickle(chunks_path)

        # Load embeddings if they exist
        embeddings_path = os.path.join(input_dir, 'embeddings.npy')
        if os.path.exists(embeddings_path):
            embeddings = np.load(embeddings_path)
            # Add embeddings back to DataFrame (one row of the matrix per chunk)
            chunks_df['embedding'] = list(embeddings)

        # Load FAISS index
        index_path = os.path.join(input_dir, 'optimized_index.faiss')
        index = faiss.read_index(index_path)

        if index.ntotal != len(chunks_df):
            # Not fatal, but search results could then point at the wrong rows.
            logger.warning(
                f"Index holds {index.ntotal} vectors but chunks_df has {len(chunks_df)} rows"
            )

        # Creating a dictionary with all components
        engine_components = {
            'chunks_df': chunks_df,
            'index': index,
            'model': model,
            'model_name': model_name,
            'engine_dir': input_dir
        }

        # Import SemanticSearchEngine class
        # We'll recreate a simplified version for this example
        class SemanticSearchEngine:
            def __init__(self, chunks_df=None, index=None, model=None):
                self.chunks_df = chunks_df
                self.index = index
                self.model = model
                self.metadata = {}

            def search(self, query, method=None, params=None, categories=None, auto_select=True):
                logger.info(f"Searching for: {query}")
                # This is a placeholder - actual implementation would use the existing code
                # In a real implementation, you'd import the search functionality

                # Here we'd call: return search_documents(query, self.index, self.chunks_df, self.model)
                return pd.DataFrame({"message": ["Search functionality would be implemented here"]})

        # Create search engine
        engine = SemanticSearchEngine(
            chunks_df=chunks_df,
            index=index,
            model=model
        )

        logger.info(f"Search engine loaded with {len(chunks_df)} chunks and {index.ntotal} vectors")
        return engine, engine_components

    except Exception as e:
        # Top-level boundary: report the problem and signal failure to the caller.
        logger.error(f"Error loading search engine: {e}")
        return None, None

# Step 2: Document processing functions
def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a .docx file joined by newlines.

    Any exception raised while opening or reading the document is logged and
    an empty string is returned instead.
    """
    try:
        document = docx.Document(file_path)
        # Keep only paragraphs that contain something besides whitespace.
        kept = [paragraph.text for paragraph in document.paragraphs if paragraph.text.strip()]
        return '\n'.join(kept)
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return ""

def preprocess_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Line breaks count as whitespace, so the result is a single line.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Step 3: Document chunking (semantic chunking from original implementation)
def semantic_chunking(doc_df, min_chunk_size=100, max_chunk_size=300, chunk_id_start=None):
    """
    Split documents into chunks based on semantic boundaries like paragraphs and sections.

    Section boundaries are located with a set of header regexes (every pattern
    anchors on a newline, so text without line breaks yields a single segment
    per document). Segments with fewer than ``min_chunk_size`` words are merged
    into the following segment; the last segment of a document is always
    emitted. Segments with more than ``max_chunk_size`` words are split into
    consecutive windows of ``max_chunk_size`` words. Chunk ids are assigned
    sequentially across all documents in ``doc_df``.

    Args:
        doc_df: DataFrame with document data; the columns ``id``, ``name``,
            ``category`` and ``text`` are read from every row
        min_chunk_size: Minimum words per chunk
        max_chunk_size: Maximum words per chunk
        chunk_id_start: First chunk id to assign. When None, numbering
            continues after the largest ``chunk_id`` of the module-level
            ``existing_chunks_df`` if that global is set and non-empty,
            otherwise it starts at 0

    Returns:
        DataFrame with document chunks
    """
    if chunk_id_start is None:
        chunk_id_start = 0
        # Legacy hand-off: the current chunk table may be published through a
        # module-level global so new ids do not collide with existing ones.
        existing = globals().get('existing_chunks_df')
        if existing is not None and len(existing) > 0:
            chunk_id_start = existing['chunk_id'].max() + 1

    # Patterns for section boundaries
    section_patterns = [
        r'\n#{1,3}\s+.+\n',  # Markdown headers
        r'\n\d+\.\s+[A-Z]',  # Numbered sections starting with capital letter
        r'\n[A-Z][A-Z\s]+\n',  # All caps section titles
        r'\n[A-Z][a-z]+\s+\d+[.:]\s+',  # Article X: style headers
        r'\n\s*SECTION\s+\d+',  # SECTION X style headers
    ]
    compiled_patterns = [re.compile(pattern) for pattern in section_patterns]

    def make_record(cid, doc, chunk_text, method, start, end, position, score):
        # One output row; key order defines the column order of the result.
        return {
            'chunk_id': cid,
            'doc_id': doc['id'],
            'doc_name': doc['name'],
            'category': doc['category'],
            'text': chunk_text,
            'chunk_method': method,
            'start_idx': start,
            'end_idx': end,
            'document_position': position,
            'position_score': score
        }

    chunks = []
    # Initialised once so ids keep increasing from one document to the next.
    chunk_id = chunk_id_start

    for _, doc in tqdm(doc_df.iterrows(), total=len(doc_df), desc="Creating semantic chunks"):
        text = doc['text']

        # Collect all potential section boundaries plus start/end of document
        boundary_set = {0, len(text)}
        for pattern in compiled_patterns:
            boundary_set.update(match.start() for match in pattern.finditer(text))
        boundaries = sorted(boundary_set)

        # Number of raw segments, used for position scoring
        total_chunks = len(boundaries) - 1

        # Start offset of a too-small segment waiting to be merged forward
        pending_start = None

        for i in range(total_chunks):
            seg_start = boundaries[i] if pending_start is None else pending_start
            seg_end = boundaries[i + 1]
            chunk_text = text[seg_start:seg_end].strip()

            # Skip empty chunks (cannot happen while a merge is pending,
            # because the pending part is non-empty)
            if not chunk_text:
                continue

            words = chunk_text.split()
            is_last_segment = i == total_chunks - 1
            if len(words) < min_chunk_size and not is_last_segment:
                # Too small: carry it over and emit it together with the next
                # segment instead of dropping the text.
                pending_start = seg_start
                continue
            pending_start = None

            # Determine position information
            if i < total_chunks / 3:
                position = "beginning"
                position_score = 0.9  # Prefer beginning chunks slightly
            elif i > 2 * total_chunks / 3:
                position = "end"
                position_score = 0.7
            else:
                position = "middle"
                position_score = 0.8

            if len(words) <= max_chunk_size:
                # Add as a single chunk
                chunks.append(make_record(
                    chunk_id, doc, chunk_text, 'semantic',
                    seg_start, seg_end, position, position_score
                ))
                chunk_id += 1
                continue

            # Too large: split into windows of max_chunk_size words
            sub_chunks = [' '.join(words[j:j + max_chunk_size])
                          for j in range(0, len(words), max_chunk_size)]
            last_sub_idx = len(sub_chunks) - 1

            for sub_idx, sub_chunk in enumerate(sub_chunks):
                # Adjust position for subchunks
                if sub_idx == 0:
                    sub_position = position
                    sub_position_score = position_score
                elif sub_idx == last_sub_idx:
                    sub_position = position
                    sub_position_score = position_score * 0.9  # Slightly lower
                else:
                    sub_position = "middle"
                    sub_position_score = position_score * 0.8  # Lower for middle subchunks

                # start/end offsets refer to the whole segment, not the window
                chunks.append(make_record(
                    chunk_id, doc, sub_chunk, 'semantic_split',
                    seg_start, seg_end, sub_position, sub_position_score
                ))
                chunk_id += 1

    # Create DataFrame of chunks
    new_chunks_df = pd.DataFrame(chunks)
    logger.info(f"Created {len(new_chunks_df)} semantic chunks from {len(doc_df)} documents")
    return new_chunks_df

# Step 4: Function to upload and process new documents
def upload_documents(file_paths, categories, engine_components):
    """
    Upload, process and index new documents.

    Each readable file is converted to text, chunked, embedded with the
    engine's model, L2-normalized and appended to the FAISS index; the new
    chunk rows are appended to the chunk table. ``engine_components`` is
    updated in place and also returned.

    Args:
        file_paths: List of paths to new document files
        categories: List of categories for each document
        engine_components: Dictionary with engine components; the keys
            ``chunks_df``, ``index`` and ``model`` are read

    Returns:
        Updated engine components. The dictionary is returned unchanged when
        the two lists differ in length, when no file yields any text, or when
        chunking produces no chunks.
    """
    # Check that file_paths and categories have same length
    if len(file_paths) != len(categories):
        logger.error("The number of file paths must match the number of categories")
        return engine_components

    # Extract components
    chunks_df = engine_components['chunks_df']
    index = engine_components['index']
    model = engine_components['model']

    # Get highest doc_id to avoid duplicates
    next_doc_id = chunks_df['doc_id'].max() + 1 if len(chunks_df) > 0 else 0

    # Process new documents
    new_docs = []
    for file_path, category in zip(file_paths, categories):
        if not os.path.exists(file_path):
            logger.warning(f"File not found: {file_path}")
            continue

        # Extract document name from file path
        doc_name = os.path.splitext(os.path.basename(file_path))[0].replace('_', ' ')

        # Extract and preprocess text
        # NOTE(review): preprocess_text collapses line breaks, and the section
        # patterns used by semantic_chunking all require a newline, so each
        # document currently reaches the chunker as a single segment. Confirm
        # whether that is intended before changing the preprocessing.
        text = preprocess_text(extract_text_from_docx(file_path))

        if not text:
            logger.warning(f"No text content found in {file_path}")
            continue

        # Add document to list
        new_docs.append({
            'id': next_doc_id,
            'name': doc_name,
            'category': category,
            'text': text,
            'file_path': file_path
        })
        next_doc_id += 1

    # Create DataFrame for new documents
    new_docs_df = pd.DataFrame(new_docs)
    logger.info(f"Processed {len(new_docs_df)} new documents")

    if len(new_docs_df) == 0:
        logger.warning("No valid documents to add")
        return engine_components

    # Create chunks for new documents. The chunker picks up the existing
    # chunk table through this module-level global to continue the id range.
    global existing_chunks_df
    existing_chunks_df = chunks_df
    new_chunks_df = semantic_chunking(new_docs_df)

    # An empty result has no 'text' column; stop before indexing anything.
    if len(new_chunks_df) == 0:
        logger.warning("No chunks were produced from the new documents")
        return engine_components

    # Generate embeddings for new chunks
    logger.info("Generating embeddings for new chunks...")
    new_texts = new_chunks_df['text'].tolist()

    # Batch processing for embeddings
    batch_size = 32
    new_embeddings = np.zeros((len(new_texts), model.get_sentence_embedding_dimension()), dtype=np.float32)

    for i in tqdm(range(0, len(new_texts), batch_size), desc="Batch encoding"):
        batch_texts = new_texts[i:i+batch_size]
        batch_embeddings = model.encode(batch_texts, show_progress_bar=False)
        new_embeddings[i:i+len(batch_texts)] = batch_embeddings

    # Add embeddings to DataFrame (unnormalized, one vector per chunk row)
    new_chunks_df['embedding'] = list(new_embeddings)

    # Combine new chunks with existing chunks first, so a failure here leaves
    # the index untouched. ignore_index=True renumbers the rows 0..n-1 so row
    # labels stay unique and equal to row positions.
    # NOTE(review): assumes row position corresponds to the FAISS vector id.
    updated_chunks_df = pd.concat([chunks_df, new_chunks_df], ignore_index=True)

    # Update the index with L2-normalized copies of the new embeddings
    normalized_embeddings = new_embeddings.copy()
    faiss.normalize_L2(normalized_embeddings)

    # Add vectors to index
    index.add(normalized_embeddings)
    logger.info(f"Added {len(new_embeddings)} vectors to the index")
    logger.info(f"Updated chunks DataFrame now has {len(updated_chunks_df)} chunks")

    # Update engine components
    engine_components['chunks_df'] = updated_chunks_df
    engine_components['index'] = index

    return engine_components

# Step 5: Function to save the updated search engine
def save_updated_engine(engine_components, output_dir=None):
    """
    Save the updated search engine to disk.

    Writes ``optimized_index.faiss``, ``chunks_df.pkl`` (without the
    ``embedding`` column), ``model_name.txt`` and
    ``search_engine_config.json``. ``embeddings.npy`` is written only when
    every chunk row carries an embedding vector.

    Args:
        engine_components: Dictionary with engine components; the keys
            ``chunks_df``, ``index`` and ``model_name`` are required
        output_dir: Directory to save the search engine (defaults to original location)

    Returns:
        The directory the engine was written to
    """
    if output_dir is None:
        output_dir = engine_components.get('engine_dir', '../updated_search_engine')

    os.makedirs(output_dir, exist_ok=True)

    # Extract components
    chunks_df = engine_components['chunks_df']
    index = engine_components['index']
    model_name = engine_components['model_name']

    # Build the embedding matrix BEFORE writing anything, so a malformed
    # embedding column cannot leave a half-written engine on disk.
    embeddings = None
    save_df = chunks_df
    if 'embedding' in chunks_df.columns:
        vectors = chunks_df['embedding'].tolist()
        if vectors and all(np.ndim(vec) == 1 for vec in vectors):
            embeddings = np.vstack(vectors)
        else:
            # E.g. rows without a stored vector hold NaN after a concat.
            logger.warning("Some chunks have no embedding; embeddings.npy was not written")
        # Remove embeddings from DataFrame to save space
        save_df = chunks_df.drop(columns='embedding')

    # Save FAISS index
    index_path = os.path.join(output_dir, 'optimized_index.faiss')
    faiss.write_index(index, index_path)

    # Save embeddings separately
    if embeddings is not None:
        embeddings_path = os.path.join(output_dir, 'embeddings.npy')
        np.save(embeddings_path, embeddings)

    # Save chunks DataFrame (without embeddings)
    chunks_path = os.path.join(output_dir, 'chunks_df.pkl')
    save_df.to_pickle(chunks_path)

    # Save model name
    with open(os.path.join(output_dir, 'model_name.txt'), 'w') as f:
        f.write(model_name)

    # Save configuration
    config = {
        'saved_date': time.strftime('%Y-%m-%d %H:%M:%S'),
        'chunk_count': len(chunks_df),
        'document_count': len(chunks_df['doc_id'].unique()),
        'categories': chunks_df['category'].unique().tolist(),
        'index_size': int(index.ntotal),
        'embedding_dim': int(embeddings.shape[1]) if embeddings is not None else None,
    }

    with open(os.path.join(output_dir, 'search_engine_config.json'), 'w') as f:
        json.dump(config, f, indent=2)

    logger.info(f"Updated search engine saved to {output_dir}")
    return output_dir

# Step 6: Create a SemanticSearchEngine instance with document upload capabilities
class UpgradedSearchEngine:
    """
    Enhanced search engine with document upload capabilities.

    Wraps the engine returned by ``load_search_engine`` and keeps the matching
    component dictionary in ``self.components``. Both attributes are None when
    loading failed; every public method checks for that state.
    """

    def __init__(self, engine_path='../optimized_search_engine'):
        """
        Initialize by loading an existing search engine.

        Args:
            engine_path: Path to the existing search engine
        """
        self.engine, self.components = self._load(engine_path)

        if self.engine is None:
            logger.error("Failed to load search engine")
        else:
            logger.info("Upgraded search engine initialized successfully")

    @staticmethod
    def _load(engine_path):
        """Load an engine and always return an ``(engine, components)`` pair."""
        loaded = load_search_engine(engine_path)
        # Tolerate a loader that signals failure with a bare None.
        if loaded is None:
            return None, None
        return loaded

    def _chunks(self):
        """Return the chunk DataFrame, or None when no engine is loaded."""
        if self.engine is None or self.components is None:
            return None
        return self.components.get('chunks_df')

    def _document_count(self):
        """Return the number of distinct doc_id values currently stored."""
        chunks_df = self._chunks()
        if chunks_df is None or 'doc_id' not in chunks_df.columns:
            return 0
        return int(chunks_df['doc_id'].nunique())

    def search(self, query, method=None, params=None, categories=None, auto_select=True):
        """
        Search using the underlying search engine.

        All arguments are forwarded unchanged to the wrapped engine's
        ``search``. An empty DataFrame is returned when no engine is loaded.
        """
        if self.engine is None:
            logger.error("Search engine not initialized")
            return pd.DataFrame()

        return self.engine.search(
            query=query,
            method=method,
            params=params,
            categories=categories,
            auto_select=auto_select
        )

    def upload_new_documents(self, file_paths, categories):
        """
        Upload and process new documents.

        Args:
            file_paths: List of paths to document files
            categories: List of categories for the documents

        Returns:
            True if at least one document was added and the saved engine was
            reloaded, False otherwise
        """
        if self.engine is None or self.components is None:
            logger.error("Search engine not initialized")
            return False

        try:
            # Counted up front because the components are updated in place.
            docs_before = self._document_count()

            # Process and index new documents
            updated_components = upload_documents(file_paths, categories, self.components)
            self.components = updated_components

            added = self._document_count() - docs_before
            if added <= 0:
                # Nothing changed, so there is nothing to save or reload.
                logger.warning("No new documents were added to the search engine")
                return False

            # Save updated engine
            save_path = save_updated_engine(updated_components)

            # Reload the engine with new documents
            self.engine, self.components = self._load(save_path)
            if self.engine is None:
                logger.error("Search engine could not be reloaded after the upload")
                return False

            logger.info(f"Successfully added {added} documents to the search engine")
            return True

        except Exception as e:
            logger.error(f"Error uploading documents: {e}")
            return False

    def get_available_categories(self):
        """Get the sorted list of available document categories."""
        chunks_df = self._chunks()
        # An empty chunk table may have no columns at all.
        if chunks_df is None or 'category' not in chunks_df.columns:
            return []

        return sorted(chunks_df['category'].unique().tolist())

    def get_document_stats(self):
        """
        Get statistics about the documents in the search engine.

        Returns:
            An empty dict when no engine is loaded; otherwise a dict with the
            keys ``total_documents``, ``total_chunks``, ``categories``,
            ``documents_per_category``, ``chunks_per_category`` and
            ``index_size``.
        """
        chunks_df = self._chunks()
        if chunks_df is None:
            return {}

        if {'category', 'doc_id'}.issubset(chunks_df.columns):
            # Count documents per category
            doc_counts = chunks_df.groupby('category')['doc_id'].nunique().to_dict()
            # Count chunks per category
            chunk_counts = chunks_df['category'].value_counts().to_dict()
            total_documents = chunks_df['doc_id'].nunique()
        else:
            # Chunk table without the expected columns: report zeros.
            doc_counts, chunk_counts, total_documents = {}, {}, 0

        index = self.components.get('index')

        # Total stats
        stats = {
            'total_documents': total_documents,
            'total_chunks': len(chunks_df),
            'categories': self.get_available_categories(),
            'documents_per_category': doc_counts,
            'chunks_per_category': chunk_counts,
            'index_size': index.ntotal if index is not None else 0
        }

        return stats



## 2. Example Usage in a New Notebook

Here's how you would use this implementation in a new notebook:



In [None]:
# Import the upgraded search engine
from document_upload_system import UpgradedSearchEngine

# Initialize the search engine
search_engine = UpgradedSearchEngine('../optimized_search_engine')

# Display available categories
print("Current document categories:")
print(search_engine.get_available_categories())

# View statistics about the current documents.
# get_document_stats() returns an empty dict when the engine failed to load,
# so only index into the result when it is populated.
stats = search_engine.get_document_stats()
if stats:
    print(f"\nTotal documents: {stats['total_documents']}")
    print(f"Total chunks: {stats['total_chunks']}")
    print("\nDocuments per category:")
    for category, count in stats['documents_per_category'].items():
        print(f"  {category}: {count}")
else:
    print("\nNo statistics available - the search engine did not load")

# Upload new documents (placeholder paths - replace with real .docx files)
new_files = [
    'C:/path/to/New_Employment_Contract.docx',
    'C:/path/to/Updated_Privacy_Policy.docx',
    'C:/path/to/Vendor_Agreement_2025.docx'
]

# One category per file, in the same order as new_files
new_categories = [
    'Employment Contracts',
    'Privacy Policies',
    'Commercial Agreements'
]

success = search_engine.upload_new_documents(new_files, new_categories)
if success:
    print("\nDocuments successfully added!")

    # View updated statistics
    updated_stats = search_engine.get_document_stats()
    if updated_stats:
        print(f"\nUpdated total documents: {updated_stats['total_documents']}")
        print(f"Updated total chunks: {updated_stats['total_chunks']}")

    # Search including the new documents
    query = "What are the new privacy requirements for customer data?"
    results = search_engine.search(query, auto_select=True)

    print(f"\nSearch results for '{query}':")
    # display() is provided by the notebook environment
    display(results)
else:
    print("Failed to add documents")



## 3. Additional Features You Can Implement

1. **Document Versioning**: Track document versions when updated versions of the same document are uploaded.

2. **Scheduled Indexing**: Set up a background process that checks a designated folder and automatically indexes new documents.

3. **Document Validation**: Add validation checks to ensure documents meet specific criteria before being added to the index.

4. **Incremental Updates**: Optimize the process to only update what's necessary when adding new documents.

5. **Web Interface**: Create a simple web interface using Streamlit or Gradio for drag-and-drop document uploading.



