# Azure AI Search with Multiple Text Files

This notebook demonstrates how to:
1. Connect to Azure Storage Account
2. Load data from multiple .txt files in a container
3. Process and chunk text data
4. Create and populate Azure AI Search index
5. Set up knowledge agent for text retrieval

In [None]:
# Install required packages if not already installed
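# (use %pip rather than !pip so the packages are installed into the environment of the running kernel)
# %pip install azure-storage-blob azure-search-documents azure-ai-projects azure-identity python-dotenv requests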

In [1]:
import hashlib
import json
import os
import re
from datetime import datetime, timezone

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SearchField, VectorSearch, VectorSearchProfile,
    HnswAlgorithmConfiguration, AzureOpenAIVectorizer, AzureOpenAIVectorizerParameters,
    SemanticSearch, SemanticConfiguration, SemanticPrioritizedFields, SemanticField,
    KnowledgeAgent, KnowledgeAgentAzureOpenAIModel, KnowledgeAgentTargetIndex, KnowledgeAgentRequestLimits
)
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

In [2]:
# Load environment variables from the .env file.
# override=True lets values from .env replace variables already set in the process.
load_dotenv(override=True)

# Azure Storage Configuration for TXT files
# TXT_* variables take precedence; the generic AZURE_STORAGE_* variables are the fallback.
txt_storage_account_name = os.getenv("TXT_STORAGE_ACCOUNT_NAME", os.getenv("AZURE_STORAGE_ACCOUNT_NAME", "your-storage-account"))
txt_storage_account_key = os.getenv("TXT_STORAGE_ACCOUNT_KEY", os.getenv("AZURE_STORAGE_ACCOUNT_KEY"))
txt_storage_container_name = os.getenv("TXT_STORAGE_CONTAINER_NAME", "txtfiles")  # Different container for txt files
txt_file_prefix = os.getenv("TXT_FILE_PREFIX", "")  # Optional: filter files by prefix

# Text Processing Configuration
chunk_size = int(os.getenv("TEXT_CHUNK_SIZE", "1000"))  # Characters per chunk
chunk_overlap = int(os.getenv("TEXT_CHUNK_OVERLAP", "200"))  # Overlap between chunks

# Fail fast on settings that cannot produce sensible chunks: an overlap that is
# not smaller than the chunk size would keep the chunker from moving forward.
if chunk_size <= 0:
    raise ValueError(f"TEXT_CHUNK_SIZE must be a positive integer, got {chunk_size}")
if not 0 <= chunk_overlap < chunk_size:
    raise ValueError(
        f"TEXT_CHUNK_OVERLAP must be >= 0 and smaller than TEXT_CHUNK_SIZE "
        f"(got overlap={chunk_overlap}, size={chunk_size})"
    )

# Azure Search Configuration
search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
# The Search key lives in AZURE_SEARCH_API_KEY (the name documented for this notebook).
# AZURE_OPENAI_KEY is still honoured as a fallback because earlier versions of this
# cell read the Search key from that variable.
search_api_key = os.getenv("AZURE_SEARCH_API_KEY") or os.getenv("AZURE_OPENAI_KEY")

# Use API key if available, otherwise use managed identity
if search_api_key:
    print("🔑 Using Azure Search API Key authentication")
    search_credential = AzureKeyCredential(search_api_key)
else:
    print("🔐 Using Managed Identity authentication")
    # Optional: client ID of a user-assigned managed identity (None when unset)
    managed_identity_client_id = os.getenv("MANAGED_IDENTITY_CLIENT_ID")
    search_credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

# Azure OpenAI Configuration
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_gpt_deployment = os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT", "gpt-4o")
azure_openai_gpt_model = os.getenv("AZURE_OPENAI_GPT_MODEL", "gpt-4o")
azure_openai_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
azure_openai_embedding_model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")

# General Configuration
index_name = os.getenv("TXT_SEARCH_INDEX", "txt_files_index")
agent_name = os.getenv("TXT_SEARCH_AGENT_NAME", "txt-files-agent")

# Echo the non-secret settings so the run is self-describing (keys are never printed)
print("✅ Configuration loaded successfully")
print(f"Storage Account: {txt_storage_account_name}")
print(f"Container: {txt_storage_container_name}")
print(f"File Prefix: {txt_file_prefix or 'All .txt files'}")
print(f"Search Index: {index_name}")
print(f"Agent Name: {agent_name}")
print(f"Chunk Size: {chunk_size} characters")
print(f"Chunk Overlap: {chunk_overlap} characters")

🔑 Using Azure Search API Key authentication
✅ Configuration loaded successfully
Storage Account: storageagenticaidemo
Container: earthdata
File Prefix: All .txt files
Search Index: txt_files_index
Agent Name: txt-files-agent
Chunk Size: 1000 characters
Chunk Overlap: 200 characters


In [3]:
# Connect to Azure Storage
def connect_to_txt_storage():
    """Build a BlobServiceClient for the configured storage account.

    Uses the account key (via a connection string) when
    ``txt_storage_account_key`` is set; otherwise falls back to
    ``DefaultAzureCredential`` against the account's blob endpoint.

    Returns:
        The BlobServiceClient, or None if constructing it raised an exception
        (the error is printed rather than propagated).
    """
    try:
        if not txt_storage_account_key:
            print("🔐 Connecting with Managed Identity...")
            # NOTE(review): unlike the Search credential, no managed_identity_client_id
            # is passed here — confirm whether a user-assigned identity is needed.
            return BlobServiceClient(
                account_url=f"https://{txt_storage_account_name}.blob.core.windows.net",
                credential=DefaultAzureCredential(),
            )

        print("🔑 Connecting with Account Key...")
        conn_str = f"DefaultEndpointsProtocol=https;AccountName={txt_storage_account_name};AccountKey={txt_storage_account_key};EndpointSuffix=core.windows.net"
        return BlobServiceClient.from_connection_string(conn_str)
    except Exception as err:
        print(f"❌ Failed to connect to storage: {err}")
        return None

# Create the storage client once; later cells reuse txt_blob_service_client
txt_blob_service_client = connect_to_txt_storage()

if not txt_blob_service_client:
    print("❌ Failed to connect to storage. Please check your credentials.")
else:
    print("✅ Connected to Azure Storage successfully")

    # Listing containers is the first call that actually exercises the client,
    # so a failure here is reported as a warning rather than aborting the cell.
    try:
        containers = list(txt_blob_service_client.list_containers())
        container_names = [entry.name for entry in containers]

        print(f"📦 Found {len(containers)} containers:")
        for listed_name in container_names[:5]:  # only the first five are shown
            print(f"   - {listed_name}")

        # Report whether the configured target container is present
        if txt_storage_container_name not in container_names:
            print(f"⚠️  Target container '{txt_storage_container_name}' not found")
            print(f"Available containers: {container_names}")
        else:
            print(f"✅ Target container '{txt_storage_container_name}' found")
    except Exception as err:
        print(f"⚠️  Could not list containers: {err}")

🔑 Connecting with Account Key...
✅ Connected to Azure Storage successfully
📦 Found 2 containers:
   - demo
   - earthdata
✅ Target container 'earthdata' found


In [4]:
# List and load all .txt files from container
def list_txt_files():
    """Collect metadata for the .txt blobs in the configured container.

    A blob is kept when its name ends with ``.txt`` (case-insensitive) and,
    if ``txt_file_prefix`` is non-empty, starts with that prefix.

    Returns:
        A list of dicts with the keys ``name``, ``size`` and ``last_modified``.
        An empty list is returned if anything raises (the error is printed).
    """
    try:
        container_client = txt_blob_service_client.get_container_client(txt_storage_container_name)

        print(f"📄 Scanning container '{txt_storage_container_name}' for .txt files...")

        matches = [
            {
                'name': blob.name,
                'size': blob.size,
                'last_modified': blob.last_modified
            }
            for blob in container_client.list_blobs()
            if blob.name.lower().endswith('.txt')
            and (not txt_file_prefix or blob.name.startswith(txt_file_prefix))
        ]

        print(f"✅ Found {len(matches)} .txt files")

        # Preview at most ten entries, then summarise whatever is left
        for entry in matches[:10]:
            print(f"   📄 {entry['name']} ({entry['size'] / 1024:.1f} KB)")

        hidden_count = len(matches) - 10
        if hidden_count > 0:
            print(f"   ... and {hidden_count} more files")

        return matches

    except Exception as err:
        print(f"❌ Error listing .txt files: {err}")
        return []

# Module-level list of matching blobs (dicts with name/size/last_modified);
# load_and_process_txt_files() and the search tests read this variable.
txt_files = list_txt_files()

📄 Scanning container 'earthdata' for .txt files...
✅ Found 5 .txt files
   📄 Earth_At_Night_Overview.txt (1.2 KB)
   📄 Earth_Night_Ecosystems.txt (0.9 KB)
   📄 Global_Landscape_Earth_At_Night.txt (3.0 KB)
   📄 Human_Activity_At_Night.txt (0.8 KB)
   📄 Night_Imagery_Disaster_Monitoring.txt (0.9 KB)


In [5]:
# Text chunking functions
def clean_text(text):
    """Normalize line endings and whitespace in ``text``.

    Steps, applied in order: CRLF and lone CR become LF; any run of
    whitespace between two newlines collapses to a single blank line;
    runs of spaces/tabs collapse to one space; finally leading and
    trailing whitespace is stripped.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'\r\n', '\n'),        # Windows line endings
        (r'\r', '\n'),          # remaining bare carriage returns
        (r'\n\s*\n', '\n\n'),   # normalize paragraph breaks
        (r'[ \t]+', ' '),       # normalize spaces
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split ``text`` into overlapping chunks, preferring sentence boundaries.

    Each chunk nominally spans ``chunk_size`` characters. When a chunk is not
    the last one, its end is moved to the last sentence ending (``.``, ``!`` or
    ``?`` followed by whitespace) found in the window from 100 characters
    before to 100 characters after the nominal end; if there is none, the end
    is pushed forward to the next space. Consecutive chunks share ``overlap``
    characters.

    Args:
        text: The text to split.
        chunk_size: Nominal chunk length in characters; must be positive.
        overlap: Number of characters repeated between consecutive chunks.

    Returns:
        A list of stripped, non-empty chunks. Text that already fits into one
        chunk is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < text_length:
        end = start + chunk_size

        # If this is not the last chunk, try to break at sentence or word boundary
        if end < text_length:
            window_start = max(0, end - 100)
            window = text[window_start:end + 100]
            sentence_ends = [m.end() for m in re.finditer(r'[.!?]\s+', window)]

            if sentence_ends:
                # Use the last sentence ending within the search window
                end = window_start + sentence_ends[-1]
            else:
                # Fall back to word boundary
                while end < text_length and text[end] != ' ':
                    end += 1

        # With small chunk sizes the search window can reach back before
        # `start`; a boundary found there would give an empty or negative
        # slice, so fall back to a hard cut at the nominal end.
        if end <= start:
            end = min(start + chunk_size, text_length)

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Once the end of the text has been emitted, stop: stepping back by
        # `overlap` would only produce suffixes of the chunk just added.
        if end >= text_length:
            break

        # Step back by `overlap`, but always move forward; otherwise a large
        # overlap (or an early sentence boundary) would loop forever.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks

def generate_chunk_id(filename, chunk_index, chunk_text):
    """Build the document key for one chunk.

    The key has the form ``<name>_chunk_<index>_<hash>`` where ``<name>`` is
    ``filename`` with every ``.txt`` occurrence removed and every character
    outside ``[a-zA-Z0-9_-]`` replaced by ``_``, ``<index>`` is
    ``chunk_index`` zero-padded to four digits, and ``<hash>`` is the first
    eight hex digits of the MD5 of the chunk content.

    Note: the ``chunk_text`` parameter shadows the module-level function of
    the same name inside this function only.
    """
    # Strip the extension, then replace anything outside the allowed key characters
    base_name = filename.replace('.txt', '')
    safe_name = re.sub(r'[^a-zA-Z0-9_-]', '_', base_name)

    # A short content digest keeps keys distinct when sanitized names collide
    digest = hashlib.md5(chunk_text.encode()).hexdigest()
    short_digest = digest[:8]

    return f"{safe_name}_chunk_{chunk_index:04d}_{short_digest}"

# Confirms that this cell ran and clean_text / chunk_text / generate_chunk_id are defined
print("✅ Text processing functions loaded")

✅ Text processing functions loaded


In [7]:
# Load and process all text files
def load_and_process_txt_files():
    """Download every blob listed in ``txt_files`` and turn it into index documents.

    Each file is downloaded, decoded as UTF-8 (undecodable bytes are dropped),
    cleaned with ``clean_text`` and split with ``chunk_text`` using the
    configured ``chunk_size`` / ``chunk_overlap``. One document dict is
    produced per chunk. A file that raises is reported and skipped; the
    remaining files are still processed.

    Returns:
        A list of document dicts with the keys ``id``, ``content``,
        ``source_file``, ``chunk_index``, ``file_size``, ``file_modified``,
        ``processed_date``, ``chunk_length`` and ``file_type``. Empty when
        ``txt_files`` is empty.
    """
    if not txt_files:
        print("❌ No .txt files found to process")
        return []

    all_documents = []
    failed_files = []
    total_files = len(txt_files)

    # One timestamp for the whole run. It must carry a UTC offset because the
    # index declares processed_date as Edm.DateTimeOffset; a naive
    # datetime.now().isoformat() has no offset.
    processed_date = datetime.now(timezone.utc).isoformat()

    print(f"📥 Processing {total_files} text files...")

    for i, file_info in enumerate(txt_files, 1):
        filename = file_info['name']
        print(f"\n📄 Processing file {i}/{total_files}: {filename}")

        try:
            # Download file content
            blob_client = txt_blob_service_client.get_blob_client(
                container=txt_storage_container_name,
                blob=filename
            )
            blob_data = blob_client.download_blob()
            raw_text = blob_data.readall().decode('utf-8', errors='ignore')

            # Clean the text
            cleaned_text = clean_text(raw_text)
            print(f"   📏 Text length: {len(cleaned_text):,} characters")

            # Split into chunks
            chunks = chunk_text(cleaned_text, chunk_size, chunk_overlap)
            print(f"   ✂️  Created {len(chunks)} chunks")

            last_modified = file_info['last_modified']
            file_modified = last_modified.isoformat() if last_modified else None

            # Build all documents for the file first so a failure part-way
            # through never leaves a partially indexed file in the result.
            file_documents = [
                {
                    'id': generate_chunk_id(filename, chunk_idx, chunk_content),
                    'content': chunk_content,
                    'source_file': filename,
                    'chunk_index': chunk_idx,
                    'file_size': file_info['size'],
                    'file_modified': file_modified,
                    'processed_date': processed_date,
                    'chunk_length': len(chunk_content),
                    'file_type': 'text'
                }
                for chunk_idx, chunk_content in enumerate(chunks)
            ]
            all_documents.extend(file_documents)

        except Exception as e:
            failed_files.append(filename)
            print(f"   ❌ Error processing {filename}: {e}")
            continue

    # Report the number of files that actually succeeded, not the number attempted
    succeeded_files = total_files - len(failed_files)
    print(f"\n✅ Successfully processed {succeeded_files} files into {len(all_documents)} searchable documents")
    if failed_files:
        print(f"⚠️  {len(failed_files)} of {total_files} files failed and were skipped: {failed_files}")

    # Summary statistics
    if all_documents:
        total_chars = sum(doc['chunk_length'] for doc in all_documents)
        avg_chunk_size = total_chars / len(all_documents)
        unique_files = len(set(doc['source_file'] for doc in all_documents))

        print(f"\n📊 Processing Summary:")
        print(f"   📄 Files processed: {unique_files}")
        print(f"   📋 Total chunks: {len(all_documents)}")
        print(f"   📏 Total characters: {total_chars:,}")
        print(f"   📐 Average chunk size: {avg_chunk_size:.0f} characters")

        # Show sample document
        print(f"\n📝 Sample document:")
        sample_doc = all_documents[0]
        print(f"   ID: {sample_doc['id']}")
        print(f"   Source: {sample_doc['source_file']}")
        print(f"   Content preview: {sample_doc['content'][:200]}...")

    return all_documents

# Module-level list of chunk documents; the upload, statistics and
# search-test cells read this variable.
documents = load_and_process_txt_files()

📥 Processing 5 text files...

📄 Processing file 1/5: Earth_At_Night_Overview.txt
   📏 Text length: 1,203 characters
   ✂️  Created 2 chunks

📄 Processing file 2/5: Earth_Night_Ecosystems.txt
   📏 Text length: 883 characters
   ✂️  Created 1 chunks

📄 Processing file 3/5: Global_Landscape_Earth_At_Night.txt
   📏 Text length: 3,013 characters
   ✂️  Created 4 chunks

📄 Processing file 4/5: Human_Activity_At_Night.txt
   📏 Text length: 781 characters
   ✂️  Created 1 chunks

📄 Processing file 5/5: Night_Imagery_Disaster_Monitoring.txt
   📏 Text length: 884 characters
   ✂️  Created 1 chunks

✅ Successfully processed 5 files into 9 searchable documents

📊 Processing Summary:
   📄 Files processed: 5
   📋 Total chunks: 9
   📏 Total characters: 7,557
   📐 Average chunk size: 840 characters

📝 Sample document:
   ID: Earth_At_Night_Overview_chunk_0000_54014950
   Source: Earth_At_Night_Overview.txt
   Content preview: Title: Earth at Night - A Scientific Overview

The phenomenon of Earth at ni

In [8]:
# Create Azure AI Search Index for text documents
def create_txt_search_index():
    """Create or update the search index named by ``index_name``.

    The index holds the chunk text, a vector field wired to an Azure OpenAI
    vectorizer, per-chunk metadata fields and one semantic configuration.

    Returns:
        True when the create-or-update call succeeds, False if any exception
        is raised (the error is printed).
    """
    try:
        print(f"🔍 Creating search index: {index_name}")

        # Required key field
        key_field = SearchField(name="id", type="Edm.String", key=True, filterable=True, sortable=True)

        # Main content field
        content_field = SearchField(name="content", type="Edm.String", searchable=True, filterable=False)

        # Vector embedding field
        # NOTE(review): the documents built in this notebook do not set
        # content_vector — confirm whether embeddings are populated elsewhere.
        vector_field = SearchField(
            name="content_vector",
            type="Collection(Edm.Single)",
            stored=False,
            vector_search_dimensions=3072,
            vector_search_profile_name="hnsw_text_3_large",
        )

        # Metadata fields
        metadata_fields = [
            SearchField(name="source_file", type="Edm.String", filterable=True, sortable=True, facetable=True),
            SearchField(name="chunk_index", type="Edm.Int32", filterable=True, sortable=True),
            SearchField(name="file_size", type="Edm.Int64", filterable=True, sortable=True),
            SearchField(name="file_modified", type="Edm.DateTimeOffset", filterable=True, sortable=True),
            SearchField(name="processed_date", type="Edm.DateTimeOffset", filterable=True, sortable=True),
            SearchField(name="chunk_length", type="Edm.Int32", filterable=True, sortable=True),
            SearchField(name="file_type", type="Edm.String", filterable=True, facetable=True),
        ]

        # Vector search: one HNSW algorithm, one profile, one Azure OpenAI vectorizer
        vector_search = VectorSearch(
            profiles=[
                VectorSearchProfile(
                    name="hnsw_text_3_large",
                    algorithm_configuration_name="alg",
                    vectorizer_name="azure_openai_text_3_large",
                )
            ],
            algorithms=[HnswAlgorithmConfiguration(name="alg")],
            vectorizers=[
                AzureOpenAIVectorizer(
                    vectorizer_name="azure_openai_text_3_large",
                    parameters=AzureOpenAIVectorizerParameters(
                        resource_url=azure_openai_endpoint,
                        deployment_name=azure_openai_embedding_deployment,
                        model_name=azure_openai_embedding_model,
                    ),
                )
            ],
        )

        # Semantic ranking: chunk text as content, source file name as keywords
        semantic_search = SemanticSearch(
            default_configuration_name="semantic_config",
            configurations=[
                SemanticConfiguration(
                    name="semantic_config",
                    prioritized_fields=SemanticPrioritizedFields(
                        content_fields=[SemanticField(field_name="content")],
                        keywords_fields=[SemanticField(field_name="source_file")],
                    ),
                )
            ],
        )

        index = SearchIndex(
            name=index_name,
            fields=[key_field, content_field, vector_field, *metadata_fields],
            vector_search=vector_search,
            semantic_search=semantic_search,
        )

        index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
        index_client.create_or_update_index(index)
        print(f"✅ Index '{index_name}' created or updated successfully")
        return True

    except Exception as err:
        print(f"❌ Failed to create index: {err}")
        return False

# Boolean result of index creation; upload_txt_documents_to_index() refuses
# to run when this is False.
index_created = create_txt_search_index()

🔍 Creating search index: txt_files_index
✅ Index 'txt_files_index' created or updated successfully


In [9]:
# Upload documents to the search index
def upload_txt_documents_to_index():
    """Upload the processed ``documents`` to the search index.

    Documents are handed to a ``SearchIndexingBufferedSender`` in slices of
    100. The sender is given an ``on_error`` callback so documents the
    service rejects are counted; without it the sender reports nothing and
    the upload would look successful even if no document was indexed.

    Returns:
        True only when every document was accepted. False when there is
        nothing to upload, the index was not created, any document failed,
        or an exception was raised (the error is printed).
    """
    if not documents:
        print("❌ No documents to upload")
        return False

    if not index_created:
        print("❌ Index not created, cannot upload documents")
        return False

    # Filled by the sender's on_error callback with each failed index action
    failed_actions = []

    try:
        print(f"📤 Uploading {len(documents)} documents to index...")

        # Process documents in batches for better performance
        batch_size = 100
        total_batches = (len(documents) + batch_size - 1) // batch_size

        with SearchIndexingBufferedSender(
            endpoint=search_endpoint,
            index_name=index_name,
            credential=search_credential,
            auto_flush_interval=30,  # Auto-flush every 30 seconds
            on_error=failed_actions.append
        ) as client:
            for i in range(0, len(documents), batch_size):
                batch = documents[i:i + batch_size]
                batch_num = (i // batch_size) + 1

                print(f"   📦 Processing batch {batch_num}/{total_batches} ({len(batch)} documents)")
                client.upload_documents(documents=batch)

        # Leaving the `with` block flushes the buffer, so the failure list is
        # complete at this point.
        if failed_actions:
            print(f"❌ {len(failed_actions)} document upload(s) were rejected by index '{index_name}'")
            for action in failed_actions[:5]:
                # presumably the uploaded document is exposed via
                # additional_properties — verify against the SDK version in use
                properties = getattr(action, "additional_properties", None) or {}
                print(f"   - {properties.get('id', '<unknown id>')}")
            if len(failed_actions) > 5:
                print(f"   ... and {len(failed_actions) - 5} more")
            return False

        print(f"✅ All {len(documents)} documents uploaded to index '{index_name}' successfully")
        return True

    except Exception as e:
        print(f"❌ Failed to upload documents: {e}")
        return False

# Boolean result of the upload; the agent-creation and search-test cells
# check this flag before running.
documents_uploaded = upload_txt_documents_to_index()

📤 Uploading 9 documents to index...
   📦 Processing batch 1/1 (9 documents)
✅ All 9 documents uploaded to index 'txt_files_index' successfully


In [10]:
# Create Knowledge Agent for text files
def create_txt_knowledge_agent():
    """Create or update the knowledge agent named by ``agent_name``.

    The agent uses the configured Azure OpenAI GPT deployment and targets the
    index named by ``index_name``. Nothing is created unless the upload cell
    set ``documents_uploaded`` to a truthy value.

    Returns:
        True when the create-or-update call succeeds; False when documents
        were not uploaded or any exception is raised (the error is printed).
    """
    if not documents_uploaded:
        print("❌ Documents not uploaded, cannot create agent")
        return False

    try:
        print(f"🤖 Creating knowledge agent: {agent_name}")

        # Chat model the agent calls
        gpt_model = KnowledgeAgentAzureOpenAIModel(
            azure_open_ai_parameters=AzureOpenAIVectorizerParameters(
                resource_url=azure_openai_endpoint,
                deployment_name=azure_openai_gpt_deployment,
                model_name=azure_openai_gpt_model,
            )
        )

        # Index the agent retrieves from
        target_index = KnowledgeAgentTargetIndex(
            index_name=index_name,
            default_reranker_threshold=2.0,  # Slightly lower threshold for text similarity
        )

        output_limits = KnowledgeAgentRequestLimits(
            max_output_size=15000,  # Larger output for text processing
        )

        agent = KnowledgeAgent(
            name=agent_name,
            models=[gpt_model],
            target_indexes=[target_index],
            request_limits=output_limits,
        )

        index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
        index_client.create_or_update_agent(agent)
        print(f"✅ Knowledge agent '{agent_name}' created or updated successfully")
        return True

    except Exception as err:
        print(f"❌ Failed to create knowledge agent: {err}")
        return False

# Boolean result of agent creation (True on success, False otherwise)
agent_created = create_txt_knowledge_agent()

🤖 Creating knowledge agent: txt-files-agent
✅ Knowledge agent 'txt-files-agent' created or updated successfully


In [11]:
# Test the setup with multiple search scenarios
def test_txt_search_functionality():
    """Run three smoke-test queries against the index and print the results.

    1. A plain full-text query.
    2. A match-all query filtered to the first file in ``txt_files``
       (only when more than one file was found).
    3. A facet-only query that counts chunks per source file.

    Returns:
        True when all queries complete, False if any exception is raised
        (the error is printed).
    """
    def _format_score(hit):
        # '@search.score' may be absent; only apply the float format to numbers
        score = hit.get('@search.score')
        return f"{score:.3f}" if isinstance(score, (int, float)) else "N/A"

    def _preview(hit, length):
        # Treat a missing or null content field as 'N/A' before slicing
        return (hit.get('content') or 'N/A')[:length]

    try:
        print("🔍 Testing text search functionality...")

        search_client = SearchClient(
            endpoint=search_endpoint,
            index_name=index_name,
            credential=search_credential
        )

        # Test 1: Simple text search
        print("\n🔎 Test 1: Simple text search")
        results = search_client.search("important information", top=3)

        print(f"\n🎯 Search Results for 'important information':")
        for i, result in enumerate(results, 1):
            print(f"\nResult {i}:")
            print(f"   File: {result.get('source_file', 'N/A')}")
            print(f"   Chunk: {result.get('chunk_index', 'N/A')}")
            print(f"   Content: {_preview(result, 150)}...")
            print(f"   Score: {_format_score(result)}")

        # Test 2: Search with file filter
        if len(txt_files) > 1:
            print("\n🔎 Test 2: Search with file filter")
            first_file = txt_files[0]['name']
            # OData string literals escape a single quote by doubling it; without
            # this a blob name containing ' would produce an invalid filter.
            escaped_file = first_file.replace("'", "''")
            results = search_client.search(
                "*",
                top=2,
                filter=f"source_file eq '{escaped_file}'"
            )

            print(f"\n📄 Results from file '{first_file}':")
            for i, result in enumerate(results, 1):
                print(f"\nResult {i}:")
                print(f"   Chunk: {result.get('chunk_index', 'N/A')}")
                print(f"   Length: {result.get('chunk_length', 'N/A')} chars")
                print(f"   Content: {_preview(result, 100)}...")

        # Test 3: Chunk counts per source file via facets
        print("\n🔎 Test 3: Document statistics")
        results = search_client.search(
            "*",
            top=0,  # We only want facets
            facets=["source_file", "file_type"]
        )

        # Guard against a response that carries no facet data
        facets = results.get_facets() or {}
        if 'source_file' in facets:
            print("\n📊 Documents per file:")
            for facet in facets['source_file'][:5]:  # Top 5 files
                print(f"   📄 {facet['value']}: {facet['count']} chunks")

        return True

    except Exception as e:
        print(f"❌ Search test failed: {e}")
        return False

# Run the search smoke tests only when the upload cell reported success;
# otherwise there is nothing in the index to query.
if documents_uploaded:
    test_txt_search_functionality()
else:
    print("⏩ Skipping search test - documents not uploaded")

🔍 Testing text search functionality...

🔎 Test 1: Simple text search

🎯 Search Results for 'important information':

🔎 Test 2: Search with file filter

📄 Results from file 'Earth_At_Night_Overview.txt':

🔎 Test 3: Document statistics

📊 Documents per file:


## 🎯 Summary

This notebook demonstrates how to:

1. **Connect to Azure Storage** and scan for .txt files
2. **Load and process multiple text files** from a container
3. **Intelligently chunk text** with proper boundaries
4. **Create Azure AI Search index** optimized for text content
5. **Upload processed text chunks** as searchable documents
6. **Create a knowledge agent** for text retrieval
7. **Test search functionality** with various filters

## 📝 Required Environment Variables

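Add these to your `.env` file. `AZURE_SEARCH_ENDPOINT` and `AZURE_OPENAI_ENDPOINT` must be set (the configuration cell reads them with `os.environ[...]` and raises `KeyError` if they are missing); every other setting the notebook reads is looked up with `os.getenv` and falls back to a default, or to managed-identity authentication when no key is provided: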

```env
# Text Files Storage (can reuse existing storage account)
TXT_STORAGE_ACCOUNT_NAME=your-storage-account
TXT_STORAGE_ACCOUNT_KEY=your-storage-key
TXT_STORAGE_CONTAINER_NAME=txtfiles
TXT_FILE_PREFIX=documents/  # Optional: filter files by prefix

# Text Processing
TEXT_CHUNK_SIZE=1000
TEXT_CHUNK_OVERLAP=200

# Search Configuration
TXT_SEARCH_INDEX=txt_files_index
TXT_SEARCH_AGENT_NAME=txt-files-agent

# Azure Services (existing)
AZURE_SEARCH_ENDPOINT=your-search-endpoint
AZURE_SEARCH_API_KEY=your-search-key
AZURE_OPENAI_ENDPOINT=your-openai-endpoint
```

## 🚀 Features

- **Smart Text Chunking**: Breaks at sentence boundaries when possible
- **Metadata Tracking**: Tracks source file, chunk index, and processing dates
- **Batch Processing**: Efficiently handles large numbers of files
- **Error Handling**: Continues processing even if individual files fail
- **Search Optimization**: Vector and semantic search for best results
- **Filtering Support**: Search within specific files or date ranges

In [12]:
# Utility functions for file management
def get_txt_file_statistics():
    """Print chunk and character counts for each source file in ``documents``.

    Prints the overall file and chunk totals followed by a per-file breakdown
    sorted by file name. Prints a notice and returns early when ``documents``
    is empty. Returns None.
    """
    if not documents:
        print("❌ No documents processed yet")
        return

    # Aggregate per source file; size and processing date come from the
    # first chunk seen for that file.
    per_file = {}
    for record in documents:
        source = record['source_file']
        entry = per_file.get(source)
        if entry is None:
            entry = per_file[source] = {
                'chunks': 0,
                'total_chars': 0,
                'file_size': record['file_size'],
                'processed_date': record['processed_date']
            }
        entry['chunks'] += 1
        entry['total_chars'] += record['chunk_length']

    print("📊 File Processing Statistics:")
    print(f"   Total files: {len(per_file)}")
    print(f"   Total chunks: {len(documents)}")

    print("\n📄 Per-file breakdown:")
    for source in sorted(per_file):
        entry = per_file[source]
        mean_chunk = entry['total_chars'] / entry['chunks']
        print(f"   {source}:")
        print(f"     📋 Chunks: {entry['chunks']}")
        print(f"     📏 Total chars: {entry['total_chars']:,}")
        print(f"     📐 Avg chunk: {mean_chunk:.0f} chars")
        print(f"     💾 File size: {entry['file_size']:,} bytes")

def search_specific_file(filename, query, top=3):
    """Search the index for ``query`` within the chunks of a single file.

    Args:
        filename: Value of the ``source_file`` field to restrict the search to.
        query: Search text passed to the index.
        top: Maximum number of results to print (default 3).

    Prints each hit's chunk index, score and a 200-character content preview.
    Any exception is printed rather than raised. Returns None.
    """
    try:
        search_client = SearchClient(
            endpoint=search_endpoint,
            index_name=index_name,
            credential=search_credential
        )

        # OData string literals escape a single quote by doubling it; without
        # this a file name containing ' would produce an invalid filter.
        escaped_filename = filename.replace("'", "''")
        results = search_client.search(
            query,
            top=top,
            filter=f"source_file eq '{escaped_filename}'"
        )

        print(f"🔍 Search results for '{query}' in {filename}:")
        for i, result in enumerate(results, 1):
            # Only apply the float format when a numeric score is present;
            # formatting the 'N/A' placeholder with :.3f would raise.
            score = result.get('@search.score')
            score_text = f"{score:.3f}" if isinstance(score, (int, float)) else "N/A"
            # Treat a missing or null content field as 'N/A' before slicing
            content_preview = (result.get('content') or 'N/A')[:200]

            print(f"\nResult {i}:")
            print(f"   Chunk {result.get('chunk_index', 'N/A')}")
            print(f"   Score: {score_text}")
            print(f"   Content: {content_preview}...")

    except Exception as e:
        print(f"❌ Search failed: {e}")

# Print per-file statistics only when the processing cell produced documents
if documents:
    get_txt_file_statistics()

📊 File Processing Statistics:
   Total files: 5
   Total chunks: 9

📄 Per-file breakdown:
   Earth_At_Night_Overview.txt:
     📋 Chunks: 2
     📏 Total chars: 1,401
     📐 Avg chunk: 700 chars
     💾 File size: 1,210 bytes
   Earth_Night_Ecosystems.txt:
     📋 Chunks: 1
     📏 Total chars: 883
     📐 Avg chunk: 883 chars
     💾 File size: 888 bytes
   Global_Landscape_Earth_At_Night.txt:
     📋 Chunks: 4
     📏 Total chars: 3,608
     📐 Avg chunk: 902 chars
     💾 File size: 3,023 bytes
   Human_Activity_At_Night.txt:
     📋 Chunks: 1
     📏 Total chars: 781
     📐 Avg chunk: 781 chars
     💾 File size: 782 bytes
   Night_Imagery_Disaster_Monitoring.txt:
     📋 Chunks: 1
     📏 Total chars: 884
     📐 Avg chunk: 884 chars
     💾 File size: 885 bytes
