## Environment Setup

In [None]:
# Environment Setup and Path Configuration
import sys
from pathlib import Path
import json
import requests

print(f"Python Version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
print(f"Environment: {sys.executable}")

# Find project root and add to Python path.
# The root is the nearest directory containing compose.yml, searching the
# current working directory first and then each of its parents. This covers
# running from the project root and from notebooks/opensearch, and also from
# any other sub-directory of the project.
current_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (current_dir, *current_dir.parents)
        if (candidate / "compose.yml").exists()
    ),
    None,
)

if project_root is None:
    print("Missing compose.yml - check directory")
    # sys.exit instead of the site-provided exit(): the latter is an
    # interactive-shell helper, and a non-zero status marks the failure.
    sys.exit(1)

print(f"Project root: {project_root}")
project_root_entry = str(project_root)
# Keep the project root first on sys.path without piling up duplicate
# entries each time this cell is re-run.
if sys.path[:1] != [project_root_entry]:
    sys.path.insert(0, project_root_entry)

## 1. Infrastructure Verification

In [None]:
# Service Health Verification
# Sends one GET request per service endpoint and reports each outcome;
# all_healthy ends up True only if every endpoint answered with HTTP 200.
print("PREREQUISITE CHECK")
print("=" * 50)

services_to_test = {
    "FastAPI": "http://localhost:8000/api/v1/health",
    "PostgreSQL (via API)": "http://localhost:8000/api/v1/health",
    "OpenSearch": "http://localhost:9200/_cluster/health",
    "Airflow": "http://localhost:8080/health",
}

# Names of the services that did not report healthy.
unhealthy_services = []

for service_name, url in services_to_test.items():
    # problem stays None when the service is healthy, otherwise it holds
    # the short reason shown next to the service name.
    try:
        status_code = requests.get(url, timeout=5).status_code
    except requests.exceptions.ConnectionError:
        problem = "Not accessible"
    except Exception as exc:
        problem = type(exc).__name__
    else:
        problem = None if status_code == 200 else f"HTTP {status_code}"

    if problem is None:
        print(f"✓ {service_name}: Healthy")
    else:
        print(f"✗ {service_name}: {problem}")
        unhealthy_services.append(service_name)

all_healthy = not unhealthy_services

print()
print(
    "All services healthy! Ready for OpenSearch integration."
    if all_healthy
    else "Some services need attention. Please run: docker compose up --build"
)

## 2. OpenSearch Client Setup

In [None]:
# OpenSearch Client Setup
from src.services.opensearch.factory import make_opensearch_client
from opensearchpy import OpenSearch

print("OPENSEARCH CLIENT SETUP")
print("=" * 40)

# Build the project's OpenSearch wrapper through its factory.
opensearch_client = make_opensearch_client()

# Override for notebook execution (localhost instead of container hostname):
# both the recorded host and the underlying opensearch-py client are replaced.
notebook_host = "http://localhost:9200"
local_connection_options = {
    "hosts": [notebook_host],
    "http_compress": True,
    "use_ssl": False,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
}
opensearch_client.host = notebook_host
opensearch_client.client = OpenSearch(**local_connection_options)

print(f"Client configured with host: {opensearch_client.host}")
print(f"Index name: {opensearch_client.index_name}")

# Run the wrapper's health check against the overridden client.
is_healthy = opensearch_client.health_check()
print(
    "✓ OpenSearch health check: PASSED"
    if is_healthy
    else "✗ OpenSearch health check: FAILED"
)

## 3. Index Configuration

In [None]:
# Display Index Configuration
from src.services.opensearch.index_config import ARXIV_PAPERS_INDEX, ARXIV_PAPERS_MAPPING

print("INDEX CONFIGURATION")
print("=" * 40)
print(f"Index Name: {ARXIV_PAPERS_INDEX}")

print("\nKey Features:")
for feature in (
    "Custom text analyzers for better search",
    "Multi-field mapping (text + keyword)",
    "10 specialized fields for papers",
):
    print(f"• {feature}")

print("\nField Types:")

# One line per mapped field: its type, plus the analyzer in brackets when
# the field declares one.
properties = ARXIV_PAPERS_MAPPING["mappings"]["properties"]
for field_name, config in properties.items():
    analyzer = config.get("analyzer", "")
    analyzer_suffix = f" [{analyzer}]" if analyzer else ""
    print(f"  • {field_name}: {config.get('type')}{analyzer_suffix}")

### Create Index

In [None]:
# Create Index if it doesn't exist
# When the index is already present its statistics are printed instead.
print("INDEX CREATION")
print("=" * 40)

try:
    target_index = opensearch_client.index_name
    index_exists = opensearch_client.client.indices.exists(index=target_index)

    if not index_exists:
        print(f"Creating new index: {target_index}")

        # create_index() reports success through its truthy/falsy return value.
        success = opensearch_client.create_index()
        print("✓ Index created successfully!" if success else "✗ Index creation failed")
    else:
        print(f"✓ Index '{target_index}' already exists")

        # Statistics are shown only when the call returned something
        # without an 'error' key.
        stats = opensearch_client.get_index_stats()
        if stats and 'error' not in stats:
            print("\nCurrent Statistics:")
            print(f"   Documents: {stats.get('document_count', 0)}")
            print(f"   Size: {stats.get('size_in_bytes', 0):,} bytes")

except Exception as e:
    # Any failure while checking or creating the index is reported, not raised.
    print(f"✗ Error with index management: {e}")

In [None]:
# Verify Data Pipeline Results
# Reports how many documents the index holds and lists up to three of them.
print("VERIFYING DATA PIPELINE")
print("=" * 40)

stats = opensearch_client.get_index_stats()

if not stats or 'error' in stats:
    print("✗ Could not retrieve index stats")
else:
    doc_count = stats.get('document_count', 0)

    if not doc_count > 0:
        print("⚠️  No documents in OpenSearch yet")
        print("\nPlease run the Airflow DAG first (see instructions above)")
    else:
        print(f"✓ Success! Found {doc_count} documents in OpenSearch")

        # Wildcard search, limited to three results, to show sample papers.
        sample = opensearch_client.search_papers("*", size=3)
        if sample.get('hits'):
            print("\nSample papers:")
            for position, paper in enumerate(sample['hits'], start=1):
                short_title = paper.get('title', 'Unknown')[:60]
                print(f"  {position}. {short_title}...")

## 4. Simple BM25 Search

Let's start with a simple search to demonstrate BM25 scoring:

In [None]:
# Simple BM25 Search
# Runs one search through the client wrapper and prints title, score and
# arXiv ID for up to five hits.
print("SIMPLE BM25 SEARCH")
print("=" * 40)

# Change this to any word from your papers
search_term = "learning"  # Try different terms!

print(f"Searching for: '{search_term}'\n")

results = opensearch_client.search_papers(query=search_term, size=5)

if not results.get('hits'):
    print("No results found. Try searching for:")
    for suggestion in ("'neural', 'model', 'algorithm'", "Use '*' to see all papers"):
        print(f"  • {suggestion}")
else:
    print(f"Found {results.get('total', 0)} total matches\n")

    for rank, paper in enumerate(results['hits'], start=1):
        print(f"{rank}. {paper.get('title', 'Unknown')[:70]}...")
        print(f"   Score: {paper.get('score', 0):.2f}")
        print(f"   arXiv ID: {paper.get('arxiv_id', 'N/A')}\n")

## 5. Advanced OpenSearch Queries

### 5.1 Match Query

Search for terms within a single field:

In [None]:
# Match Query - Search in title field
# Sends a raw query body through the underlying client and prints the
# titles of up to three hits.
print("MATCH QUERY - Single Field Search")
print("=" * 40)

title_clause = {"match": {"title": "machine learning"}}
query = {"query": title_clause, "size": 3}

response = opensearch_client.client.search(
    body=query,
    index=opensearch_client.index_name,
)

total_matches = response['hits']['total']['value']
print(f"Found {total_matches} results\n")

for document in response['hits']['hits']:
    print(f"Title: {document['_source']['title'][:70]}...")

### 5.2 Multi-Match Query

Search across multiple fields simultaneously:

In [None]:
# Multi-Match Query - Search across multiple fields
# Prints title, score and the first two authors of up to three hits.
print("MULTI-MATCH QUERY - Search Multiple Fields")
print("=" * 40)

multi_field_clause = {
    "query": "AI Agents",
    "fields": ["title^2", "abstract", "authors"],  # ^2 boosts title field
    "type": "best_fields",
}
query = {"query": {"multi_match": multi_field_clause}, "size": 3}

response = opensearch_client.client.search(
    body=query,
    index=opensearch_client.index_name,
)

total_matches = response['hits']['total']['value']
print(f"Found {total_matches} results\n")

for document in response['hits']['hits']:
    source = document['_source']
    print(f"Title: {source['title'][:70]}...")
    print(f"Score: {document['_score']:.2f}")
    print(f"Authors: {', '.join(source['authors'][:2])}...\n")

### 5.3 Boosting Query

Boost certain results while demoting others:

In [None]:
# Boosting Query - Promote and demote results
# Matches abstracts on the positive term and scales down the score of hits
# that also match the negative term; prints up to three hits.
print("BOOSTING QUERY - Promote/Demote Results")
print("=" * 40)

# Both terms are named once so the query body and the printed summary
# cannot drift apart (the summary previously named a different term than
# the one the query actually demotes).
positive_term = "deep learning"
negative_term = "multimodal"

query = {
    "query": {
        "boosting": {
            "positive": {
                "match": {
                    "abstract": positive_term
                }
            },
            "negative": {
                "match": {
                    "abstract": negative_term
                }
            },
            "negative_boost": 0.1  # Reduce score of negative matches
        }
    },
    "size": 3
}

response = opensearch_client.client.search(
    index=opensearch_client.index_name,
    body=query
)

print(f"Query: Boost '{positive_term}', demote '{negative_term}' papers\n")
print(f"Found {response['hits']['total']['value']} results\n")

for hit in response['hits']['hits']:
    title = hit['_source']['title'][:70]
    abstract_snippet = hit['_source']['abstract'][:100]
    print(f"Title: {title}...")
    print(f"Score: {hit['_score']:.2f}")
    print(f"Abstract: {abstract_snippet}...\n")

### 5.4 Filter Query

Filter results by specific criteria (doesn't affect scoring):

In [None]:
# Filter Query - Filter by categories
# Scores on the abstract match and restricts results with a terms filter on
# categories; prints up to three hits.
print("FILTER QUERY - Category Filtering")
print("=" * 40)

abstract_clause = {"match": {"abstract": "neural"}}
category_filter = {"terms": {"categories": ["cs.AI"]}}

query = {
    "query": {
        "bool": {
            "must": [abstract_clause],
            "filter": [category_filter],
        }
    },
    "size": 3,
}

response = opensearch_client.client.search(
    body=query,
    index=opensearch_client.index_name,
)

total_matches = response['hits']['total']['value']
print(f"Found {total_matches} results\n")

for document in response['hits']['hits']:
    source = document['_source']
    title, categories = source['title'][:70], ', '.join(source['categories'])
    print(f"Title: {title}...")
    print(f"Categories: {categories}")
    print(f"Score: {document['_score']:.2f}\n")

### 5.5 Sorting Query

Sort results by different criteria:

In [None]:
# Sorting Query - Sort by publication date
# Fetches up to five documents ordered by published_date, descending.
print("SORTING QUERY - Latest Papers First")
print("=" * 40)

newest_first = [{"published_date": {"order": "desc"}}]  # Latest first

query = {
    "query": {"match_all": {}},  # Get all papers
    "sort": newest_first,
    "size": 5,
}

response = opensearch_client.client.search(
    body=query,
    index=opensearch_client.index_name,
)

print("Query: All papers sorted by publication date (newest first)\n")

for document in response['hits']['hits']:
    source = document['_source']
    title = source['title'][:70]
    # Keep only the first ten characters of the stored date value.
    pub_date = source['published_date'][:10]
    print(f"Date: {pub_date} | {title}...")

### 5.6 Combined Query

Combine multiple query types for complex searches:

In [None]:
# Combined Query - Complex search with multiple criteria
# Combines a boosted multi-field match, a date range filter and a category
# preference, sorted by score and then by date; prints up to three hits.
print("COMBINED QUERY - Complex Search")
print("=" * 40)

# Lower bound of the date filter. Named once so the range clause and the
# printed summary stay in sync; "gte" makes the bound inclusive, which the
# summary now states ("on or after" rather than "after").
min_published_date = "2024-01-01"

query = {
    "query": {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": "transformer",
                        "fields": ["title^3", "abstract"],
                        "type": "best_fields"
                    }
                }
            ],
            "filter": [
                {
                    "range": {
                        "published_date": {
                            "gte": min_published_date
                        }
                    }
                }
            ],
            "should": [
                {
                    "match": {
                        "categories": "cs.AI"
                    }
                }
            ]
        }
    },
    "sort": [
        "_score",
        {"published_date": {"order": "desc"}}
    ],
    "size": 3
}

response = opensearch_client.client.search(
    index=opensearch_client.index_name,
    body=query
)

print("Complex Query:")
print("  • Must contain 'transformer' (title boosted 3x)")
print(f"  • Filter: published on or after {min_published_date}")
print("  • Prefer: cs.AI category")
print("  • Sort: by relevance, then date\n")

print(f"Found {response['hits']['total']['value']} results\n")

for hit in response['hits']['hits']:
    title = hit['_source']['title'][:70]
    pub_date = hit['_source']['published_date'][:10]
    score = hit['_score']
    # Show at most the first two categories of each paper.
    categories = ', '.join(hit['_source']['categories'][:2])

    print(f"Title: {title}...")
    print(f"  Date: {pub_date} | Score: {score:.2f}")
    print(f"  Categories: {categories}\n")