# 🔍 Testeur de Recherche Sémantique

Ce notebook permet de tester la recherche sémantique sur les embeddings de films.

**Objectif:** Debugger pourquoi la recherche sémantique ne retourne pas de résultats.

## 1. Setup et Configuration

In [None]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from pathlib import Path
from dotenv import load_dotenv

# Load variables from a .env file into the process environment (python-dotenv)
# so that os.getenv() can read OPENAI_API_KEY afterwards.
load_dotenv()

# Configuration
# NOTE: PROJECT_ROOT is an absolute, machine-specific location; adjust it when
# running the notebook on another machine.
PROJECT_ROOT = Path("C:/Users/Vincent/GitHub/Vincent-20-100/Agentic_Systems_Project_Vlamy")
CHROMA_PATH = str(PROJECT_ROOT / "data" / "vector_database")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

print(f"üìÇ ChromaDB Path: {CHROMA_PATH}")
print(f"üîë API Key found: {bool(OPENAI_API_KEY)}")

# os.getenv() returns None when the variable is unset, and slicing None raises
# TypeError. Only build the masked preview (first 10 + last 4 characters) when
# a key is actually present, so this diagnostic cell never crashes.
if OPENAI_API_KEY:
    masked_key = f"{OPENAI_API_KEY[:10]}...{OPENAI_API_KEY[-4:]}"
else:
    masked_key = "None"
print(f"üîë API Key: {masked_key}")

## 2. Connexion à ChromaDB

In [None]:
# Open the persistent ChromaDB store at CHROMA_PATH and load the movie
# collection with an OpenAI embedding function. Any failure is printed and
# then re-raised so the cell stops.
print("üîå Connexion √† ChromaDB...")

try:
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    print("‚úÖ Client ChromaDB cr√©√©")

    # Embedding function handed to the collection below
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small", api_key=OPENAI_API_KEY
    )
    print("‚úÖ Fonction d'embedding OpenAI cr√©√©e")

    # Fetch the collection (it is created if it does not exist yet)
    collection = client.get_or_create_collection(
        embedding_function=openai_ef, name="movie_descriptions"
    )
    print("‚úÖ Collection 'movie_descriptions' charg√©e")
except Exception as err:
    print(f"‚ùå Erreur: {err}")
    raise

## 3. Statistiques de la Collection

In [None]:
# Report how many documents the collection holds and preview the first few
print("\n" + "=" * 60)
print("üìä STATISTIQUES DE LA COLLECTION")
print("=" * 60)

count = collection.count()
print(f"\nüì¶ Nombre total de documents: {count}")

if count != 0:
    print(f"‚úÖ Collection contient {count} films")

    # Preview up to three stored items
    print("\nüîç Aper√ßu des 3 premiers documents:")
    peek = collection.peek(limit=3)

    for i, doc_id in enumerate(peek['ids']):
        print(f"\n   [{i+1}] ID: {doc_id}")
        # ids, metadatas and documents are indexed in parallel
        meta = peek['metadatas'][i]
        print(f"       Title: {meta.get('title', 'N/A')}")
        print(f"       Database: {meta.get('database', 'N/A')}")
        print(f"       Table: {meta.get('table', 'N/A')}")
        # Only the first 100 characters of the description are shown
        print(f"       Description: {peek['documents'][i][:100]}...")
else:
    print("\n‚ö†Ô∏è ALERTE: La collection est VIDE!")
    print("   Vous devez d'abord embedder vos films avec embedding_manager.py")

## 4. Fonction de Test de Query

In [None]:
def test_query(query_text: str, n_results: int = 5, where_filter: dict = None):
    """
    Run one semantic query against the module-level ``collection`` and print
    a report of the matches.

    Args:
        query_text: Free-text query passed to ``collection.query``.
        n_results: Maximum number of matches requested.
        where_filter: Optional metadata filter forwarded as ``where``.

    Returns:
        The raw result mapping returned by ``collection.query``, or ``None``
        when nothing matched or when the query raised (the error and its
        traceback are printed in that case).
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"üîç QUERY: '{query_text}'")
    print(rule)

    if where_filter:
        print(f"üéØ Filtre: {where_filter}")

    try:
        results = collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=where_filter,
        )

        # A single query text was sent, so its matches are in the first sub-list
        id_batches = results['ids']
        if not id_batches or len(id_batches[0]) == 0:
            for line in (
                "\n‚ùå AUCUN R√âSULTAT TROUV√â",
                "\nPossibles causes:",
                "  1. La collection est vide",
                "  2. Le filtre est trop restrictif",
                "  3. Pas de films correspondants",
            ):
                print(line)
            return None

        matched_ids = id_batches[0]
        print(f"\n‚úÖ {len(matched_ids)} r√©sultats trouv√©s\n")

        has_distances = 'distances' in results
        for rank, matched_id in enumerate(matched_ids, start=1):
            pos = rank - 1

            if has_distances:
                distance = results['distances'][0][pos]
            else:
                distance = None
            # Displayed score is (1 - distance) expressed as a percentage
            similarity = None if distance is None else (1 - distance) * 100

            print(f"\n{'‚îÄ' * 80}")
            print(f"üé¨ R√âSULTAT #{rank}")
            if similarity is not None:
                print(f"üìä Similarit√©: {similarity:.1f}% (distance: {distance:.4f})")
            print(f"üÜî ID: {matched_id}")

            meta = results['metadatas'][0][pos]
            print(f"üìΩÔ∏è Titre: {meta.get('title', 'N/A')}")
            print(f"üíæ Database: {meta.get('database', 'N/A')}")
            print(f"üìä Table: {meta.get('table', 'N/A')}")
            print("\nüìù Description:")
            print(f"   {results['documents'][0][pos]}")

        print(f"\n{rule}\n")

        return results

    except Exception as err:
        # Debug helper: report the failure with its traceback instead of raising
        print(f"\n‚ùå ERREUR lors de la query: {err}")
        import traceback
        traceback.print_exc()
        return None

print("‚úÖ Fonction test_query() d√©finie")

## 5. Tests de Queries Simples

In [None]:
# Test 1: very simple, generic query
test_query("action movie", n_results=3)

In [None]:
# Test 2: space-themed query (as in your example)
test_query("space action adventure science fiction", n_results=5)

In [None]:
# Test 3: descriptive, sentence-style query
test_query("A detective investigating a murder mystery", n_results=5)

In [None]:
# Test 4: romance-themed query
test_query("romantic love story", n_results=5)

## 6. Tests avec Filtres

In [None]:
# Test with a metadata filter on one specific table
# (the filter dict is forwarded as the `where` argument of collection.query)
test_query(
    "action movie",
    n_results=5,
    where_filter={"table": "netflix_titles"}
)

## 7. Vérification des Métadonnées

In [None]:
# Inspect the metadata of a small sample of documents
print("\n" + "=" * 60)
print("üîç V√âRIFICATION DES M√âTADONN√âES")
print("=" * 60)

sample = collection.get(limit=10)

print(f"\nüìä √âchantillon de {len(sample['ids'])} documents:\n")

# Distinct 'table' / 'database' values found in the sampled metadata
tables = {meta['table'] for meta in sample['metadatas'] if 'table' in meta}
databases = {meta['database'] for meta in sample['metadatas'] if 'database' in meta}

print(f"üìã Tables pr√©sentes: {sorted(tables)}")
print(f"üíæ Databases pr√©sentes: {sorted(databases)}")

# Dump the first document's metadata so its full structure is visible
print("\nüîç Exemple de m√©tadonn√©es (premier document):\n")
if sample['metadatas']:
    first_metadata = sample['metadatas'][0]
    print(json.dumps(first_metadata, indent=2))

## 8. Test de Query Personnalisée

In [None]:
# Try your own queries here
custom_query = "horror movie haunted house"  # Edit this query
test_query(custom_query, n_results=5)

## 9. Test de la Fonction du Tool (comme dans albert_v7)

In [None]:
# Reproduce exactly the logic of the semantic_search tool
def semantic_search_tool(query: str, n_results: int = 5, table_filter: str = None) -> str:
    """Run a semantic search and return the matches as a JSON string.

    Per the original author's note this is an exact replica of the tool in
    albert_v7.py, so the code is deliberately left unchanged.

    Args:
        query: Free-text query passed to ``collection.query``.
        n_results: Maximum number of matches requested.
        table_filter: When truthy, forwarded to the query as the ``where``
            filter ``{"table": table_filter}``.

    Returns:
        A JSON array of objects with the keys ``id``, ``title``,
        ``description``, ``database``, ``table`` and ``similarity_score``
        (an empty array when nothing matched), or a JSON object
        ``{"error": ...}`` when any exception was raised.
    """
    try:
        # Open its own client and collection rather than reusing the
        # notebook-level ones; the storage directory is created if missing.
        os.makedirs(CHROMA_PATH, exist_ok=True)
        client = chromadb.PersistentClient(path=CHROMA_PATH)

        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            api_key=OPENAI_API_KEY,
            model_name="text-embedding-3-small"
        )

        collection = client.get_or_create_collection(
            name="movie_descriptions",
            embedding_function=openai_ef
        )

        # Build the metadata filter only when a table was requested
        where_filter = None
        if table_filter:
            where_filter = {"table": table_filter}

        # Query collection
        results = collection.query(
            query_texts=[query],
            n_results=n_results,
            where=where_filter
        )

        # Flatten the parallel result lists into one dict per match;
        # index [0] selects the results of the single query text sent.
        formatted_results = []
        if results['ids'] and len(results['ids'][0]) > 0:
            for i in range(len(results['ids'][0])):
                formatted_results.append({
                    "id": results['ids'][0][i],
                    "title": results['metadatas'][0][i].get('title', 'Unknown'),
                    "description": results['documents'][0][i],
                    "database": results['metadatas'][0][i].get('database', 'unknown'),
                    "table": results['metadatas'][0][i].get('table', 'unknown'),
                    # Score is (1 - distance); None when no 'distances' key is returned
                    "similarity_score": 1 - results['distances'][0][i] if 'distances' in results else None
                })

        # default=str stringifies any value json cannot serialize natively
        return json.dumps(formatted_results, indent=2, default=str)

    except Exception as e:
        # Errors are returned as JSON instead of being raised
        return json.dumps({"error": f"Semantic search error: {str(e)}"})

# Exercise the tool replica and summarize what it returned
print("\n" + "="*60)
print("üß™ TEST DU TOOL SEMANTIC_SEARCH (comme dans albert_v7)")
print("="*60)

result_json = semantic_search_tool("space action adventure", n_results=5)
result = json.loads(result_json)

if isinstance(result, list) and len(result) > 0:
    print(f"\n‚úÖ Tool retourne {len(result)} r√©sultats\n")
    for i, movie in enumerate(result, 1):
        # The tool emits similarity_score=None when no distances are returned;
        # formatting None with ':.2%' raises TypeError, so fall back to "N/A".
        score = movie['similarity_score']
        score_text = f"{score:.2%}" if score is not None else "N/A"
        print(f"{i}. {movie['title']} (similarity: {score_text})")
elif isinstance(result, dict) and 'error' in result:
    # The tool reports failures as a JSON object with an 'error' key
    print(f"\n‚ùå ERREUR: {result['error']}")
else:
    print(f"\n‚ùå Aucun r√©sultat")

print(f"\nüìã JSON complet:\n{result_json}")

## 10. Diagnostic Complet

In [None]:
# End-to-end diagnostic: collection size, storage path, API key and a sample
# query, followed by a conclusion that takes all four checks into account.
print("\n" + "="*80)
print("üî¨ DIAGNOSTIC COMPLET")
print("="*80)

# 1. Collection stats
count = collection.count()
print(f"\n1Ô∏è‚É£ Collection Stats:")
print(f"   - Nombre de documents: {count}")
print(f"   - Collection vide: {count == 0}")

# 2. ChromaDB path
print(f"\n2Ô∏è‚É£ Paths:")
print(f"   - ChromaDB: {CHROMA_PATH}")
print(f"   - Exists: {os.path.exists(CHROMA_PATH)}")
# os.listdir() fails on a regular file, so require a directory before listing
if os.path.isdir(CHROMA_PATH):
    files = os.listdir(CHROMA_PATH)
    print(f"   - Files in directory: {len(files)}")

# 3. API Key
print(f"\n3Ô∏è‚É£ OpenAI API:")
print(f"   - API Key pr√©sente: {bool(OPENAI_API_KEY)}")
print(f"   - API Key length: {len(OPENAI_API_KEY) if OPENAI_API_KEY else 0}")

# 4. Sample query
print(f"\n4Ô∏è‚É£ Test Query:")
# Remember the outcome so the conclusion can report a failing or empty query
has_results = False
query_failed = False
try:
    test_results = collection.query(
        query_texts=["action"],
        n_results=1
    )
    has_results = len(test_results['ids'][0]) > 0 if test_results['ids'] else False
    print(f"   - Query 'action' retourne des r√©sultats: {has_results}")
except Exception as e:
    query_failed = True
    print(f"   - Erreur lors de la query: {e}")

# 5. Conclusion
print(f"\n" + "="*80)
if count == 0:
    print("‚ö†Ô∏è PROBL√àME: Collection vide!")
    print("   ‚Üí Vous devez embedder vos films avec embedding_manager.py")
elif not OPENAI_API_KEY:
    print("‚ö†Ô∏è PROBL√àME: Pas d'API Key OpenAI!")
    print("   ‚Üí V√©rifiez votre fichier .env")
elif query_failed:
    # Do not claim success when the sample query itself raised
    print("‚ö†Ô∏è PROBL√àME: Erreur lors de la query de test!")
    print("   ‚Üí Voir l'erreur dans la section 4 ci-dessus")
elif not has_results:
    # A non-empty collection that returns nothing is exactly the bug under study
    print("‚ö†Ô∏è PROBL√àME: La query de test ne retourne rien!")
    print("   ‚Üí La collection contient des documents mais la query 'action' ne retourne rien")
else:
    print("‚úÖ Tout semble correct!")
    print("   ‚Üí La recherche s√©mantique devrait fonctionner")
print("="*80)