In [15]:
# Importazione delle librerie necessarie per il grafo di conoscenza
import pandas as pd
import json
from pathlib import Path
from neo4j import GraphDatabase
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("Librerie importate con successo per la creazione del grafo di conoscenza")

Librerie importate con successo per la creazione del grafo di conoscenza


In [16]:
# Configurazione della connessione a Neo4j
# Avviarlo con Docker: docker run -p 7474:7474 -p 7687:7687 neo4j:latest

NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"  # Cambia con la tua password


class KnowledgeGraphBuilder:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def clear_database(self):
        """Pulisce il database per ricominciare da capo"""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            logger.info("Database pulito")

    def create_indexes(self):
        """Crea gli indici per migliorare le performance"""
        with self.driver.session() as session:
            # Indici per i prodotti
            session.run(
                "CREATE INDEX product_id IF NOT EXISTS FOR (p:Product) ON (p.product_id)"
            )
            session.run(
                "CREATE INDEX product_name IF NOT EXISTS FOR (p:Product) ON (p.name)"
            )

            # Indici per i documenti
            session.run(
                "CREATE INDEX document_filename IF NOT EXISTS FOR (d:Document) ON (d.filename)"
            )

            # Indici per le annotazioni
            session.run(
                "CREATE INDEX annotation_filename IF NOT EXISTS FOR (a:Annotation) ON (a.filename)"
            )

            logger.info("Indici creati")


print("Classe KnowledgeGraphBuilder definita")


Classe KnowledgeGraphBuilder definita


In [17]:
# Caricamento e preparazione dei dati CSV
def load_csv_data():
    """Carica tutti i file CSV da AdventureWorks"""
    data_path = Path("../data")

    # Leggiamo i CSV principali
    products_df = pd.read_csv(data_path / "Product.csv", sep=";")
    categories_df = pd.read_csv(data_path / "ProductCategory.csv", sep=";")
    descriptions_df = pd.read_csv(data_path / "ProductDescription.csv", sep=";")
    models_df = pd.read_csv(data_path / "ProductModel.csv", sep=";")

    logger.info(f"Caricati {len(products_df)} prodotti")
    logger.info(f"Caricate {len(categories_df)} categorie")
    logger.info(f"Caricate {len(descriptions_df)} descrizioni")
    logger.info(f"Caricati {len(models_df)} modelli")

    return {
        "products": products_df,
        "categories": categories_df,
        "descriptions": descriptions_df,
        "models": models_df,
    }


# Carica i dati
csv_data = load_csv_data()

# Esplora i primi record per vedere la struttura
print("\\nPrimi 3 prodotti:")
print(
    csv_data["products"][
        ["ProductID", "Name", "ProductCategoryID", "ProductModelID"]
    ].head(3)
)

print("\\nPrime 5 categorie:")
print(
    csv_data["categories"][
        ["ProductCategoryID", "ParentProductCategoryID", "Name"]
    ].head()
)


INFO:__main__:Caricati 100 prodotti
INFO:__main__:Caricate 41 categorie
INFO:__main__:Caricate 100 descrizioni
INFO:__main__:Caricati 100 modelli


\nPrimi 3 prodotti:
   ProductID                       Name  ProductCategoryID  ProductModelID
0        680  HL Road Frame - Black, 58                 18               6
1        706    HL Road Frame - Red, 58                 18               6
2        707      Sport-100 Helmet, Red                 35              33
\nPrime 5 categorie:
   ProductCategoryID  ParentProductCategoryID            Name
0                  1                      NaN           Bikes
1                  2                      NaN      Components
2                  3                      NaN        Clothing
3                  4                      NaN     Accessories
4                  5                      1.0  Mountain Bikes


In [18]:
# Analisi dei documenti e annotazioni in IngestedDocuments
def analyze_ingested_documents():
    """Analizza la struttura dei documenti e delle loro annotazioni"""
    docs_path = Path("../data/IngestedDocuments")

    # Raggruppa i file per base name (senza estensione e senza suffissi)
    files = list(docs_path.glob("*"))
    documents = {}

    for file in files:
        # Estrae il nome del documento
        name = file.name

        # Identifica il tipo di file
        if name.endswith(".pdf"):
            base_name = name.replace(".pdf", "")
            if base_name not in documents:
                documents[base_name] = {"pdf": None, "annotations": []}
            documents[base_name]["pdf"] = file

        elif name.endswith(".jpg"):
            # Rimuove " Fig X" dalla fine se presente
            base_name = name.replace(".jpg", "")
            if " Fig " in base_name:
                base_name = base_name.split(" Fig ")[0]
            if base_name not in documents:
                documents[base_name] = {"pdf": None, "annotations": []}
            documents[base_name]["annotations"].append(file)

        elif name.endswith(".json"):
            # Rimuove " Table X" dalla fine se presente
            base_name = name.replace(".json", "")
            if " Table " in base_name:
                base_name = base_name.split(" Table ")[0]
            if base_name not in documents:
                documents[base_name] = {"pdf": None, "annotations": []}
            documents[base_name]["annotations"].append(file)

    logger.info(f"Trovati {len(documents)} gruppi di documenti")

    for doc_name, doc_data in documents.items():
        print(f"\\nDocumento: {doc_name}")
        print(f"  PDF: {'Sì' if doc_data['pdf'] else 'Mancante'}")
        print(f"  Annotazioni: {len(doc_data['annotations'])}")
        for ann in doc_data["annotations"]:
            print(f"    - {ann.name}")

    return documents


# Analizza i documenti
document_structure = analyze_ingested_documents()


INFO:__main__:Trovati 4 gruppi di documenti


\nDocumento: Vintage Trailblazer X-1 Mountain Bike (1995)
  PDF: Sì
  Annotazioni: 3
    - Vintage Trailblazer X-1 Mountain Bike (1995) Table 1.json
    - Vintage Trailblazer X-1 Mountain Bike (1995).jpg
    - Vintage Trailblazer X-1 Mountain Bike (1995) Table 2.json
\nDocumento: LL Mountain Handlebars (Black)
  PDF: Sì
  Annotazioni: 2
    - LL Mountain Handlebars (Black) Table 1.json
    - LL Mountain Handlebars (Black) Fig 1.jpg
\nDocumento: Long-Sleeve Logo Jersey (M)
  PDF: Sì
  Annotazioni: 2
    - Long-Sleeve Logo Jersey (M) Table 1.json
    - Long-Sleeve Logo Jersey (M) Fig 1.jpg
\nDocumento: Mountain Bike Manual
  PDF: Sì
  Annotazioni: 2
    - Mountain Bike Manual Table 1.json
    - Mountain Bike Manual Table 2.json


In [19]:
# Estensione della classe KnowledgeGraphBuilder con metodi per creare i nodi
class KnowledgeGraphBuilder(KnowledgeGraphBuilder):
    def create_product_nodes(self, products_df, categories_df, models_df):
        """Crea i nodi per i prodotti con le loro proprietà"""
        with self.driver.session() as session:
            for _, product in products_df.iterrows():
                try:
                    # Trova la categoria (gestisce i valori NaN)
                    category_name = "Unknown"
                    if pd.notna(product.get("ProductCategoryID")):
                        category = categories_df[
                            categories_df["ProductCategoryID"]
                            == product["ProductCategoryID"]
                        ]
                        if not category.empty:
                            category_name = str(category["Name"].iloc[0])

                    # Trova il modello (gestisce i valori NaN)
                    model_name = "Unknown"
                    if pd.notna(product.get("ProductModelID")):
                        model = models_df[
                            models_df["ProductModelID"] == product["ProductModelID"]
                        ]
                        if not model.empty:
                            model_name = str(model["Name"].iloc[0])

                    # Crea il nodo prodotto
                    query = """
                    CREATE (p:Product {
                        product_id: $product_id,
                        name: $name,
                        product_number: $product_number,
                        color: $color,
                        standard_cost: $standard_cost,
                        list_price: $list_price,
                        size: $size,
                        weight: $weight,
                        category_id: $category_id,
                        category_name: $category_name,
                        model_id: $model_id,
                        model_name: $model_name,
                        sell_start_date: $sell_start_date
                    })
                    """

                    # Prepara i parametri con gestione sicura dei tipi
                    params = {
                        "product_id": int(product["ProductID"]),
                        "name": str(product["Name"]),
                        "product_number": str(product["ProductNumber"]),
                        "color": str(
                            product.get("Color", "")
                            if pd.notna(product.get("Color"))
                            else ""
                        ),
                        "standard_cost": float(
                            str(product.get("StandardCost", "0")).replace(",", ".")
                        )
                        if pd.notna(product.get("StandardCost"))
                        else 0.0,
                        "list_price": float(
                            str(product.get("ListPrice", "0")).replace(",", ".")
                        )
                        if pd.notna(product.get("ListPrice"))
                        else 0.0,
                        "size": str(
                            product.get("Size", "")
                            if pd.notna(product.get("Size"))
                            else ""
                        ),
                        "weight": str(
                            product.get("Weight", "")
                            if pd.notna(product.get("Weight"))
                            else ""
                        ),
                        "category_id": int(product["ProductCategoryID"])
                        if pd.notna(product.get("ProductCategoryID"))
                        else None,
                        "category_name": category_name,
                        "model_id": int(product["ProductModelID"])
                        if pd.notna(product.get("ProductModelID"))
                        else None,
                        "model_name": model_name,
                        "sell_start_date": str(
                            product.get("SellStartDate", "")
                            if pd.notna(product.get("SellStartDate"))
                            else ""
                        ),
                    }

                    session.run(query, **params)

                except Exception as e:
                    logger.error(
                        f"Errore nel creare il nodo per il prodotto {product.get('ProductID', 'Unknown')}: {e}"
                    )
                    continue

            logger.info(f"Creati {len(products_df)} nodi Product")


print("Metodo create_product_nodes aggiunto alla classe")


Metodo create_product_nodes aggiunto alla classe


In [20]:
# Aggiunta di metodi per creare documenti e annotazioni
class KnowledgeGraphBuilder(KnowledgeGraphBuilder):
    def create_document_nodes(self, document_structure):
        """Crea i nodi per i documenti PDF e le loro annotazioni"""
        with self.driver.session() as session:
            for doc_name, doc_data in document_structure.items():
                # Crea il nodo documento PDF se esiste
                if doc_data["pdf"]:
                    pdf_file = doc_data["pdf"]

                    query = """
                    CREATE (d:Document {
                        filename: $filename,
                        document_name: $document_name,
                        file_path: $file_path,
                        file_type: 'PDF',
                        file_size: $file_size
                    })
                    """

                    session.run(
                        query,
                        filename=pdf_file.name,
                        document_name=doc_name,
                        file_path=str(pdf_file),
                        file_size=pdf_file.stat().st_size if pdf_file.exists() else 0,
                    )

                    # Crea i nodi annotazione e le relazioni
                    for annotation_file in doc_data["annotations"]:
                        ann_type = (
                            "Image" if annotation_file.suffix == ".jpg" else "Table"
                        )

                        # Leggi il contenuto se è un JSON
                        content = None
                        if annotation_file.suffix == ".json":
                            try:
                                with open(annotation_file, "r") as f:
                                    content = json.load(f)
                            except Exception as e:
                                logger.warning(
                                    f"Errore nel leggere {annotation_file}: {e}"
                                )

                        # Crea il nodo annotazione
                        ann_query = """
                        CREATE (a:Annotation {
                            filename: $filename,
                            annotation_type: $annotation_type,
                            file_path: $file_path,
                            content: $content,
                            file_size: $file_size
                        })
                        """

                        session.run(
                            ann_query,
                            filename=annotation_file.name,
                            annotation_type=ann_type,
                            file_path=str(annotation_file),
                            content=json.dumps(content) if content else None,
                            file_size=annotation_file.stat().st_size
                            if annotation_file.exists()
                            else 0,
                        )

                        # Crea la relazione ANNOTATION tra documento e annotazione
                        rel_query = """
                        MATCH (d:Document {filename: $doc_filename})
                        MATCH (a:Annotation {filename: $ann_filename})
                        CREATE (a)-[:ANNOTATION]->(d)
                        """

                        session.run(
                            rel_query,
                            doc_filename=pdf_file.name,
                            ann_filename=annotation_file.name,
                        )

            logger.info(
                f"Creati nodi per {len(document_structure)} documenti e relative annotazioni"
            )


print("Metodo create_document_nodes aggiunto alla classe")


Metodo create_document_nodes aggiunto alla classe


In [21]:
# Aggiunta di metodi per creare relazioni tra prodotti
class KnowledgeGraphBuilder(KnowledgeGraphBuilder):
    def create_product_relationships(self):
        """Crea relazioni statiche/manuali tra prodotti"""
        with self.driver.session() as session:
            # 1. Relazioni SAME_CATEGORY tra prodotti della stessa categoria
            category_query = """
            MATCH (p1:Product), (p2:Product)
            WHERE p1.category_id = p2.category_id 
            AND p1.product_id <> p2.product_id
            AND p1.category_id IS NOT NULL
            CREATE (p1)-[:SAME_CATEGORY]->(p2)
            """
            result1 = session.run(category_query)
            logger.info("Relazioni SAME_CATEGORY create")

            # 2. Relazioni SAME_MODEL tra prodotti dello stesso modello
            model_query = """
            MATCH (p1:Product), (p2:Product)
            WHERE p1.model_id = p2.model_id 
            AND p1.product_id <> p2.product_id
            AND p1.model_id IS NOT NULL
            CREATE (p1)-[:SAME_MODEL]->(p2)
            """
            result2 = session.run(model_query)
            logger.info("Relazioni SAME_MODEL create")

            # 3. Relazioni SIMILAR_PRICE tra prodotti con prezzi simili (±20%)
            price_query = """
            MATCH (p1:Product), (p2:Product)
            WHERE p1.product_id <> p2.product_id
            AND p1.list_price > 0 AND p2.list_price > 0
            AND abs(p1.list_price - p2.list_price) / p1.list_price <= 0.20
            CREATE (p1)-[:SIMILAR_PRICE]->(p2)
            """
            result3 = session.run(price_query)
            logger.info("Relazioni SIMILAR_PRICE create")

            # 4. Relazioni manuali specifiche per prodotti molto correlati
            # Esempio: tutti i frame road sono correlati
            manual_relations = [
                {
                    "filter1": "p1.name CONTAINS 'Road Frame'",
                    "filter2": "p2.name CONTAINS 'Road Frame'",
                    "relation": "COMPATIBLE_PRODUCT",
                },
                {
                    "filter1": "p1.name CONTAINS 'Mountain'",
                    "filter2": "p2.name CONTAINS 'Mountain'",
                    "relation": "COMPATIBLE_PRODUCT",
                },
                {
                    "filter1": "p1.name CONTAINS 'Helmet'",
                    "filter2": "p2.name CONTAINS 'Jersey'",
                    "relation": "COMPLEMENTARY_PRODUCT",
                },
                {
                    "filter1": "p1.name CONTAINS 'Frame'",
                    "filter2": "p2.name CONTAINS 'Handlebars'",
                    "relation": "COMPLEMENTARY_PRODUCT",
                },
            ]

            for relation in manual_relations:
                manual_query = f"""
                MATCH (p1:Product), (p2:Product)
                WHERE {relation["filter1"]}
                AND {relation["filter2"]}
                AND p1.product_id <> p2.product_id
                CREATE (p1)-[:{relation["relation"]}]->(p2)
                """
                session.run(manual_query)
                logger.info(f"Relazioni {relation['relation']} create")


print("Metodo create_product_relationships aggiunto alla classe")


Metodo create_product_relationships aggiunto alla classe


In [22]:
# Metodi per collegare prodotti con documenti e query del grafo
class KnowledgeGraphBuilder(KnowledgeGraphBuilder):
    def create_product_document_relationships(self):
        """Crea relazioni tra prodotti e documenti basate sui nomi"""
        with self.driver.session() as session:
            # Cerca di collegare prodotti con documenti basandosi sui nomi
            # Ad esempio, "LL Mountain Handlebars" dovrebbe collegarsi con il documento PDF

            connect_query = """
            MATCH (p:Product), (d:Document)
            WHERE d.document_name CONTAINS p.name 
            OR p.name CONTAINS d.document_name
            OR (d.document_name CONTAINS 'Mountain' AND p.name CONTAINS 'Mountain')
            OR (d.document_name CONTAINS 'Handlebars' AND p.name CONTAINS 'Handlebars')
            OR (d.document_name CONTAINS 'Jersey' AND p.name CONTAINS 'Jersey')
            CREATE (p)-[:DESCRIBED_BY]->(d)
            """

            session.run(connect_query)
            logger.info("Relazioni DESCRIBED_BY tra prodotti e documenti create")

    def get_graph_statistics(self):
        """Restituisce statistiche sul grafo creato"""
        with self.driver.session() as session:
            stats = {}

            # Conta i nodi per tipo
            node_counts = session.run("""
            MATCH (n)
            RETURN labels(n)[0] as node_type, count(n) as count
            ORDER BY count DESC
            """)

            stats["nodes"] = {
                record["node_type"]: record["count"] for record in node_counts
            }

            # Conta le relazioni per tipo
            rel_counts = session.run("""
            MATCH ()-[r]->()
            RETURN type(r) as relationship_type, count(r) as count
            ORDER BY count DESC
            """)

            stats["relationships"] = {
                record["relationship_type"]: record["count"] for record in rel_counts
            }

            return stats

    def query_similar_products(self, product_id, limit=5):
        """Trova prodotti simili a quello specificato"""
        with self.driver.session() as session:
            query = """
            MATCH (p:Product {product_id: $product_id})-[r]-(similar:Product)
            RETURN similar.name as product_name, 
                   similar.product_id as product_id,
                   type(r) as relationship_type,
                   similar.list_price as price
            ORDER BY similar.list_price
            LIMIT $limit
            """

            result = session.run(query, product_id=product_id, limit=limit)
            return [dict(record) for record in result]


print("Metodi finali aggiunti alla classe KnowledgeGraphBuilder")


Metodi finali aggiunti alla classe KnowledgeGraphBuilder


In [23]:
# COSTRUZIONE DEL GRAFO DI CONOSCENZA

print("=== COSTRUZIONE DEL GRAFO DI CONOSCENZA ===")
print()
print("🔄 Questo script:")
print("   1. Legge i dati dai tuoi file CSV")
print("   2. Li carica in Neo4j come nodi del grafo")
print("   3. Crea relazioni intelligenti tra i nodi")
print("   4. Risultato: una knowledge base interrogabile!")
print()
print("🌐 URL Browser Neo4j: http://localhost:7474")
print("🔑 Login: neo4j / password")
print()


# Test della connessione prima di procedere
def test_neo4j_connection():
    try:
        test_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        with test_driver.session() as session:
            result = session.run("RETURN 'Neo4j is connected!' as message")
            message = result.single()["message"]
            test_driver.close()
            return True, message
    except Exception as e:
        return False, str(e)


connected, message = test_neo4j_connection()

if connected:
    print(f"✅ {message}")
    build_graph = True  # Automaticamente procediamo se la connessione funziona
else:
    print(f"❌ Errore di connessione: {message}")
    print()
    print("🐳 Per avviare Neo4j con Docker:")
    print(
        "docker run -p 7474:7474 -p 7687:7687 -d --env NEO4J_AUTH=neo4j/password neo4j:latest"
    )
    print()
    print("Poi riavvia questa cella!")
    build_graph = False

if build_graph:
    try:
        # Inizializza il builder
        kg_builder = KnowledgeGraphBuilder(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

        print("1. Pulizia del database...")
        kg_builder.clear_database()

        print("2. Creazione degli indici...")
        kg_builder.create_indexes()

        print("3. Creazione dei nodi prodotto...")
        kg_builder.create_product_nodes(
            csv_data["products"], csv_data["categories"], csv_data["models"]
        )

        print("4. Creazione dei nodi documento e annotazioni...")
        kg_builder.create_document_nodes(document_structure)

        print("5. Creazione delle relazioni tra prodotti...")
        kg_builder.create_product_relationships()

        print("6. Collegamento prodotti con documenti...")
        kg_builder.create_product_document_relationships()

        print("7. Statistiche del grafo creato:")
        stats = kg_builder.get_graph_statistics()
        print(f"   Nodi: {stats['nodes']}")
        print(f"   Relazioni: {stats['relationships']}")

        print("\\n=== GRAFO COMPLETATO! ===")
        print(
            "Puoi ora esplorare il grafo usando Neo4j Browser su http://localhost:7474"
        )
        print()
        print("🔍 Esempi di query Cypher per esplorare il grafo:")
        print("MATCH (n) RETURN count(n) as total_nodes")
        print("MATCH (p:Product) RETURN p.name, p.category_name LIMIT 10")
        print(
            "MATCH (d:Document)<-[:ANNOTATION]-(a:Annotation) RETURN d.document_name, count(a) as annotations"
        )

        kg_builder.close()

    except Exception as e:
        print(f"ERRORE: {e}")
        print(
            "Verifica che Neo4j sia in esecuzione e che le credenziali siano corrette."
        )
else:
    print("⚠️ Neo4j non è connesso.")
    print("🐳 Per avviare Neo4j con Docker:")
    print(
        "docker run -p 7474:7474 -p 7687:7687 -d --env NEO4J_AUTH=neo4j/password neo4j:latest"
    )


=== COSTRUZIONE DEL GRAFO DI CONOSCENZA ===

🔄 Questo script:
   1. Legge i dati dai tuoi file CSV
   2. Li carica in Neo4j come nodi del grafo
   3. Crea relazioni intelligenti tra i nodi
   4. Risultato: una knowledge base interrogabile!

🌐 URL Browser Neo4j: http://localhost:7474
🔑 Login: neo4j / password

✅ Neo4j is connected!
1. Pulizia del database...


INFO:__main__:Database pulito
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE RANGE INDEX product_id IF NOT EXISTS FOR (e:Product) ON (e.product_id)` has no effect.} {description: `RANGE INDEX product_id FOR (e:Product) ON (e.product_id)` already exists.} {position: None} for query: 'CREATE INDEX product_id IF NOT EXISTS FOR (p:Product) ON (p.product_id)'
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE RANGE INDEX product_name IF NOT EXISTS FOR (e:Product) ON (e.name)` has no effect.} {description: `RANGE INDEX product_name FOR (e:Product) ON (e.name)` already exists.} {position: None} for query: 'CREATE INDEX product_name IF NOT EXISTS FOR (p:Product) ON (p.name)'
INFO:neo4j.notifications:Received no

2. Creazione degli indici...
3. Creazione dei nodi prodotto...


INFO:__main__:Creati 100 nodi Product
INFO:__main__:Creati nodi per 4 documenti e relative annotazioni
INFO:__main__:Relazioni SAME_CATEGORY create
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (p2))} {position: line: 2, column: 13, offset: 13} for query: '\n            MATCH (p1:Product), (p2:Product)\n            WHERE p1.category_id = p2.category_id \n     

4. Creazione dei nodi documento e annotazioni...
5. Creazione delle relazioni tra prodotti...


INFO:__main__:Relazioni SAME_MODEL create
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (p2))} {position: line: 2, column: 13, offset: 13} for query: '\n            MATCH (p1:Product), (p2:Product)\n            WHERE p1.model_id = p2.model_id \n            AND p1.product_id <> p2.product_id\n            AND p1.model_id IS NOT NULL\n            CREATE (p1)-[:SA

6. Collegamento prodotti con documenti...
7. Statistiche del grafo creato:
   Nodi: {'Product': 61, 'Annotation': 9, 'Document': 4}
   Relazioni: {'SAME_CATEGORY': 1510, 'SIMILAR_PRICE': 572, 'COMPATIBLE_PRODUCT': 380, 'SAME_MODEL': 354, 'DESCRIBED_BY': 60, 'ANNOTATION': 9}
\n=== GRAFO COMPLETATO! ===
Puoi ora esplorare il grafo usando Neo4j Browser su http://localhost:7474

🔍 Esempi di query Cypher per esplorare il grafo:
MATCH (n) RETURN count(n) as total_nodes
MATCH (p:Product) RETURN p.name, p.category_name LIMIT 10
MATCH (d:Document)<-[:ANNOTATION]-(a:Annotation) RETURN d.document_name, count(a) as annotations


In [24]:
# MATCH (n)-[r:DESCRIBED_BY|ANNOTATION]-(m)
# RETURN n, r, m

In [25]:
# ESEMPI DI QUERY SUL GRAFO
# Questi esempi funzionano solo dopo aver costruito il grafo


def demo_queries():
    """Esegue alcune query di esempio sul grafo"""
    try:
        kg_builder = KnowledgeGraphBuilder(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

        print("=== ESEMPI DI QUERY SUL GRAFO ===\\n")

        # 1. Trova un prodotto e i suoi simili
        print("1. Prodotti simili al primo helmet:")
        with kg_builder.driver.session() as session:
            # Trova il primo helmet
            helmet_result = session.run("""
            MATCH (p:Product) 
            WHERE p.name CONTAINS 'Helmet' 
            RETURN p.product_id as id, p.name as name 
            LIMIT 1
            """)

            helmet = helmet_result.single()
            if helmet:
                print(f"   Prodotto base: {helmet['name']} (ID: {helmet['id']})")

                # Trova prodotti simili
                similar = kg_builder.query_similar_products(helmet["id"], 3)
                for product in similar:
                    print(
                        f"   → {product['product_name']} ({product['relationship_type']}) - €{product['price']}"
                    )
            else:
                print("   Nessun helmet trovato")

        print("\\n2. Documenti e le loro annotazioni:")
        with kg_builder.driver.session() as session:
            docs_result = session.run("""
            MATCH (d:Document)<-[:ANNOTATION]-(a:Annotation)
            RETURN d.document_name as doc_name, 
                   collect(a.annotation_type) as annotation_types,
                   count(a) as annotation_count
            """)

            for record in docs_result:
                print(
                    f"   📄 {record['doc_name']}: {record['annotation_count']} annotazioni ({', '.join(record['annotation_types'])})"
                )

        print("\\n3. Prodotti per categoria:")
        with kg_builder.driver.session() as session:
            category_result = session.run("""
            MATCH (p:Product)
            WHERE p.category_name IS NOT NULL
            RETURN p.category_name as category, count(p) as product_count
            ORDER BY product_count DESC
            LIMIT 5
            """)

            for record in category_result:
                print(f"   🏷️ {record['category']}: {record['product_count']} prodotti")

        print("\\n4. Relazioni prodotto-documento:")
        with kg_builder.driver.session() as session:
            rel_result = session.run("""
            MATCH (p:Product)-[:DESCRIBED_BY]->(d:Document)
            RETURN p.name as product_name, d.document_name as document_name
            LIMIT 5
            """)

            for record in rel_result:
                print(
                    f"   📦 {record['product_name']} ← → 📄 {record['document_name']}"
                )

        kg_builder.close()

        print("\\n=== QUERY CYPHER UTILI ===")
        print("Puoi eseguire queste query nel Neo4j Browser (http://localhost:7474):")
        print()
        print("# Visualizza tutto il grafo (attenzione con grafi grandi!):")
        print("MATCH (n)-[r]-(m) RETURN n,r,m LIMIT 50")
        print()
        print("# Trova prodotti di una categoria specifica:")
        print("MATCH (p:Product) WHERE p.category_name = 'Road Bikes' RETURN p")
        print()
        print("# Trova documenti con le loro annotazioni:")
        print("MATCH (d:Document)<-[:ANNOTATION]-(a:Annotation) RETURN d,a")
        print()
        print("# Trova percorsi tra due prodotti:")
        print(
            "MATCH path = (p1:Product)-[*1..3]-(p2:Product) WHERE p1.name CONTAINS 'Frame' AND p2.name CONTAINS 'Handlebars' RETURN path LIMIT 10"
        )

    except Exception as e:
        print(f"ERRORE nelle query: {e}")
        print(
            "Assicurati che il grafo sia stato costruito e che Neo4j sia in esecuzione."
        )


# Esegui le query demo solo se il grafo è già stato costruito
try:
    # Test rapido di connessione
    test_kg = KnowledgeGraphBuilder(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
    with test_kg.driver.session() as session:
        result = session.run("MATCH (n) RETURN count(n) as node_count").single()
        if result["node_count"] > 0:
            test_kg.close()
            demo_queries()
        else:
            test_kg.close()
            print(
                "Il grafo sembra vuoto. Costruisci prima il grafo eseguendo la cella precedente con build_graph = True"
            )
except:
    print("Neo4j non è raggiungibile o il grafo non è stato ancora costruito.")


=== ESEMPI DI QUERY SUL GRAFO ===\n
1. Prodotti simili al primo helmet:
   Prodotto base: Sport-100 Helmet, Black (ID: 708)
   → Sport-100 Helmet, Red (SIMILAR_PRICE) - €34.99
   → Sport-100 Helmet, Red (SIMILAR_PRICE) - €34.99
   → Sport-100 Helmet, Blue (SIMILAR_PRICE) - €34.99
\n2. Documenti e le loro annotazioni:
   📄 Vintage Trailblazer X-1 Mountain Bike (1995): 3 annotazioni (Table, Image, Table)
   📄 LL Mountain Handlebars (Black): 2 annotazioni (Table, Image)
   📄 Long-Sleeve Logo Jersey (M): 2 annotazioni (Table, Image)
   📄 Mountain Bike Manual: 2 annotazioni (Table, Table)
\n3. Prodotti per categoria:
   🏷️ Road Bikes: 35 prodotti
   🏷️ Mountain Bikes: 18 prodotti
   🏷️ Helmets: 3 prodotti
   🏷️ Forks: 3 prodotti
   🏷️ Socks: 2 prodotti
\n4. Relazioni prodotto-documento:
   📦 Mountain-300 Black, 38 ← → 📄 Vintage Trailblazer X-1 Mountain Bike (1995)
   📦 Mountain-200 Silver, 42 ← → 📄 Vintage Trailblazer X-1 Mountain Bike (1995)
   📦 Mountain-300 Black, 48 ← → 📄 Vintage Trailb

In [26]:
# EMBEDDINGS
# This system automatically adapts to any table structure and generates embeddings

import numpy as np
import pickle
from sentence_transformers import SentenceTransformer


class DynamicEmbeddingGenerator:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Initialize with a sentence transformer model"""
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.embeddings_data = {"embeddings": [], "metadata": [], "texts": []}

    def analyze_column_content(self, df, col):
        """Analyze column content to determine its characteristics"""
        non_null_values = df[col].dropna()
        if len(non_null_values) == 0:
            return {"type": "empty"}

        # Convert to strings for analysis
        str_values = non_null_values.astype(str)
        sample_values = str_values.head(10).tolist()

        # Analyze patterns in the actual data
        analysis = {
            "type": "data",
            "unique_ratio": len(non_null_values.unique()) / len(non_null_values),
            "avg_length": str_values.str.len().mean(),
            "max_length": str_values.str.len().max(),
            "has_numbers": any(any(c.isdigit() for c in val) for val in sample_values),
            "all_numeric": all(
                self._is_numeric(val) for val in sample_values if val.strip()
            ),
            "has_dates": any(self._looks_like_date(val) for val in sample_values),
            "has_urls": any(
                "http" in val.lower() or "www." in val.lower() for val in sample_values
            ),
            "has_guids": any(self._looks_like_guid(val) for val in sample_values),
            "is_boolean": all(
                val.lower() in ["true", "false", "1", "0", "yes", "no"]
                for val in sample_values
            ),
            "sample_values": sample_values[:3],  # Keep a few examples
        }

        return analysis

    def _is_numeric(self, val):
        """Check if a value is numeric"""
        try:
            float(str(val).replace(",", ""))
            return True
        except:
            return False

    def _looks_like_date(self, val):
        """Check if a value looks like a date"""
        val = str(val)
        return (
            len(val.split("-")) == 3
            or len(val.split("/")) == 3
            or len(val.split(" ")) >= 2
            and any(c.isdigit() for c in val)
        )

    def _looks_like_guid(self, val):
        """Check if a value looks like a GUID"""
        val = str(val)
        return len(val.replace("-", "")) == 32 and all(
            c in "0123456789abcdefABCDEF-" for c in val
        )

    def create_smart_text_representation(self, row, df, table_name=None):
        """Create text representation based on actual data patterns, not column names"""
        text_parts = []

        # Add table context
        if table_name:
            text_parts.append(f"Table: {table_name}")

        # Analyze and categorize fields by semantic importance
        high_importance = []  # Descriptive text, names, etc.
        medium_importance = []  # Categories, types, etc.
        low_importance = []  # IDs, technical fields, etc.

        for col in df.columns:
            value = row.get(col)
            if pd.isna(value) or str(value).strip() == "":
                continue

            analysis = self.analyze_column_content(df, col)

            # Skip empty or system fields
            if analysis["type"] == "empty" or analysis.get("has_guids", False):
                continue

            str_value = str(value).strip()
            field_text = f"{col}: {str_value}"

            # Categorize by semantic value for embeddings
            if (
                analysis.get("avg_length", 0) > 20
                and analysis.get("unique_ratio", 0) > 0.7
            ):
                # Long, unique text - likely descriptions, names
                high_importance.append(field_text)
            elif analysis.get("avg_length", 0) > 5 and not analysis.get(
                "all_numeric", False
            ):
                # Medium text, not purely numeric - likely categories, types
                medium_importance.append(field_text)
            elif analysis.get("all_numeric", False) and not analysis.get(
                "has_dates", False
            ):
                # Pure numbers - measurements, prices, quantities
                low_importance.append(field_text)
            elif analysis.get("unique_ratio", 0) > 0.9:
                # Highly unique - likely identifiers (but include for context)
                low_importance.append(field_text)
            else:
                # Everything else
                medium_importance.append(field_text)

        # Combine in order of semantic importance
        all_fields = high_importance + medium_importance + low_importance
        text_parts.extend(all_fields)

        return (
            ". ".join(text_parts) if text_parts else f"Table: {table_name}. Empty row."
        )

    def create_enhanced_text_representation(self, row, df, table_name=None):
        """Create enhanced text with additional context and relationships"""
        # Get the basic smart representation
        basic_text = self.create_smart_text_representation(row, df, table_name)

        # Add statistical context about the data
        enhanced_parts = [basic_text]

        # Add table-level context
        if df is not None:
            total_rows = len(df)
            total_cols = len(df.columns)
            enhanced_parts.append(
                f"This is one of {total_rows} records with {total_cols} attributes"
            )

        return ". ".join(enhanced_parts)

    def process_csv_table(self, csv_path, related_data=None):
        """Process any CSV table dynamically"""
        print(f"Processing CSV: {csv_path}")

        # Load the CSV
        df = pd.read_csv(csv_path, sep=";")
        table_name = Path(csv_path).stem

        # Analyze table structure (for metadata, not for hardcoded rules)
        print(
            f"  Table: {table_name} with {len(df)} rows and {len(df.columns)} columns"
        )
        print(f"  Columns: {list(df.columns)}")

        # Generate text representation for each row
        for idx, row in df.iterrows():
            try:
                text = self.create_smart_text_representation(row, df, table_name)

                # Create metadata
                metadata = {
                    "id": f"{table_name}_{idx}",
                    "type": "database_table",
                    "source_table": table_name,
                    "source_file": str(csv_path),
                    "row_index": idx,
                    "table_columns": list(df.columns),
                    "total_rows": len(df),
                    "total_columns": len(df.columns),
                }

                # Try to find a primary identifier (any column with high uniqueness)
                for col in df.columns:
                    if pd.notna(row.get(col)):
                        analysis = self.analyze_column_content(df, col)
                        if (
                            analysis.get("unique_ratio", 0) > 0.9
                        ):  # Highly unique = likely ID
                            metadata["entity_id"] = row[col]
                            break

                self.embeddings_data["texts"].append(text)
                self.embeddings_data["metadata"].append(metadata)

            except Exception as e:
                print(f"  Error processing row {idx}: {e}")
                continue

        print(f"  Processed {len(df)} rows from {table_name}")

    def flatten_json_to_text(self, json_data, prefix=""):
        """Recursively flatten JSON to natural language"""
        text_parts = []

        if isinstance(json_data, dict):
            for key, value in json_data.items():
                if isinstance(value, dict):
                    # Recursive case
                    nested_parts = self.flatten_json_to_text(value, f"{prefix}{key}: ")
                    text_parts.extend(nested_parts)
                elif isinstance(value, list):
                    # Handle lists
                    list_items = [str(item) for item in value]
                    text_parts.append(f"{prefix}{key}: {', '.join(list_items)}")
                else:
                    # Base case
                    text_parts.append(f"{prefix}{key}: {value}")
        elif isinstance(json_data, list):
            for i, item in enumerate(json_data):
                nested_parts = self.flatten_json_to_text(
                    item, f"{prefix}Item {i + 1}: "
                )
                text_parts.extend(nested_parts)
        else:
            text_parts.append(f"{prefix}{json_data}")

        return text_parts

    def analyze_json_content(self, json_data):
        """Analyze JSON content to understand its structure and characteristics"""
        analysis = {
            'total_keys': 0,
            'max_depth': 0,
            'has_arrays': False,
            'has_nested_objects': False,
            'key_types': {},
            'content_indicators': [],
            'data_complexity': 'simple'
        }
        
        def analyze_recursive(data, depth=0):
            analysis['max_depth'] = max(analysis['max_depth'], depth)
            
            if isinstance(data, dict):
                analysis['total_keys'] += len(data.keys())
                for key, value in data.items():
                    # Collect key names for content analysis (but don't hardcode meanings)
                    key_lower = str(key).lower()
                    if len(key_lower) > 2:  # Only meaningful keys
                        analysis['content_indicators'].append(key_lower)
                    
                    if isinstance(value, dict):
                        analysis['has_nested_objects'] = True
                        analyze_recursive(value, depth + 1)
                    elif isinstance(value, list):
                        analysis['has_arrays'] = True
                        if value:  # If list is not empty
                            analyze_recursive(value[0], depth + 1)
                    else:
                        # Categorize value types
                        value_type = type(value).__name__
                        analysis['key_types'][value_type] = analysis['key_types'].get(value_type, 0) + 1
            
            elif isinstance(data, list):
                analysis['has_arrays'] = True
                if data:
                    analyze_recursive(data[0], depth + 1)
        
        analyze_recursive(json_data)
        
        # Determine complexity based on structure
        if analysis['max_depth'] > 3 or analysis['total_keys'] > 20:
            analysis['data_complexity'] = 'complex'
        elif analysis['max_depth'] > 1 or analysis['total_keys'] > 5:
            analysis['data_complexity'] = 'moderate'
        
        return analysis
    
    def create_smart_json_text(self, json_data, filename, parent_document=None):
        """Create smart text representation of JSON data without hardcoded keywords"""
        # Flatten JSON to text parts
        text_parts = self.flatten_json_to_text(json_data)
        
        # Analyze the JSON structure
        analysis = self.analyze_json_content(json_data)
        
        # Create document context
        document_context = parent_document or filename
        base_text = f"Document: {document_context}. "
        
        # Add structural information based on analysis
        if analysis['data_complexity'] == 'complex':
            base_text += f"Complex structured data with {analysis['max_depth']} nested levels and {analysis['total_keys']} data fields. "
        elif analysis['data_complexity'] == 'moderate':
            base_text += f"Structured data with {analysis['total_keys']} fields. "
        
        if analysis['has_arrays']:
            base_text += "Contains multiple data entries. "
        if analysis['has_nested_objects']:
            base_text += "Contains hierarchical information. "
        
        # Add the actual content
        content_text = ". ".join(text_parts)
        full_text = base_text + content_text
        
        return full_text, analysis
    
    def process_json_table(self, json_path, parent_document=None):
        """Process JSON table files dynamically without hardcoded assumptions"""
        print(f"Processing JSON: {json_path}")
        
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            
            # Create smart text representation
            filename = Path(json_path).stem
            full_text, analysis = self.create_smart_json_text(json_data, filename, parent_document)
            
            metadata = {
                "id": f"json_{filename}",
                "type": "json_table",
                "source_file": str(json_path),
                "parent_document": parent_document,
                "json_keys": list(json_data.keys()) if isinstance(json_data, dict) else [],
                "structure_analysis": analysis,
                "content_preview": str(json_data)[:200] + "..." if len(str(json_data)) > 200 else str(json_data)
            }
            
            self.embeddings_data["texts"].append(full_text)
            self.embeddings_data["metadata"].append(metadata)
            
            print(f"  Processed JSON table: {filename}")
            print(f"    Structure: {analysis['total_keys']} keys, {analysis['data_complexity']} complexity, depth {analysis['max_depth']}")
            
        except Exception as e:
            print(f"  Error processing JSON {json_path}: {e}")

    def process_all_data(self, data_dir="../data"):
        """Process all data files in the directory"""
        data_path = Path(data_dir)

        print("🔄 Starting dynamic embedding generation...")

        # 1. Process all CSV files
        csv_files = list(data_path.glob("*.csv"))

        # Load related data for context (like categories for hierarchy)
        related_data = {}
        for csv_file in csv_files:
            if "category" in csv_file.name.lower():
                related_data["categories"] = pd.read_csv(csv_file, sep=";")

        for csv_file in csv_files:
            self.process_csv_table(csv_file, related_data)

        # 2. Process JSON tables from documents
        json_files = list((data_path / "IngestedDocuments").glob("*.json"))

        for json_file in json_files:
            # Extract parent document name
            parent_doc = json_file.stem
            if " Table " in parent_doc:
                parent_doc = parent_doc.split(" Table ")[0]

            self.process_json_table(json_file, parent_doc)

        print(f"✅ Processed {len(self.embeddings_data['texts'])} total text entries")

    def generate_embeddings(self):
        """Generate embeddings for all collected texts"""
        if not self.embeddings_data["texts"]:
            print("No texts to embed!")
            return

        print(
            f"Generating embeddings for {len(self.embeddings_data['texts'])} texts..."
        )

        # Generate embeddings in batches for memory efficiency
        batch_size = 32
        all_embeddings = []

        for i in range(0, len(self.embeddings_data["texts"]), batch_size):
            batch_texts = self.embeddings_data["texts"][i : i + batch_size]
            batch_embeddings = self.model.encode(batch_texts)
            all_embeddings.extend(batch_embeddings)

            print(
                f"  Processed batch {i // batch_size + 1}/{(len(self.embeddings_data['texts']) - 1) // batch_size + 1}"
            )

        self.embeddings_data["embeddings"] = np.array(all_embeddings)
        print(
            f"Generated embeddings shape: {self.embeddings_data['embeddings'].shape}"
        )

    def save_embeddings(self, output_path="knowledge_graph_embeddings.pkl"):
        """Save embeddings with metadata"""
        print(f"💾 Saving embeddings to {output_path}")

        # Get model name safely
        try:
            model_name = getattr(self.model, 'model_name', 'unknown')
            if model_name == 'unknown':
                # Try alternative ways to get model name
                if hasattr(self.model, '_modules') and '0' in self.model._modules:
                    model_name = getattr(self.model._modules['0'], 'model_name', 'all-MiniLM-L6-v2')
                else:
                    model_name = 'all-MiniLM-L6-v2'  # Default fallback
        except:
            model_name = 'all-MiniLM-L6-v2'

        # Add generation metadata
        self.embeddings_data["generation_info"] = {
            "model_name": model_name,
            "total_entries": len(self.embeddings_data["texts"]),
            "embedding_dimension": self.embeddings_data["embeddings"].shape[1]
            if len(self.embeddings_data["embeddings"]) > 0
            else 0,
            "generation_timestamp": pd.Timestamp.now().isoformat(),
        }

        with open(output_path, "wb") as f:
            pickle.dump(self.embeddings_data, f)

        print(
            f"Saved {len(self.embeddings_data['texts'])} embeddings to {output_path}"
        )

        # Print summary
        print("EMBEDDING SUMMARY:")
        print(f"   Total embeddings: {len(self.embeddings_data['texts'])}")
        print(f"   Embedding dimension: {self.embeddings_data['embeddings'].shape[1]}")

        # Count by type
        type_counts = {}
        for metadata in self.embeddings_data["metadata"]:
            type_key = metadata["type"]
            type_counts[type_key] = type_counts.get(type_key, 0) + 1

        for type_name, count in type_counts.items():
            print(f"   {type_name}: {count}")

    def load_embeddings(self, input_path="knowledge_graph_embeddings.pkl"):
        """Load previously saved embeddings"""
        print(f"📂 Loading embeddings from {input_path}")

        with open(input_path, "rb") as f:
            self.embeddings_data = pickle.load(f)

        print(f"✅ Loaded {len(self.embeddings_data['texts'])} embeddings")
        return self.embeddings_data
    
    # 🎯 SIMPLE ALTERNATIVE METHODS (if you prefer np.save approach)
    def save_embeddings_simple(self, base_path="embeddings"):
        """Save embeddings using simple np.save approach"""
        if not self.embeddings_data["embeddings"]:
            print("No embeddings to save!")
            return
            
        print(f"💾 Saving embeddings using simple np.save approach...")
        
        # Save just the embeddings matrix
        np.save(f"{base_path}.npy", self.embeddings_data["embeddings"])
        
        # Save texts and metadata separately as JSON for human readability
        import json
        simple_data = {
            "texts": self.embeddings_data["texts"],
            "metadata": self.embeddings_data["metadata"]
        }
        
        with open(f"{base_path}_metadata.json", "w") as f:
            json.dump(simple_data, f, indent=2)
        
        print(f"✅ Saved:")
        print(f"   - {base_path}.npy (embeddings matrix)")
        print(f"   - {base_path}_metadata.json (texts and metadata)")
    
    def load_embeddings_simple(self, base_path="embeddings"):
        """Load embeddings using simple np.load approach"""
        import json
        
        print(f"📂 Loading embeddings using simple np.load approach...")
        
        # Load embeddings matrix
        embeddings = np.load(f"{base_path}.npy")
        
        # Load texts and metadata
        with open(f"{base_path}_metadata.json", "r") as f:
            simple_data = json.load(f)
        
        self.embeddings_data = {
            "embeddings": embeddings,
            "texts": simple_data["texts"],
            "metadata": simple_data["metadata"]
        }
        
        print(f"✅ Loaded {len(self.embeddings_data['texts'])} embeddings")
        return self.embeddings_data
    
    # 🔗 NEO4J INTEGRATION METHODS
    def create_embedding_index_mapping(self):
        """Create a mapping from metadata IDs to embedding indices"""
        embedding_index = {}
        for idx, metadata in enumerate(self.embeddings_data["metadata"]):
            # Create multiple ways to find embeddings
            embedding_index[metadata["id"]] = idx
            
            # For database tables, also index by entity_id if available
            if metadata["type"] == "database_table" and "entity_id" in metadata:
                entity_key = f"{metadata['source_table']}_{metadata['entity_id']}"
                embedding_index[entity_key] = idx
            
            # For JSON tables, index by source file
            if metadata["type"] == "json_table":
                embedding_index[metadata["source_file"]] = idx
        
        return embedding_index
    
    def get_embedding_by_id(self, item_id):
        """Get embedding and metadata for a specific item"""
        embedding_index = self.create_embedding_index_mapping()
        
        if item_id in embedding_index:
            idx = embedding_index[item_id]
            return {
                "embedding": self.embeddings_data["embeddings"][idx],
                "text": self.embeddings_data["texts"][idx],
                "metadata": self.embeddings_data["metadata"][idx],
                "index": idx
            }
        return None
    
    def save_embeddings_with_neo4j_integration(self, output_path="knowledge_graph_embeddings.pkl", neo4j_builder=None):
        """Save embeddings and update Neo4j nodes with embedding references"""
        # Save embeddings normally first
        self.save_embeddings(output_path)
        
        if neo4j_builder is None:
            print("⚠️ No Neo4j builder provided - skipping graph integration")
            return
        
        print("🔗 Integrating embeddings with Neo4j knowledge graph...")
        
        # Create embedding index
        embedding_index = self.create_embedding_index_mapping()
        
        with neo4j_builder.driver.session() as session:
            # Update Product nodes with embedding references
            for idx, metadata in enumerate(self.embeddings_data["metadata"]):
                if metadata["type"] == "database_table":
                    source_table = metadata["source_table"]
                    
                    if source_table == "Product" and "entity_id" in metadata:
                        # Update Product nodes
                        query = """
                        MATCH (p:Product {product_id: $product_id})
                        SET p.embedding_id = $embedding_id,
                            p.embedding_index = $embedding_index,
                            p.embedding_file = $embedding_file,
                            p.has_embedding = true
                        """
                        session.run(query, 
                                   product_id=int(metadata["entity_id"]),
                                   embedding_id=metadata["id"],
                                   embedding_index=idx,
                                   embedding_file=output_path)
                    
                    elif source_table == "ProductCategory" and "entity_id" in metadata:
                        # Update Category nodes if they exist
                        query = """
                        MATCH (c:Category {category_id: $category_id})
                        SET c.embedding_id = $embedding_id,
                            c.embedding_index = $embedding_index,
                            c.embedding_file = $embedding_file,
                            c.has_embedding = true
                        """
                        try:
                            session.run(query,
                                       category_id=int(metadata["entity_id"]),
                                       embedding_id=metadata["id"],
                                       embedding_index=idx,
                                       embedding_file=output_path)
                        except:
                            pass  # Category nodes might not exist
                
                elif metadata["type"] == "json_table":
                    # Update Document/Annotation nodes
                    parent_doc = metadata.get("parent_document", "")
                    if parent_doc:
                        query = """
                        MATCH (d:Document) 
                        WHERE d.document_name CONTAINS $parent_doc
                        SET d.embedding_id = $embedding_id,
                            d.embedding_index = $embedding_index,
                            d.embedding_file = $embedding_file,
                            d.has_embedding = true
                        """
                        session.run(query,
                                   parent_doc=parent_doc,
                                   embedding_id=metadata["id"],
                                   embedding_index=idx,
                                   embedding_file=output_path)
        
        print(f"✅ Updated Neo4j nodes with embedding references")
        print(f"   - Embedding file: {output_path}")
        print(f"   - Total embeddings: {len(self.embeddings_data['embeddings'])}")


# 🔗 NEO4J + EMBEDDINGS HYBRID SEARCH CLASS
class HybridSearchEngine:
    def __init__(self, neo4j_builder, embedding_generator):
        """Initialize hybrid search with both Neo4j and embeddings"""
        self.neo4j = neo4j_builder
        self.embeddings = embedding_generator
        self.embedding_index = embedding_generator.create_embedding_index_mapping()
    
    def semantic_graph_search(self, query, top_k=5, graph_filter=None):
        """Search using embeddings + filter by graph properties"""
        print(f"🔍 Hybrid search for: '{query}'")
        
        # 1. Get semantic matches from embeddings
        query_embedding = self.embeddings.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings.embeddings_data["embeddings"])[0]
        
        # Get top candidates (more than needed)
        top_indices = np.argsort(similarities)[::-1][:top_k * 3]
        
        results = []
        with self.neo4j.driver.session() as session:
            for idx in top_indices:
                metadata = self.embeddings.embeddings_data["metadata"][idx]
                
                # 2. Check if this item exists in Neo4j and matches graph filter
                if metadata["type"] == "database_table" and metadata["source_table"] == "Product":
                    if "entity_id" in metadata:
                        # Query Neo4j for additional context
                        graph_query = """
                        MATCH (p:Product {product_id: $product_id})
                        OPTIONAL MATCH (p)-[r]-(related)
                        RETURN p, collect(DISTINCT type(r)) as relationships, 
                               collect(DISTINCT labels(related)) as related_types
                        """
                        
                        graph_result = session.run(graph_query, product_id=int(metadata["entity_id"]))
                        graph_data = graph_result.single()
                        
                        if graph_data and graph_data["p"]:
                            # Apply graph filter if provided
                            if graph_filter is None or self._matches_graph_filter(graph_data, graph_filter):
                                result = {
                                    "similarity": similarities[idx],
                                    "text": self.embeddings.embeddings_data["texts"][idx],
                                    "metadata": metadata,
                                    "graph_data": dict(graph_data["p"]),
                                    "relationships": graph_data["relationships"],
                                    "related_types": graph_data["related_types"]
                                }
                                results.append(result)
                                
                                if len(results) >= top_k:
                                    break
                else:
                    # Non-product items (documents, etc.)
                    result = {
                        "similarity": similarities[idx],
                        "text": self.embeddings.embeddings_data["texts"][idx],
                        "metadata": metadata,
                        "graph_data": None
                    }
                    results.append(result)
                    
                    if len(results) >= top_k:
                        break
        
        return results
    
    def _matches_graph_filter(self, graph_data, graph_filter):
        """Check if graph data matches the filter criteria"""
        # Example filters: {"category": "Bikes", "price_range": [100, 500]}
        product = graph_data["p"]
        
        if "category" in graph_filter:
            if product.get("category_name") != graph_filter["category"]:
                return False
        
        if "price_range" in graph_filter:
            price = product.get("list_price", 0)
            min_price, max_price = graph_filter["price_range"]
            if not (min_price <= price <= max_price):
                return False
        
        return True
    
    def find_similar_products_in_category(self, product_id, category=None, top_k=5):
        """Find products similar to given product, optionally within a category"""
        print(f"🔗 Finding similar products to {product_id}")
        
        # Get the embedding for the source product
        entity_key = f"Product_{product_id}"
        source_embedding_data = self.embeddings.get_embedding_by_id(entity_key)
        
        if not source_embedding_data:
            print(f"❌ No embedding found for product {product_id}")
            return []
        
        # Calculate similarities
        source_embedding = source_embedding_data["embedding"].reshape(1, -1)
        similarities = cosine_similarity(source_embedding, self.embeddings.embeddings_data["embeddings"])[0]
        
        # Get candidates
        top_indices = np.argsort(similarities)[::-1][1:top_k*2]  # Skip self
        
        results = []
        with self.neo4j.driver.session() as session:
            for idx in top_indices:
                metadata = self.embeddings.embeddings_data["metadata"][idx]
                
                if (metadata["type"] == "database_table" and 
                    metadata["source_table"] == "Product" and 
                    "entity_id" in metadata):
                    
                    # Get graph context
                    graph_query = """
                    MATCH (p:Product {product_id: $product_id})
                    RETURN p
                    """
                    
                    graph_result = session.run(graph_query, product_id=int(metadata["entity_id"]))
                    graph_data = graph_result.single()
                    
                    if graph_data and graph_data["p"]:
                        product = dict(graph_data["p"])
                        
                        # Apply category filter if specified
                        if category is None or product.get("category_name") == category:
                            result = {
                                "similarity": similarities[idx],
                                "product_id": product["product_id"],
                                "name": product["name"],
                                "category": product.get("category_name", "Unknown"),
                                "price": product.get("list_price", 0),
                                "text": self.embeddings.embeddings_data["texts"][idx]
                            }
                            results.append(result)
                            
                            if len(results) >= top_k:
                                break
        
        return results


print("🎯 Dynamic Embedding Generator with Neo4j Integration created!")


🎯 Dynamic Embedding Generator with Neo4j Integration created!


In [27]:
# 🚀 GENERATE EMBEDDINGS FOR ALL DATA
# This cell will process all your data and generate embeddings

# Set this to True to actually generate embeddings
generate_embeddings = True

if generate_embeddings:
    print("=== DYNAMIC EMBEDDING GENERATION ===\n")

    # Initialize the generator
    embedding_generator = DynamicEmbeddingGenerator(model_name="all-MiniLM-L6-v2")

    # Process all data (automatically detects table structures)
    embedding_generator.process_all_data("../data")

    print("\n" + "=" * 50)
    print("📋 PREVIEW OF GENERATED TEXTS:")
    print("=" * 50)

    # Show examples of the generated texts
    for i, (text, metadata) in enumerate(
        zip(
            embedding_generator.embeddings_data["texts"][:5],
            embedding_generator.embeddings_data["metadata"][:5],
        )
    ):
        print(f"\n🔸 Example {i + 1} ({metadata['type']}):")
        print(
            f"   Source: {metadata.get('source_table', metadata.get('source_file', 'unknown'))}"
        )
        print(f"   Text: {text[:150]}...")

    # Generate the actual embeddings
    embedding_generator.generate_embeddings()

    # Save embeddings with Neo4j integration
    try:
        # Try to connect to Neo4j for integration
        from notebooks.main import KnowledgeGraphBuilder, NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD
        kg_builder = KnowledgeGraphBuilder(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
        
        # Save with Neo4j integration
        embedding_generator.save_embeddings_with_neo4j_integration(
            "../data/knowledge_graph_embeddings.pkl", 
            kg_builder
        )
        kg_builder.close()
        
    except Exception as e:
        print(f"⚠️ Neo4j integration failed: {e}")
        print("Saving embeddings without Neo4j integration...")
        # Fallback to regular save
        embedding_generator.save_embeddings("../data/knowledge_graph_embeddings.pkl")

    print("\n🎉 EMBEDDINGS GENERATED AND SAVED!")
    print("You can now use these embeddings for semantic search, similarity, etc.")

else:
    print("⏸️ Embedding generation skipped (set generate_embeddings = True to run)")
    print("This process will:")
    print("1. 🔍 Automatically detect the structure of all your CSV tables")
    print("2. 📝 Generate natural language descriptions for each row")
    print("3. 🧠 Create embeddings using SentenceTransformers")
    print("4. 💾 Save everything with rich metadata for retrieval")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


=== DYNAMIC EMBEDDING GENERATION ===

Loading embedding model: all-MiniLM-L6-v2
🔄 Starting dynamic embedding generation...
Processing CSV: ../data/Product.csv
  Table: Product with 100 rows and 17 columns
  Columns: ['ProductID', 'Name', 'ProductNumber', 'Color', 'StandardCost', 'ListPrice', 'Size', 'Weight', 'ProductCategoryID', 'ProductModelID', 'SellStartDate', 'SellEndDate', 'DiscontinuedDate', 'ThumbNailPhoto', 'ThumbnailPhotoFileName', 'rowguid', 'ModifiedDate']
  Processed 100 rows from Product
Processing CSV: ../data/SalesOrderHeader.csv
  Table: SalesOrderHeader with 32 rows and 22 columns
  Columns: ['SalesOrderID', 'RevisionNumber', 'OrderDate', 'DueDate', 'ShipDate', 'Status', 'OnlineOrderFlag', 'SalesOrderNumber', 'PurchaseOrderNumber', 'AccountNumber', 'CustomerID', 'ShipToAddressID', 'BillToAddressID', 'ShipMethod', 'CreditCardApprovalCode', 'SubTotal', 'TaxAmt', 'Freight', 'TotalDue', 'Comment', 'rowguid', 'ModifiedDate']
  Processed 32 rows from SalesOrderHeader
Proces

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 1/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 2/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 3/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 4/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 5/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 6/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 7/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 8/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 9/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 10/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 11/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 12/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 13/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 14/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 15/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 16/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 17/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 18/19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed batch 19/19
Generated embeddings shape: (579, 384)
⚠️ Neo4j integration failed: No module named 'notebooks'
Saving embeddings without Neo4j integration...
💾 Saving embeddings to ../data/knowledge_graph_embeddings.pkl
Saved 579 embeddings to ../data/knowledge_graph_embeddings.pkl
EMBEDDING SUMMARY:
   Total embeddings: 579
   Embedding dimension: 384
   database_table: 573
   json_table: 6

🎉 EMBEDDINGS GENERATED AND SAVED!
You can now use these embeddings for semantic search, similarity, etc.


In [28]:
# 🔍 SEMANTIC SEARCH AND SIMILARITY UTILITIES
# Use the generated embeddings for intelligent queries

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


class EmbeddingSearchEngine:
    def __init__(self, embeddings_path="knowledge_graph_embeddings.pkl"):
        """Initialize search engine with pre-generated embeddings"""
        self.generator = DynamicEmbeddingGenerator()
        self.embeddings_data = self.generator.load_embeddings(embeddings_path)
        self.embeddings = self.embeddings_data["embeddings"]
        self.metadata = self.embeddings_data["metadata"]
        self.texts = self.embeddings_data["texts"]

    def semantic_search(self, query, top_k=5, filter_type=None):
        """Search for similar content using semantic similarity"""
        print(f"🔍 Searching for: '{query}'")

        # Generate embedding for query
        query_embedding = self.generator.model.encode([query])

        # Filter by type if specified
        valid_indices = range(len(self.embeddings))
        if filter_type:
            valid_indices = [
                i for i, meta in enumerate(self.metadata) if meta["type"] == filter_type
            ]
            print(f"   Filtering to {filter_type} only ({len(valid_indices)} items)")

        if not valid_indices:
            print("No items match the filter criteria!")
            return []

        # Calculate similarities
        filtered_embeddings = self.embeddings[valid_indices]
        similarities = cosine_similarity(query_embedding, filtered_embeddings)[0]

        # Get top results
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices):
            original_idx = valid_indices[idx]
            result = {
                "rank": rank + 1,
                "similarity": similarities[idx],
                "text": self.texts[original_idx],
                "metadata": self.metadata[original_idx],
            }
            results.append(result)

        return results

    def find_similar_items(self, item_id, top_k=5):
        """Find items similar to a specific item in the database"""
        # Find the item
        item_idx = None
        for i, meta in enumerate(self.metadata):
            if meta["id"] == item_id or str(meta.get("entity_id", "")) == str(item_id):
                item_idx = i
                break

        if item_idx is None:
            print(f"Item {item_id} not found!")
            return []

        print(f"🔍 Finding items similar to: {self.texts[item_idx][:100]}...")

        # Calculate similarities with all other items
        item_embedding = self.embeddings[item_idx : item_idx + 1]
        similarities = cosine_similarity(item_embedding, self.embeddings)[0]

        # Exclude the item itself
        similarities[item_idx] = -1

        # Get top results
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices):
            if similarities[idx] > 0:  # Only positive similarities
                result = {
                    "rank": rank + 1,
                    "similarity": similarities[idx],
                    "text": self.texts[idx],
                    "metadata": self.metadata[idx],
                }
                results.append(result)

        return results

    def get_statistics(self):
        """Get statistics about the embedding database"""
        stats = {
            "total_embeddings": len(self.embeddings),
            "embedding_dimension": self.embeddings.shape[1],
            "types": {},
        }

        for meta in self.metadata:
            type_key = meta["type"]
            stats["types"][type_key] = stats["types"].get(type_key, 0) + 1

        return stats


def demo_semantic_search():
    """Demonstrate semantic search capabilities"""
    try:
        # Initialize search engine
        search_engine = EmbeddingSearchEngine("../data/knowledge_graph_embeddings.pkl")

        print("=== SEMANTIC SEARCH DEMO ===\n")

        # Show statistics
        stats = search_engine.get_statistics()
        print(f"📊 Database contains {stats['total_embeddings']} items:")
        for type_name, count in stats["types"].items():
            print(f"   {type_name}: {count}")

        print(f"\n🧠 Embedding dimension: {stats['embedding_dimension']}")

        # Example searches
        example_queries = [
            "mountain bike frame",
            "red helmet",
            "road cycling equipment",
            "vintage bicycle parts",
        ]

        for query in example_queries:
            print(f"\n{'=' * 60}")
            print(f"🔍 QUERY: '{query}'")
            print("=" * 60)

            results = search_engine.semantic_search(query, top_k=3)

            for result in results:
                print(
                    f"\n🏆 Rank {result['rank']} (similarity: {result['similarity']:.3f})"
                )
                print(f"   Type: {result['metadata']['type']}")
                print(f"   Source: {result['metadata'].get('source_table', 'unknown')}")
                print(f"   Text: {result['text'][:200]}...")

        # Example: Find similar items to a specific product
        print(f"\n{'=' * 60}")
        print("🔗 FINDING SIMILAR ITEMS")
        print("=" * 60)

        # Try to find similar items to the first product
        if search_engine.metadata:
            first_item_id = search_engine.metadata[0]["id"]
            similar_items = search_engine.find_similar_items(first_item_id, top_k=3)

            for item in similar_items:
                print(f"\n🔗 Similarity: {item['similarity']:.3f}")
                print(f"   Type: {item['metadata']['type']}")
                print(f"   Text: {item['text'][:200]}...")

    except FileNotFoundError:
        print("❌ Embeddings file not found!")
        print("Run the embedding generation cell first.")
    except Exception as e:
        print(f"❌ Error: {e}")


print("🔍 Semantic Search Engine ready!")


🔍 Semantic Search Engine ready!


In [31]:
# 🔗 HYBRID NEO4J + EMBEDDINGS SEARCH DEMO
# This demonstrates the power of combining graph queries with semantic search

def demo_hybrid_search():
    """Demonstrate hybrid search capabilities"""
    try:
        # Initialize components
        kg_builder = KnowledgeGraphBuilder(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
        embedding_generator = DynamicEmbeddingGenerator()
        embedding_generator.load_embeddings("../data/knowledge_graph_embeddings.pkl")
        
        # Create hybrid search engine
        hybrid_search = HybridSearchEngine(kg_builder, embedding_generator)
        
        print("=== HYBRID SEARCH DEMO ===\n")
        
        # Example 1: Semantic search with graph filtering
        print("🔍 Example 1: Semantic search with price filter")
        print("Query: 'mountain bike' with price range $100-$500")
        
        results = hybrid_search.semantic_graph_search(
            "mountain bike", 
            top_k=3,
            graph_filter={"price_range": [100, 500]}
        )
        
        for i, result in enumerate(results, 1):
            print(f"\n{i}. Similarity: {result['similarity']:.3f}")
            if result['graph_data']:
                print(f"   Product: {result['graph_data']['name']}")
                print(f"   Category: {result['graph_data'].get('category_name', 'Unknown')}")
                print(f"   Price: ${result['graph_data'].get('list_price', 0)}")
                print(f"   Relationships: {result['relationships']}")
            else:
                print(f"   Text: {result['text'][:100]}...")
        
        # Example 2: Find similar products in same category
        print(f"\n{'='*60}")
        print("🔗 Example 2: Find similar products in same category")
        
        # Get first product ID from embeddings
        first_product_metadata = next(
            (m for m in embedding_generator.embeddings_data["metadata"] 
             if m["type"] == "database_table" and m["source_table"] == "Product" and "entity_id" in m),
            None
        )
        
        if first_product_metadata:
            product_id = first_product_metadata["entity_id"]
            print(f"Finding products similar to Product ID: {product_id}")
            
            similar_products = hybrid_search.find_similar_products_in_category(
                product_id, 
                category="Road Bikes",  # Filter to specific category
                top_k=3
            )
            
            for i, product in enumerate(similar_products, 1):
                print(f"\n{i}. Similarity: {product['similarity']:.3f}")
                print(f"   Product: {product['name']} (ID: {product['product_id']})")
                print(f"   Category: {product['category']}")
                print(f"   Price: ${product['price']}")
        
        # Example 3: Pure semantic search across all data types
        print(f"\n{'='*60}")
        print("🌐 Example 3: Cross-modal semantic search")
        print("Finding 'technical specifications' across products AND documents")
        
        results = hybrid_search.semantic_graph_search(
            "technical specifications materials", 
            top_k=5
        )
        
        for i, result in enumerate(results, 1):
            print(f"\n{i}. Similarity: {result['similarity']:.3f}")
            print(f"   Type: {result['metadata']['type']}")
            if result['graph_data']:
                print(f"   Product: {result['graph_data']['name']}")
            else:
                print(f"   Source: {result['metadata'].get('source_file', 'Unknown')}")
            print(f"   Text: {result['text'][:150]}...")
        
        kg_builder.close()
        
        print(f"\n{'='*60}")
        print("🎯 HYBRID SEARCH CAPABILITIES:")
        print("✅ Semantic similarity search across all data types")
        print("✅ Graph-based filtering (category, price, relationships)")
        print("✅ Cross-modal search (products + documents + JSON)")
        print("✅ Relationship-aware results")
        print("✅ Fast embedding lookup with graph context")
        
    except Exception as e:
        print(f"❌ Hybrid search demo failed: {e}")
        print("Make sure Neo4j is running and embeddings are generated!")

# Set to True to run the hybrid search demo
run_hybrid_demo = True

if run_hybrid_demo:
    demo_hybrid_search()
else:
    print("🔗 HYBRID SEARCH DEMO READY")
    print("Set run_hybrid_demo = True to see the hybrid Neo4j + embeddings search!")
    print()
    print("This demo will show:")
    print("1. 🔍 Semantic search with graph filters (e.g., price range, category)")
    print("2. 🔗 Find similar products within specific categories")
    print("3. 🌐 Cross-modal search across products, documents, and JSON data")
    print("4. 🎯 Relationship-aware results combining graph and vector data")
    print()
    print("Benefits of this hybrid approach:")
    print("• 🚀 Fast semantic search using embeddings")
    print("• 🎯 Precise filtering using graph relationships")
    print("• 🔄 Best of both worlds: similarity + structure")
    print("• 📊 Rich context from both vector and graph data")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading embedding model: all-MiniLM-L6-v2
📂 Loading embeddings from ../data/knowledge_graph_embeddings.pkl
✅ Loaded 579 embeddings
=== HYBRID SEARCH DEMO ===

🔍 Example 1: Semantic search with price filter
Query: 'mountain bike' with price range $100-$500
🔍 Hybrid search for: 'mountain bike'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


1. Similarity: 0.652
   Text: Table: ProductDescription. Description: Top-of-the-line competition mountain bike. Performance-enhan...

2. Similarity: 0.569
   Text: Table: ProductDescription. Description: Replacement mountain wheel for the casual to serious rider.....

3. Similarity: 0.566
   Text: Table: ProductDescription. Description: Serious back-country riding. Perfect for all levels of compe...

🔗 Example 2: Find similar products in same category
Finding products similar to Product ID: 680
🔗 Finding similar products to 680

🌐 Example 3: Cross-modal semantic search
Finding 'technical specifications' across products AND documents
🔍 Hybrid search for: 'technical specifications materials'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


1. Similarity: 0.483
   Type: database_table
   Source: ../data/ProductDescription.csv
   Text: Table: ProductDescription. Description: Lightweight aluminum alloy construction.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescriptionID: 851...

2. Similarity: 0.474
   Type: database_table
   Source: ../data/ProductDescription.csv
   Text: Table: ProductDescription. Description: Sturdy alloy features a quick-release hub.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescriptionID: 690...

3. Similarity: 0.470
   Type: database_table
   Source: ../data/ProductDescription.csv
   Text: Table: ProductDescription. Description: Travel in style and comfort. Designed for maximum comfort and safety. Wide gear range takes on all hills. High...

4. Similarity: 0.452
   Type: database_table
   Source: ../data/ProductDescription.csv
   Text: Table: ProductDescription. Description: Our lightest and best quality aluminum frame made from the newest alloy; it is welded and heat-treated for str...

5

In [33]:
# 🧠 INTELLIGENT QUERY PARSING SYSTEM
# This system parses user queries to extract relevant entities using LLM

import json
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define structured output format for query parsing
class ProductEntity(BaseModel):
    name: Optional[str] = None
    features: List[str] = Field(default_factory=list)
    category: Optional[str] = None

class DocumentEntity(BaseModel):
    type: Optional[str] = None
    name: Optional[str] = None

class RelationshipEntity(BaseModel):
    type: Optional[str] = None
    direction: Optional[str] = None

class QueryEntities(BaseModel):
    product: ProductEntity = Field(default_factory=ProductEntity)
    document: DocumentEntity = Field(default_factory=DocumentEntity)
    relationship: RelationshipEntity = Field(default_factory=RelationshipEntity)

class IntelligentQueryParser:
    def __init__(self, base_url="http://localhost:11434/v1", model_name="gemma3:1b"):
        """Initialize the query parser with local LLM"""
        self.client = OpenAI(
            base_url=base_url,
            api_key="local-llm",  # dummy key for local LLM
        )
        self.model_name = model_name
    
    def parse_query(self, query: str) -> Dict[str, Any]:
        """
        Parse user query to extract relevant entities
        
        Args:
            query: User's natural language query
            
        Returns:
            Dict with extracted entities: product, document, relationship
        """
        try:
            # Use structured output parsing
            completion = self.client.beta.chat.completions.parse(
                model=self.model_name,
                messages=[
                    {
                        "role": "system",
                        "content": """
                        You are a query parser for a product knowledge graph system.
                        Extract relevant entities from user queries about products, documents, and relationships.
                        
                        Extract these entities:
                        1. Product:
                           - name: main product name or identifier
                           - features: list of specific features (color, size, material, etc.)
                           - category: product category (bikes, components, clothing, accessories)
                        
                        2. Document:
                           - type: document type (manual, specification, guide, etc.)
                           - name: specific document name
                        
                        3. Relationship:
                           - type: relationship type (compatible_with, similar_to, part_of, etc.)
                           - direction: relationship direction (incoming, outgoing, bidirectional)
                        
                        If an entity is not mentioned in the query, set its values to null.
                        Extract information in English regardless of input language.
                        """,
                    },
                    {"role": "user", "content": query},
                ],
                response_format=QueryEntities,
            )
            
            # Parse the response
            result_str = completion.choices[0].message.content
            result_dict = json.loads(result_str)
            return QueryEntities(**result_dict).model_dump()
            
        except Exception as e:
            print(f"⚠️ Error parsing query: {e}")
            # Return empty structure on error
            return {
                "product": {"name": None, "features": [], "category": None},
                "document": {"type": None, "name": None},
                "relationship": {"type": None, "direction": None},
            }
    
    def create_search_text(self, parsed_query: Dict[str, Any]) -> str:
        """
        Convert parsed query entities back to searchable text
        
        Args:
            parsed_query: Output from parse_query()
            
        Returns:
            Optimized text for embedding search
        """
        search_parts = []
        
        # Add product information
        product = parsed_query.get("product", {})
        if product.get("name"):
            search_parts.append(f"Product: {product['name']}")
        if product.get("category"):
            search_parts.append(f"Category: {product['category']}")
        if product.get("features"):
            features_text = ", ".join(product["features"])
            search_parts.append(f"Features: {features_text}")
        
        # Add document information
        document = parsed_query.get("document", {})
        if document.get("type"):
            search_parts.append(f"Document type: {document['type']}")
        if document.get("name"):
            search_parts.append(f"Document: {document['name']}")
        
        # Add relationship information
        relationship = parsed_query.get("relationship", {})
        if relationship.get("type"):
            search_parts.append(f"Relationship: {relationship['type']}")
        
        return ". ".join(search_parts) if search_parts else ""

print("✅ Intelligent Query Parser created")


✅ Intelligent Query Parser created


In [34]:
# 🔍 COMPLETE EMBEDDING-BASED RAG SEARCH SYSTEM
# This system processes user queries and finds similar content using embeddings

class EmbeddingRAGSystem:
    def __init__(self, embeddings_path="knowledge_graph_embeddings.pkl", 
                 embedding_model_name='all-MiniLM-L6-v2'):
        """
        Initialize the RAG system with embeddings and query parsing
        
        Args:
            embeddings_path: Path to saved embeddings file
            embedding_model_name: Name of the sentence transformer model
        """
        print("🚀 Initializing Embedding RAG System...")
        
        # Initialize components
        self.query_parser = IntelligentQueryParser()
        self.embedding_generator = DynamicEmbeddingGenerator(embedding_model_name)
        
        # Load pre-generated embeddings
        self.load_embeddings(embeddings_path)
        
        print("✅ RAG System ready!")
    
    def load_embeddings(self, embeddings_path):
        """Load embeddings from file"""
        try:
            self.embeddings_data = self.embedding_generator.load_embeddings(embeddings_path)
            self.embeddings_matrix = np.array(self.embeddings_data['embeddings'])
            self.texts = self.embeddings_data['texts']
            self.metadata = self.embeddings_data['metadata']
            
            print(f"📊 Loaded {len(self.texts)} embeddings")
            print(f"🔢 Embedding dimensions: {self.embeddings_matrix.shape[1]}")
            
        except Exception as e:
            print(f"⚠️ Error loading embeddings: {e}")
            self.embeddings_matrix = np.array([])
            self.texts = []
            self.metadata = []
    
    def process_query(self, user_query: str, top_k: int = 5, 
                     similarity_threshold: float = 0.3) -> Dict[str, Any]:
        """
        Complete RAG pipeline: parse query → embed → search → return results
        
        Args:
            user_query: Natural language query from user
            top_k: Number of top results to return
            similarity_threshold: Minimum similarity score to include
            
        Returns:
            Dict with parsed query, search results, and metadata
        """
        print(f"🔍 Processing query: '{user_query}'")
        
        # Step 1: Parse the query using LLM
        print("📝 Step 1: Parsing query...")
        parsed_query = self.query_parser.parse_query(user_query)
        
        # Step 2: Create optimized search text
        search_text = self.query_parser.create_search_text(parsed_query)
        if not search_text:
            search_text = user_query  # Fallback to original query
        
        print(f"🎯 Search text: '{search_text}'")
        
        # Step 3: Generate embedding for search text
        print("🧠 Step 2: Generating query embedding...")
        query_embedding = self.embedding_generator.model.encode([search_text])
        
        # Step 4: Find similar embeddings
        print("🔍 Step 3: Searching for similar content...")
        search_results = self.find_similar_content(
            query_embedding[0], 
            top_k=top_k, 
            similarity_threshold=similarity_threshold
        )
        
        # Step 5: Compile comprehensive results
        results = {
            "original_query": user_query,
            "parsed_query": parsed_query,
            "search_text": search_text,
            "results": search_results,
            "summary": self.create_results_summary(search_results)
        }
        
        print(f"✅ Found {len(search_results)} relevant results")
        return results
    
    def find_similar_content(self, query_embedding, top_k=5, similarity_threshold=0.3):
        """
        Find content similar to query embedding
        
        Args:
            query_embedding: Numpy array of query embedding
            top_k: Number of top results
            similarity_threshold: Minimum similarity score
            
        Returns:
            List of similar content with metadata
        """
        if len(self.embeddings_matrix) == 0:
            return []
        
        # Calculate cosine similarities
        similarities = cosine_similarity([query_embedding], self.embeddings_matrix)[0]
        
        # Get top-k most similar indices
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        # Filter by threshold and compile results
        results = []
        for idx in top_indices:
            similarity_score = similarities[idx]
            
            if similarity_score >= similarity_threshold:
                result = {
                    "similarity_score": float(similarity_score),
                    "content": self.texts[idx],
                    "metadata": self.metadata[idx],
                    "index": int(idx)
                }
                results.append(result)
        
        return results
    
    def create_results_summary(self, search_results):
        """Create a summary of search results"""
        if not search_results:
            return "No relevant results found."
        
        # Count results by type
        type_counts = {}
        categories = set()
        
        for result in search_results:
            metadata = result["metadata"]
            result_type = metadata.get("type", "unknown")
            type_counts[result_type] = type_counts.get(result_type, 0) + 1
            
            if result_type == "database_table":
                categories.add(metadata.get("table_name", "unknown"))
        
        # Create summary
        summary_parts = [
            f"Found {len(search_results)} relevant results",
            f"Average similarity: {np.mean([r['similarity_score'] for r in search_results]):.3f}"
        ]
        
        if type_counts:
            type_summary = ", ".join([f"{count} {type_}" for type_, count in type_counts.items()])
            summary_parts.append(f"Types: {type_summary}")
        
        if categories:
            summary_parts.append(f"Categories: {', '.join(sorted(categories))}")
        
        return ". ".join(summary_parts)
    
    def search_by_category(self, user_query: str, category_filter: str, top_k: int = 5):
        """
        Search within a specific category only
        
        Args:
            user_query: User's query
            category_filter: Category to filter by (e.g., "database_table", "json_table")
            top_k: Number of results
            
        Returns:
            Filtered search results
        """
        # Get all results first
        all_results = self.process_query(user_query, top_k=50)  # Get more to filter
        
        # Filter by category
        filtered_results = []
        for result in all_results["results"]:
            if result["metadata"].get("type") == category_filter:
                filtered_results.append(result)
                if len(filtered_results) >= top_k:
                    break
        
        # Update results
        all_results["results"] = filtered_results
        all_results["summary"] = f"Category-filtered search: {self.create_results_summary(filtered_results)}"
        
        return all_results
    
    def get_content_statistics(self):
        """Get statistics about the loaded content"""
        if not self.metadata:
            return "No content loaded."
        
        # Count by type
        type_counts = {}
        table_counts = {}
        
        for meta in self.metadata:
            content_type = meta.get("type", "unknown")
            type_counts[content_type] = type_counts.get(content_type, 0) + 1
            
            if content_type == "database_table":
                table_name = meta.get("table_name", "unknown")
                table_counts[table_name] = table_counts.get(table_name, 0) + 1
        
        stats = {
            "total_entries": len(self.metadata),
            "content_types": type_counts,
            "database_tables": table_counts,
            "embedding_dimensions": self.embeddings_matrix.shape[1] if len(self.embeddings_matrix) > 0 else 0
        }
        
        return stats

print("✅ Embedding RAG System created")


✅ Embedding RAG System created


In [36]:
# 🎯 COMPLETE RAG SYSTEM DEMO
# This demonstrates the full pipeline: query parsing → embedding → search → results

def demo_rag_system():
    """Demonstrate the complete RAG system with various queries"""
    
    print("=" * 60)
    print("🚀 COMPLETE RAG SYSTEM DEMO")
    print("=" * 60)
    
    # Initialize the RAG system
    try:
        rag_system = EmbeddingRAGSystem(
            embeddings_path="../data/knowledge_graph_embeddings.pkl",
            embedding_model_name='all-MiniLM-L6-v2'
        )
        
        # Show content statistics
        print("\n📊 CONTENT STATISTICS:")
        stats = rag_system.get_content_statistics()
        for key, value in stats.items():
            if isinstance(value, dict):
                print(f"  {key}:")
                for sub_key, sub_value in value.items():
                    print(f"    - {sub_key}: {sub_value}")
            else:
                print(f"  {key}: {value}")
        
        # Test queries
        test_queries = [
            "Find information about mountain bikes",
            "Show me black bicycle components", 
            "What are the specifications for bike frames?",
            "Find products with size 48 or similar",
            "Show me technical manuals for bikes"
        ]
        
        print("\n" + "=" * 60)
        print("🔍 TESTING DIFFERENT QUERIES")
        print("=" * 60)
        
        for i, query in enumerate(test_queries, 1):
            print(f"\n🎯 TEST {i}: '{query}'")
            print("-" * 50)
            
            # Process the query
            results = rag_system.process_query(query, top_k=3, similarity_threshold=0.2)
            
            # Show parsed query
            print("📝 PARSED QUERY:")
            parsed = results["parsed_query"]
            for entity_type, entity_data in parsed.items():
                if any(entity_data.values() if isinstance(entity_data, dict) else [entity_data]):
                    print(f"  {entity_type}: {entity_data}")
            
            # Show search results
            print(f"\n🔍 SEARCH RESULTS ({len(results['results'])} found):")
            for j, result in enumerate(results["results"], 1):
                score = result["similarity_score"]
                content = result["content"][:150] + "..." if len(result["content"]) > 150 else result["content"]
                metadata = result["metadata"]
                
                print(f"  {j}. Score: {score:.3f}")
                print(f"     Type: {metadata.get('type', 'unknown')}")
                if metadata.get('table_name'):
                    print(f"     Table: {metadata['table_name']}")
                print(f"     Content: {content}")
                print()
            
            print(f"📋 SUMMARY: {results['summary']}")
            print("\n" + "="*30 + "\n")
        
        # Demonstrate category filtering
        print("🎯 CATEGORY-FILTERED SEARCH DEMO:")
        print("-" * 40)
        
        category_results = rag_system.search_by_category(
            "bike components", 
            category_filter="database_table", 
            top_k=3
        )
        
        print(f"Results for 'bike components' in database tables:")
        for result in category_results["results"]:
            print(f"  - {result['metadata'].get('table_name', 'unknown')}: {result['similarity_score']:.3f}")
        
        print(f"\n✅ RAG SYSTEM DEMO COMPLETED SUCCESSFULLY!")
        
    except Exception as e:
        print(f"❌ Error in RAG demo: {e}")
        print("Make sure you have:")
        print("1. Generated embeddings (run the embedding generation cell)")
        print("2. Local LLM running on localhost:11434")
        print("3. All required libraries installed")

# Set to True to run the demo
run_rag_demo = True

if run_rag_demo:
    demo_rag_system()
else:
    print("🎯 RAG SYSTEM DEMO READY")
    print("Set run_rag_demo = True to see the complete RAG system in action!")
    print()
    print("This demo will show:")
    print("1. 🧠 Intelligent query parsing (extracts entities from natural language)")
    print("2. 🔍 Embedding-based semantic search")
    print("3. 📊 Content statistics and result analysis")
    print("4. 🎯 Category-filtered search")
    print("5. 📝 Detailed results with similarity scores")
    print()
    print("Example queries that will work:")
    print("• 'Find mountain bikes with black color'")
    print("• 'Show me technical manuals'") 
    print("• 'What bike components are available?'")
    print("• 'Find products similar to size 48 frames'")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


🚀 COMPLETE RAG SYSTEM DEMO
🚀 Initializing Embedding RAG System...
Loading embedding model: all-MiniLM-L6-v2
📂 Loading embeddings from ../data/knowledge_graph_embeddings.pkl
✅ Loaded 579 embeddings
📊 Loaded 579 embeddings
🔢 Embedding dimensions: 384
✅ RAG System ready!

📊 CONTENT STATISTICS:
  total_entries: 579
  content_types:
    - database_table: 573
    - json_table: 6
  database_tables:
    - unknown: 573
  embedding_dimensions: 384

🔍 TESTING DIFFERENT QUERIES

🎯 TEST 1: 'Find information about mountain bikes'
--------------------------------------------------
🔍 Processing query: 'Find information about mountain bikes'
📝 Step 1: Parsing query...


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 404 Not Found"


⚠️ Error parsing query: Error code: 404 - {'error': {'message': 'model "gemma2:2b" not found, try pulling it first', 'type': 'api_error', 'param': None, 'code': None}}
🎯 Search text: 'Find information about mountain bikes'
🧠 Step 2: Generating query embedding...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 404 Not Found"


🔍 Step 3: Searching for similar content...
✅ Found 3 relevant results
📝 PARSED QUERY:

🔍 SEARCH RESULTS (3 found):
  1. Score: 0.567
     Type: json_table
     Content: Document: Mountain Bike Manual. Contains hierarchical information. Gear Usage: Low Gear (1-3): Steep climbs. Gear Usage: Mid Gear (4-6): Flat trails. ...

  2. Score: 0.565
     Type: database_table
     Content: Table: ProductDescription. Description: Top-of-the-line competition mountain bike. Performance-enhancing options include the innovative HL Frame, supe...

  3. Score: 0.547
     Type: database_table
     Content: Table: ProductCategory. Name: Mountain Bikes. ModifiedDate: 2002-06-01 00:00:00.000. ProductCategoryID: 5. ParentProductCategoryID: 1.0

📋 SUMMARY: Found 3 relevant results. Average similarity: 0.560. Types: 1 json_table, 2 database_table. Categories: unknown



🎯 TEST 2: 'Show me black bicycle components'
--------------------------------------------------
🔍 Processing query: 'Show me black bicycle com

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 404 Not Found"


🔍 Step 3: Searching for similar content...
✅ Found 3 relevant results
📝 PARSED QUERY:

🔍 SEARCH RESULTS (3 found):
  1. Score: 0.524
     Type: database_table
     Content: Table: ProductDescription. Description: Clipless pedals - aluminum.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescriptionID: 850

  2. Score: 0.497
     Type: database_table
     Content: Table: ProductDescription. Description: Value-priced bike with many features of our top-of-the-line models. Has the same light, stiff frame, and the q...

  3. Score: 0.496
     Type: database_table
     Content: Table: ProductDescription. Description: Same technology as all of our Road series bikes.  Perfect all-around bike for road or racing.. ModifiedDate: 2...

📋 SUMMARY: Found 3 relevant results. Average similarity: 0.505. Types: 3 database_table. Categories: unknown



🎯 TEST 3: 'What are the specifications for bike frames?'
--------------------------------------------------
🔍 Processing query: 'What are the specificatio

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 404 Not Found"


🔍 Step 3: Searching for similar content...
✅ Found 3 relevant results
📝 PARSED QUERY:

🔍 SEARCH RESULTS (3 found):
  1. Score: 0.614
     Type: database_table
     Content: Table: ProductDescription. Description: Same technology as all of our Road series bikes, but the frame is sized for a woman.  Perfect all-around bike ...

  2. Score: 0.561
     Type: database_table
     Content: Table: ProductDescription. Description: Lightweight butted aluminum frame provides a more upright riding position for a trip around town.  Our ground-...

  3. Score: 0.533
     Type: database_table
     Content: Table: ProductDescription. Description: Value-priced bike with many features of our top-of-the-line models. Has the same light, stiff frame, and the q...

📋 SUMMARY: Found 3 relevant results. Average similarity: 0.569. Types: 3 database_table. Categories: unknown



🎯 TEST 4: 'Find products with size 48 or similar'
--------------------------------------------------
🔍 Processing query: 'Find product

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 404 Not Found"


🔍 Step 3: Searching for similar content...
✅ Found 3 relevant results
📝 PARSED QUERY:

🔍 SEARCH RESULTS (3 found):
  1. Score: 0.440
     Type: database_table
     Content: Table: Product. Name: HL Mountain Frame - Silver, 48. ProductNumber: FR-M94S-52. Color: Silver. Size: 48. SellStartDate: 2005-07-01 00:00:00.000. Sell...

  2. Score: 0.440
     Type: database_table
     Content: Table: Product. Name: LL Road Frame - Red, 48. ProductNumber: FR-R38R-48. Color: Red. Size: 48. SellStartDate: 2005-07-01 00:00:00.000. SellEndDate: 2...

  3. Score: 0.438
     Type: database_table
     Content: Table: Product. Name: LL Road Frame - Red, 52. ProductNumber: FR-R38R-52. Color: Red. Size: 52. SellStartDate: 2005-07-01 00:00:00.000. SellEndDate: 2...

📋 SUMMARY: Found 3 relevant results. Average similarity: 0.439. Types: 3 database_table. Categories: unknown



🎯 TEST 5: 'Show me technical manuals for bikes'
--------------------------------------------------
🔍 Processing query: 'Show me techni

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 404 Not Found"


🔍 Step 3: Searching for similar content...
✅ Found 3 relevant results
📝 PARSED QUERY:

🔍 SEARCH RESULTS (3 found):
  1. Score: 0.550
     Type: database_table
     Content: Table: ProductDescription. Description: Same technology as all of our Road series bikes.  Perfect all-around bike for road or racing.. ModifiedDate: 2...

  2. Score: 0.488
     Type: json_table
     Content: Document: Mountain Bike Manual. Contains hierarchical information. Gear Usage: Low Gear (1-3): Steep climbs. Gear Usage: Mid Gear (4-6): Flat trails. ...

  3. Score: 0.486
     Type: database_table
     Content: Table: ProductDescription. Description: This bike is ridden by race winners. Developed with the Adventure Works Cycles professional race team, it has ...

📋 SUMMARY: Found 3 relevant results. Average similarity: 0.508. Types: 2 database_table, 1 json_table. Categories: unknown


🎯 CATEGORY-FILTERED SEARCH DEMO:
----------------------------------------
🔍 Processing query: 'bike components'
📝 Step 1: Par

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Step 3: Searching for similar content...
✅ Found 50 relevant results
Results for 'bike components' in database tables:
  - unknown: 0.526
  - unknown: 0.525
  - unknown: 0.503

✅ RAG SYSTEM DEMO COMPLETED SUCCESSFULLY!


In [32]:
# 🎯 RUN SEMANTIC SEARCH DEMO
# This demonstrates how to use the embeddings for intelligent search

# Set to True to run the demo
run_demo = True

if run_demo:
    demo_semantic_search()
else:
    print("🎯 SEMANTIC SEARCH DEMO READY")
    print("Set run_demo = True to see the search engine in action!")
    print()
    print("What this demo will show:")
    print("1. 📊 Statistics about your embedded knowledge base")
    print("2. 🔍 Semantic search examples (find items by natural language)")
    print("3. 🔗 Similarity search (find items similar to a specific one)")
    print("4. 🎯 Filtered search (e.g., only database tables or only JSON files)")
    print()
    print("Example queries that will work:")
    print("• 'mountain bike frame' - finds bike-related products")
    print("• 'red helmet' - finds red-colored safety equipment")
    print("• 'vintage bicycle parts' - finds historical/vintage items")
    print("• 'technical specifications' - finds detailed product info")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading embedding model: all-MiniLM-L6-v2
📂 Loading embeddings from ../data/knowledge_graph_embeddings.pkl
✅ Loaded 579 embeddings
=== SEMANTIC SEARCH DEMO ===

📊 Database contains 579 items:
   database_table: 573
   json_table: 6

🧠 Embedding dimension: 384

🔍 QUERY: 'mountain bike frame'
🔍 Searching for: 'mountain bike frame'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🏆 Rank 1 (similarity: 0.607)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Lightweight butted aluminum frame provides a more upright riding position for a trip around town.  Our ground-breaking design provides optimum comfort.. Modifie...

🏆 Rank 2 (similarity: 0.595)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Top-of-the-line competition mountain bike. Performance-enhancing options include the innovative HL Frame, super-smooth front suspension, and traction for all te...

🏆 Rank 3 (similarity: 0.550)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Each frame is hand-crafted in our Bothell facility to the optimum diameter and wall-thickness required of a premium mountain frame. The heat-treated welded alum...

🔍 QUERY: 'red helmet'
🔍 Searching for: 'red helmet'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🏆 Rank 1 (similarity: 0.546)
   Type: database_table
   Source: Product
   Text: Table: Product. Name: Sport-100 Helmet, Red. ProductNumber: HL-U509-R. Color: Red. SellStartDate: 2005-07-01 00:00:00.000. ThumbNailPhoto: 0x47494638396150003100F70000000000800000008000808000000080800...

🏆 Rank 2 (similarity: 0.516)
   Type: database_table
   Source: Product
   Text: Table: Product. Name: Sport-100 Helmet, Black. ProductNumber: HL-U509. Color: Black. SellStartDate: 2005-07-01 00:00:00.000. ThumbNailPhoto: 0x47494638396150003100F700000000008000000080008080000000808...

🏆 Rank 3 (similarity: 0.509)
   Type: database_table
   Source: Product
   Text: Table: Product. Name: Sport-100 Helmet, Blue. ProductNumber: HL-U509-B. Color: Blue. SellStartDate: 2005-07-01 00:00:00.000. ThumbNailPhoto: 0x47494638396150003100F700000000008000000080008080000000808...

🔍 QUERY: 'road cycling equipment'
🔍 Searching for: 'road cycling equipment'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🏆 Rank 1 (similarity: 0.583)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Same technology as all of our Road series bikes.  Perfect all-around bike for road or racing.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescriptionID: 321...

🏆 Rank 2 (similarity: 0.541)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Suitable for any type of riding, on or off-road. Fits any budget. Smooth-shifting with a comfortable ride.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescri...

🏆 Rank 3 (similarity: 0.509)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Travel in style and comfort. Designed for maximum comfort and safety. Wide gear range takes on all hills. High-tech aluminum alloy construction provides durabil...

🔍 QUERY: 'vintage bicycle parts'
🔍 Searching for: 'vintage bicycle parts'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🏆 Rank 1 (similarity: 0.484)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Clipless pedals - aluminum.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescriptionID: 850...

🏆 Rank 2 (similarity: 0.458)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Value-priced bike with many features of our top-of-the-line models. Has the same light, stiff frame, and the quick acceleration we're famous for.. ModifiedDate:...

🏆 Rank 3 (similarity: 0.457)
   Type: database_table
   Source: ProductDescription
   Text: Table: ProductDescription. Description: Replacement rear wheel for entry-level cyclist.. ModifiedDate: 2007-06-01 00:00:00.000. ProductDescriptionID: 870...

🔗 FINDING SIMILAR ITEMS
🔍 Finding items similar to: Table: Product. Name: HL Road Frame - Black, 58. ProductNumber: FR-R92B-58. Color: Black. Size: 58. ...

🔗 Similarity: 0.989
   Type: database_table
   Text: Table: Produ