In [1]:
import pandas as pd
import json
import ast

from typing import Dict, List, Any
from tqdm import tqdm
from langchain_openai import AzureOpenAIEmbeddings, AzureOpenAI
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "pillsgap")

In [2]:
class Neo4jImporter:
    def __init__(
        self,
        uri: str = "bolt://localhost:7687",
        user: str = "neo4j",
        database: str = "neo4j",
        password: str = "pillsgap",
    ):
        self.driver = GraphDatabase.driver(
            uri, auth=(user, password), database=database
        )
        self.model = SentenceTransformer("nomic-ai/modernbert-embed-base")

    def setup_constraints(self):
        with self.driver.session() as session:
            session.run(
                "CREATE CONSTRAINT IF NOT EXISTS FOR (p:Product) REQUIRE p.title IS UNIQUE"
            )
            session.run(
                "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Category) REQUIRE c.name IS UNIQUE"
            )
            session.run(
                "CREATE CONSTRAINT IF NOT EXISTS FOR (s:Store) REQUIRE s.name IS UNIQUE"
            )

    def create_product(
        self, tx, row: Dict[str, Any], embeddings: Dict[str, List[float]]
    ):
        query = """
        MERGE (p:Product {title: $title})
        SET p.description = $description,
            p.average_rating = toFloat($average_rating),
            p.rating_number = toInteger($rating_number),
            p.price = toFloat($price),
            p.features = $features,
            p.details = $details,
            p.image = $image,
            p.title_embedding = $title_embedding,
            p.description_embedding = $desc_embedding
        WITH p
        MERGE (s:Store {name: $store})
        MERGE (p)-[:SOLD_BY]->(s)
        WITH p
        MERGE (c:Category {name: $main_category})
        MERGE (p)-[:IS_OF_CATEGORY]->(c)
        """

        features = (
            json.dumps(row["features"])
            if isinstance(row["features"], (dict, list))
            else str(row["features"])
        )
        details = (
            json.dumps(row["details"])
            if isinstance(row["details"], (dict, list))
            else str(row["details"])
        )

        tx.run(
            query,
            title=row["title"],
            description=row["description"],
            average_rating=row["average_rating"],
            rating_number=row["rating_number"],
            price=row["price"],
            features=features,
            details=details,
            image=row["image"],
            store=row["store"],
            main_category=row["main_category"],
            title_embedding=embeddings["title"],
            desc_embedding=embeddings["description"],
        )

    def import_data(self, data_path: str, batch_size: int = 100):
        df = pd.read_parquet(data_path)
        self.setup_constraints()

        for i in tqdm(range(0, len(df), batch_size)):
            batch = df.iloc[i : i + batch_size]
            with self.driver.session() as session:
                for _, row in batch.iterrows():
                    embeddings = {
                        "title": self.model.encode(row["title"]).tolist(),
                        "description": self.model.encode(row["description"]).tolist(),
                    }
                    session.execute_write(self.create_product, row, embeddings)

        self.driver.close()

In [3]:
importer = Neo4jImporter(database="products")

In [None]:
importer.import_data("./dataset/amazon_products_2023.parquet", batch_size=50)

## Sim Search

In [12]:
class Neo4jSearch:
    driver = None
    model = None

    @classmethod
    def initialize(
        cls,
        uri: str = "bolt://localhost:7687",
        user: str = "neo4j",
        database: str = "neo4j",
        password: str = "pillsgap",
    ):
        cls.driver = GraphDatabase.driver(uri, auth=(user, password), database=database)
        cls.model = SentenceTransformer("jinaai/jina-embeddings-v3")

    @classmethod
    def generate_embeddings(cls, string: str) -> List[float]:
        return cls.model.encode(string).tolist()

    @classmethod
    def run_query(cls, query: str, top_n: int = 5, **params) -> List[Dict[str, Any]]:
        with cls.driver.session() as session:
            results = session.run(query, **params, top_n=top_n)
            products = []
            for record in results:
                product = {
                    "title": record["title"],
                    "description": record["description"],
                    "average_rating": record["average_rating"],
                    "rating_number": record["rating_number"],
                    "price": record["price"],
                    "features": record["features"],
                    "details": record["details"],
                    "image": record["image"],
                    "store": record["store"],
                    "main_category": record["main_category"],
                    "similarity": record["avg_sim"],
                }
                products.append(product)

        return products

    @classmethod
    def search_products_with_query(cls, query: str, top_n: int = 5):
        query_embeddings = cls.generate_embeddings(query)

        query = """
        MATCH (p:Product)-[:SOLD_BY]->(s:Store), (p)-[:IS_OF_CATEGORY]->(c:Category)
        WITH p, s, c, gds.similarity.cosine(p.title_embedding, $query_embeddings) AS title_sim, gds.similarity.cosine(p.description_embedding, $query_embeddings) AS desc_sim
        RETURN p.title AS title, p.description AS description, p.average_rating AS average_rating, p.rating_number AS rating_number, p.price AS price, p.features AS features, p.details AS details, p.image AS image, s.name AS store, c.name AS main_category, (title_sim + desc_sim) / 2 AS avg_sim
        ORDER BY avg_sim DESC
        LIMIT $top_n
        """

        return cls.run_query(
            query,
            query_embeddings=query_embeddings,
            top_n=top_n,
        )

    @classmethod
    def search_products_with_category(
        cls, title: str, description: str, main_category: str, top_n: int = 5
    ):
        title_embedding = cls.generate_embeddings(title)
        description_embedding = cls.generate_embeddings(description)

        query = """
        MATCH (p:Product)-[:SOLD_BY]->(s:Store), (p)-[:IS_OF_CATEGORY]->(c:Category)
        WHERE c.name = $main_category
        WITH p, s, c, gds.similarity.cosine(p.title_embedding, $title_embedding) AS title_sim, gds.similarity.cosine(p.description_embedding, $description_embedding) AS desc_sim
        RETURN p.title AS title, p.description AS description, p.average_rating AS average_rating, p.rating_number AS rating_number, p.price AS price, p.features AS features, p.details AS details, p.image AS image, s.name AS store, c.name AS main_category, (title_sim + desc_sim) / 2 AS avg_sim
        ORDER BY avg_sim DESC
        LIMIT $top_n
        """

        return cls.run_query(
            query,
            title_embedding=title_embedding,
            description_embedding=description_embedding,
            main_category=main_category,
            top_n=top_n,
        )

    @classmethod
    def search_products_with_store(
        cls, title: str, description: str, store: str, top_n: int = 5
    ):
        title_embedding = cls.generate_embeddings(title)
        description_embedding = cls.generate_embeddings(description)

        query = """
        MATCH (p:Product)-[:SOLD_BY]->(s:Store), (p)-[:IS_OF_CATEGORY]->(c:Category)
        WHERE s.name = $store
        WITH p, s, c, gds.similarity.cosine(p.title_embedding, $title_embedding) AS title_sim, gds.similarity.cosine(p.description_embedding, $description_embedding) AS desc_sim
        RETURN p.title AS title, p.description AS description, p.average_rating AS average_rating, p.rating_number AS rating_number, p.price AS price, p.features AS features, p.details AS details, p.image AS image, s.name AS store, c.name AS main_category, (title_sim + desc_sim) / 2 AS avg_sim
        ORDER BY avg_sim DESC
        LIMIT $top_n
        """

        return cls.run_query(
            query,
            title_embedding=title_embedding,
            description_embedding=description_embedding,
            store=store,
            top_n=top_n,
        )

In [14]:
Neo4jSearch.initialize(database="products")

In [19]:
results = Neo4jSearch.search_products_with_query("shampoo", top_n=10)
for result in results:
    print(result)

{'title': 'Hose', 'description': 'hose', 'average_rating': 1.2, 'rating_number': 7, 'price': 34.291356026785714, 'features': "['Package include:' 'High-Quality Material' '10 Functions' 'Wide usage:']", 'details': "{'Package Dimensions': '12.52 x 8.74 x 4.76 inches; 3.97 Pounds', 'Date First Available': 'February 8, 2023', 'Manufacturer': 'Lnxusoqs'}", 'image': 'https://m.media-amazon.com/images/I/01RmK+J4pJL.gif', 'store': 'Lnxusoqs', 'main_category': 'Industrial & Scientific', 'similarity': 0.646843355595473}
{'title': 'Amazon Basics Tear-Free Baby Hair and Body Wash, 16.9 Fluid Ounce, 1-Pack (Previously Solimo)', 'description': 'Amazon Basics Tear-Free Baby Hair and Body Wash, 16.9 Fluid Ounce, 1-Pack (Previously Solimo), Pediatrician and Dermatologist Tested. Hypoallergenic. Mild, Tear-free formula. Delicately scented. Gently cleanses baby skin & hair without causing dryness.', 'average_rating': 4.5, 'rating_number': 3003, 'price': 27.336858080393764, 'features': '[\'One 16.9-fluid 