Set up all required connections and configurations

In [1]:
import os
from neomodel import config, db
from openai import AzureOpenAI


# Configure the connection to the Neo4j database.
# The URL is read from the NEO4J_URI environment variable; os.getenv returns
# None when the variable is unset.
config.DATABASE_URL = os.getenv("NEO4J_URI")

# Check the connection by running a trivial query.
# NOTE(review): a failure is only printed, not re-raised, so the remaining
# statements (and later cells) still execute after a failed check.
try:
    # Attempt to run a simple query
    db.cypher_query("MATCH (n) RETURN count(n)")
    print("Connection successful!")
except Exception as e:
    print(f"Connection failed: {e}")

# Azure OpenAI client used by the embedding helper; key, API version and
# endpoint all come from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPEN_AI_API_KEY"),
    api_version=os.getenv("AZURE_OPEN_AI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPEN_AI_ENDPOINT"),
)

# Value passed as `model` when requesting embeddings.
embedding_model = os.getenv("AZURE_OPEN_AI_EMBEDDING_MODEL")

Connection successful!


Create all required classes

In [2]:
from neomodel import ArrayProperty, FloatProperty, StructuredNode, StringProperty, RelationshipTo, StructuredRel
from pydantic import BaseModel


# Define relationship models
class ReferencesRel(StructuredRel):
    """Relationship model attached to ``REFERENCES`` edges between ``Interface`` nodes."""

    # Both properties are plain optional strings (no `required=True`).
    name = StringProperty()
    description = StringProperty()


# Define node models
class Interface(StructuredNode):
    """Graph node describing a single interface, including its embedding vector."""

    # Interface name, declared with a unique index.
    # NOTE(review): neomodel presumably only enforces this in the database once
    # the labels/constraints have been installed — confirm that happens somewhere.
    name = StringProperty(unique_index=True)
    summary = StringProperty()
    namespace = StringProperty()
    assembly = StringProperty()
    # Outgoing REFERENCES relationships to other Interface nodes; the edges use
    # ReferencesRel as their relationship model.
    references = RelationshipTo("Interface", "REFERENCES", model=ReferencesRel)
    # Embedding stored as an array of floats; this is the property the vector
    # index is created on.
    embedding = ArrayProperty(FloatProperty())


# define pydantic models
class Metadata(BaseModel):
    """Validated metadata for one interface entry of the scraped JSON data."""

    # Name of the interface.
    name: str
    summary: str
    namespace: str
    assembly: str
    # Names of the types this interface references.
    type_references: list[str]

# Main logic: Create the graph

The graph is created by using the scraped interfaces. During the scraping process, all relations between the types are known and stored as ``type_references``. This information is used to create the graph.

In order to *access* the graph using natural language, a [vector index](https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/) is created. The index is built on the ``embedding`` property of the graph nodes, which is computed for each node during the graph creation process.

In [3]:
import json
from typing import Any, Dict, List


def extract_metadata(key: str, value: Dict[str, str]) -> Metadata:
    """
    Build the ``Metadata`` record for one interface entry.

    ``key`` becomes the record's name. The text fields fall back to an empty
    string and ``type_references`` falls back to an empty list when the entry
    does not contain them.
    """
    text_fields = {
        field: value.get(field, "") for field in ("summary", "namespace", "assembly")
    }
    return Metadata(
        name=key,
        type_references=value.get("type_references", []),
        **text_fields,
    )


def create_embedding(content: str) -> List[float] | None:
    """
    Request an embedding for ``content`` from the configured embeddings client.

    Returns the embedding of the first item in the response, or ``None`` when
    the response contains no items.
    """
    response = client.embeddings.create(input=content, model=embedding_model)
    items = response.data
    if len(items) == 0:
        return None
    return items[0].embedding


def create_graph_from_interfaces(interfaces: Dict[str, Dict[str, Any]]):
    """
    Create a graph from the interfaces. The graph will contain nodes for each interface and references between them.
    Also, the nodes will contain embeddings built from the interface name and summary.

    The function is safe to re-run: an interface that already has a node with the
    same name is reused as-is (no new embedding is requested and its stored
    properties are not refreshed), and a reference is only connected when it does
    not exist yet.

    Parameters:
    - interfaces: Mapping of interface name to its scraped entry. Each entry may
      contain "summary", "namespace", "assembly" and "type_references".

    Notes:
    - All entries are parsed before anything is written, so invalid input fails
      before the database is touched.
    - Self-references and references to names that are not keys of ``interfaces``
      are skipped.
    - NOTE(review): if the database already holds several nodes with the same
      name, the lookup is expected to raise instead of picking one — confirm.
    """
    # Parse every entry exactly once; both passes below reuse the parsed records.
    parsed = [extract_metadata(key, value) for key, value in interfaces.items()]

    interface_nodes: Dict[str, Interface] = {}

    # Pass 1: make sure a node exists for every interface.
    for metadata in parsed:
        if metadata.name in interface_nodes:
            continue

        # Reuse a node stored by a previous run instead of creating a duplicate
        # (or failing on the unique constraint) and paying for a new embedding.
        interface_node = Interface.nodes.get_or_none(name=metadata.name)
        if interface_node is None:
            embedding = create_embedding(f"{metadata.name}: {metadata.summary}")
            interface_node = Interface(
                name=metadata.name,
                summary=metadata.summary,
                namespace=metadata.namespace,
                assembly=metadata.assembly,
                embedding=embedding,
            ).save()

        interface_nodes[metadata.name] = interface_node

    # Pass 2: connect references. This runs after all nodes exist so that a
    # reference to an interface appearing later in the input still resolves.
    for metadata in parsed:
        source_node = interface_nodes[metadata.name]
        for ref_name in metadata.type_references:
            if ref_name == metadata.name:
                continue  # Skip self-references
            target_node = interface_nodes.get(ref_name)
            if target_node is None:
                continue  # Referenced type is not one of the known interfaces
            if not source_node.references.is_connected(target_node):
                source_node.references.connect(target_node)


# Load all interfaces from the single JSON file.
# The file is expected to hold one JSON object that maps each interface name to
# its entry; the path is relative to the notebook's working directory.
with open("scraped_domain_knowledge.json", "r", encoding="utf8") as f:
    interfaces: Dict[str, Any] = json.load(f)

# Build the nodes (including embeddings) and the references between them.
create_graph_from_interfaces(interfaces)

Create the vector index

In [4]:
from neomodel import db

# Cosine-similarity vector index over the `embedding` property of Interface nodes.
# `IF NOT EXISTS` makes this cell safe to re-run: without it the statement fails
# once the index has been created.
# NOTE(review): 3072 presumably matches the output size of the configured
# embedding model — confirm if the model changes.
query = """
CREATE VECTOR INDEX interface_embeddings IF NOT EXISTS  
FOR (n:Interface) ON (n.embedding)  
OPTIONS {indexConfig: { `vector.dimensions`: 3072, `vector.similarity_function`: 'cosine'}}  
"""

# Create vector index
db.cypher_query(query)

([], [])

## Example query

The following query is an example of how to access the graph using natural language. The query is executed using the vector index.

In [11]:
from neomodel import db  
  
def query_similar_interfaces_with_relationships(query_text: str, num_neighbors: int = 10, threshold: float = 0.5):
    """
    Query similar interfaces with relationships to related interfaces.

    The query text is embedded and looked up in the 'interface_embeddings'
    vector index. Every hit is then expanded along its outgoing REFERENCES
    relationships, so the result holds one row per (similar interface, related
    interface) pair. Hits without any outgoing REFERENCES relationship do not
    appear in the result.

    Parameters:
    - query_text (str): The text to generate the query embedding.
    - num_neighbors (int): The number of nearest neighbors requested from the
      vector index (not the number of returned rows).
    - threshold (float): The minimum similarity score required to include a neighbor in the results.

    Returns:
    - List[Dict]: A list of dictionaries containing the name, summary, related name, and score of similar interfaces.

    Raises:
    - ValueError: If no embedding could be created for ``query_text``.
    """
    query_embedding = create_embedding(query_text)
    if query_embedding is None:
        # Fail with a clear message instead of sending a null vector to the index.
        raise ValueError("Could not create an embedding for the given query text.")

    # Prepare the query
    cypher_query = """  
    CALL db.index.vector.queryNodes('interface_embeddings', $num_neighbors, $query_embedding)  
    YIELD node AS similarNode, score  
    MATCH (similarNode)-[r:REFERENCES]->(relatedNode)  
    WHERE score >= $threshold  
    RETURN similarNode.name AS name, similarNode.summary AS summary, relatedNode.name AS related_name, score  
    """

    # Execute the query; the second element of the result (column metadata) is not needed.
    results, _ = db.cypher_query(cypher_query, {
        'num_neighbors': num_neighbors,
        'query_embedding': query_embedding,
        'threshold': threshold
    })

    # Map each row onto the column names, in the order of the RETURN clause.
    columns = ('name', 'summary', 'related_name', 'score')
    return [dict(zip(columns, row)) for row in results]
  
# Example usage
# The query text below is a placeholder; replace it with a natural-language question.
query_text = "Your query text here"  
# The threshold of 0.6 overrides the function's default of 0.5.
results = query_similar_interfaces_with_relationships(query_text, threshold=0.6)  
# Each result is one (similar interface, related interface) pair.
for result in results:  
    print(result)  


{'name': 'ITaxText', 'summary': 'A tax text.', 'related_name': 'ICountry', 'score': 0.6472876071929932}
{'name': 'IWorkshopActivity', 'summary': 'Werkstattdaten - TÃ¤tigkeiten.', 'related_name': 'IWorkshopActivityTireRelation', 'score': 0.6018018126487732}
{'name': 'IWorkshopActivity', 'summary': 'Werkstattdaten - TÃ¤tigkeiten.', 'related_name': 'IWorkshopActivityArticleRelation', 'score': 0.6018018126487732}
{'name': 'IWorkshopActivity', 'summary': 'Werkstattdaten - TÃ¤tigkeiten.', 'related_name': 'IWorkshopActivityTemplate', 'score': 0.6018018126487732}
