In [31]:
from neo4j import GraphDatabase
import pandas as pd
import os
from dotenv import load_dotenv
from string import Template
import string
import re
from embedding import SickEmbedder

load_dotenv()

True

### Define neo4j database

In [53]:
URI = "bolt://localhost:7687"
AUTH = ("neo4j", os.getenv("DB_PASSWORD"))

def clear_db(tx):
    script="""
    MATCH(n)
    DETACH DELETE n;
    """
    tx.run(script)
    script="""
    CALL apoc.cypher.run('DROP INDEX concept_embedding_index', {}) YIELD value RETURN value;
    """
    tx.run(script)
    script="""
    CALL gds.graph.exists('myGraph') YIELD exists
    WITH exists
    CALL apoc.do.when(
    exists,
    'CALL gds.graph.drop("myGraph") YIELD graphName RETURN graphName',
    'RETURN null AS graphName',
    {}
    ) YIELD value
    RETURN value.graphName;
    """
    tx.run(script)

Clear existing nodes.

In [54]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    with driver.session() as session:
        session.execute_write(clear_db)
    driver.close()

ClientError: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `apoc.cypher.run` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}

Create dataframes and embeddings.

In [34]:
df = pd.read_csv("../data/test.csv")
df = pd.concat([df.columns.to_frame().T, df])
df.columns = range(len(df.columns))
embedder = SickEmbedder(dims=1500)
embedding_df = embedder.embed_df(df.copy())

Create nodes from dataframe:

In [35]:
def create_node(tx, title, paragraph, embedding):
    tx.run("""
        MERGE (n:Concept {title: $title})
        SET n.paragraph = $paragraph
        SET n.embedding = $embedding
    """, title=title, paragraph=paragraph, embedding=embedding)

def create_vindex(tx):
    tx.run("""
    CREATE VECTOR INDEX concept_embedding_index
    FOR (n:Concept) ON (n.embedding)
    OPTIONS { indexConfig: {
    `vector.dimensions`: 1500,
    `vector.similarity_function`: "cosine"
    }}
    """)

In [36]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    with driver.session() as session:
        for i in range(df.shape[0]):
            session.execute_write(create_node, df.iloc[i, 0], df.iloc[i, 1], embedding_df.iloc[i, 1])
        session.execute_write(create_vindex)
    driver.close()

Create relationships based on similarities.

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [38]:
def create_relation_df(embedding_df, threshold: float = 0.5, top_k: int = 3):

    embeddings = np.stack(embedding_df.iloc[:, 1])
    similarity_matrix = cosine_similarity(embeddings)

    edges = []

    for i in range(len(embedding_df)):
        sims = similarity_matrix[i]
        top_indices = sims.argsort()[-(top_k+1):-1][::-1]

        for j in top_indices:
            if sims[j] >= threshold:
                edges.append((df.iloc[i, 0], df.iloc[j, 0], sims[j]))

    edges_df = pd.DataFrame(edges, columns=["source", "target", "score"])

    return edges_df

In [39]:
def create_edges(tx, source, target, score):
    tx.run("""
    MATCH (a:Concept {title: $source}), (b:Concept {title: $target})
    MERGE (a)-[:SIMILAR_TO {score: $score}]->(b)
    """, source=source, target=target, score=score)

In [40]:
print(create_relation_df(embedding_df).head())

                            source                           target     score
0    Chapter One: Human activities  The concept of human activities  0.628288
1    Chapter One: Human activities        Types of human activities  0.622383
2    Chapter One: Human activities   Importance of human activities  0.561784
3  The concept of human activities    Chapter One: Human activities  0.628288
4  The concept of human activities              Primary activities:  0.627811


In [41]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    with driver.session() as session:
        for _, row in create_relation_df(embedding_df).iterrows():
            session.execute_write(
                create_edges,
                row['source'], row['target'], float(row['score'])
            )
    driver.close()

In [42]:
def create_catalog(tx):
    tx.run("""
    MATCH (source:Concept)-[r:SIMILAR_TO]->(target:Concept)
    RETURN gds.graph.project(
    'myGraph',
    source,
    target,
    {
        relationshipProperties: r { .score }
    },
    { undirectedRelationshipTypes: ['*'] }
    )
    """)

In [43]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    with driver.session() as session:
        for _, row in create_relation_df(embedding_df).iterrows():
            session.execute_write(
                create_catalog
            )
    driver.close()

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke function `gds.graph.project`: Caused by: java.lang.IllegalArgumentException: Graph myGraph already exists}

In [45]:
def leiden_grouping(tx):
    tx.run("""
    CALL gds.leiden.stream('myGraph', { randomSeed: 19 })
    YIELD nodeId, communityId
    RETURN gds.util.asNode(nodeId).title AS title, communityId
    ORDER BY title ASC
    """)

In [46]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    with driver.session() as session:
        for _, row in create_relation_df(embedding_df).iterrows():
            session.execute_write(
                leiden_grouping
            )
    driver.close()