In [2]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the Neo4j connection settings from the environment
load_dotenv(override=True)
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE = (
    os.environ.get(setting)
    for setting in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_DATABASE")
)

In [8]:
# Open a LangChain handle to the Neo4j knowledge graph
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

### Create a vector index

In [9]:
# Create (only if it does not exist yet) a vector index named
# movie_tagline_embeddings over the taglineEmbedding property of Movie nodes:
# 384-dimensional vectors compared with cosine similarity.
kg.query("""
  CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
  FOR (m:Movie) ON (m.taglineEmbedding) 
  OPTIONS { indexConfig: {
    `vector.dimensions`: 384,
    `vector.similarity_function`: 'cosine'
  }}"""
)

[]

In [10]:
# List the vector indexes to check that movie_tagline_embeddings is present
kg.query(
"""
SHOW VECTOR INDEXES
"""
)

[{'id': 3,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-3.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

### Populate the vector index

In [11]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings

# Sentence-transformers model used to embed both the taglines and the question
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Embedding client; the API token is read from the environment
hf_embeddings = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_KEY"),
    task="feature-extraction",
    model=model_name,
)

In [None]:
# Reference only (not run): with OpenAI the embeddings can be computed inside Neo4j

# kg.query("""
#     MATCH (movie:Movie) WHERE movie.tagline IS NOT NULL
#     WITH movie, genai.vector.encode(
#         movie.tagline, 
#         "OpenAI", 
#         {
#           token: $openAiApiKey,
#           endpoint: $openAiEndpoint
#         }) AS vector
#     CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
#     """, 
#     params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

In [None]:
# Use an open-source model here: taglines are embedded client-side with the
# Hugging Face embeddings client and the vectors are written back to Neo4j.

# Get all movies with taglines. elementId() is used as the node key because
# the id() function is deprecated in Neo4j 5.
movies = kg.query("""
    MATCH (movie:Movie)
    WHERE movie.tagline IS NOT NULL
    RETURN movie.tagline AS tagline, elementId(movie) AS id
""")

# Generate embeddings for all taglines (skip the remote call when there are none)
taglines = [m['tagline'] for m in movies]
embeddings = hf_embeddings.embed_documents(taglines) if taglines else []

# zip() would silently drop movies if fewer vectors than taglines came back
if len(embeddings) != len(movies):
    raise ValueError(
        f"Expected {len(movies)} embeddings but received {len(embeddings)}"
    )

# Store all embeddings back in Neo4j with one batched query instead of one
# round trip per movie
if movies:
    kg.query("""
        UNWIND $rows AS row
        MATCH (movie:Movie)
        WHERE elementId(movie) = row.id
        CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", row.embedding)
        """,
        params={
            "rows": [
                {"id": m['id'], "embedding": vector}
                for m, vector in zip(movies, embeddings)
            ]
        }
    )



In [16]:
# Sanity check: number of embedding vectors generated for the taglines
len(embeddings)

37

In [18]:
# Read back a single movie that has a tagline, together with the value of its
# taglineEmbedding property
result = kg.query("""
    MATCH (m:Movie) 
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
)

In [19]:
# Tagline of the movie returned by the read-back query
result[0]['m.tagline']

'Welcome to the Real World'

In [20]:
# Embedding vector stored on that same movie node
result[0]['m.taglineEmbedding']

[0.004916847217828035,
 -0.0370774008333683,
 -0.05302993580698967,
 0.08803059160709381,
 0.002610360039398074,
 -0.04897091165184975,
 0.010284018702805042,
 -0.020684707909822464,
 0.04854820668697357,
 0.03980860859155655,
 0.14016038179397583,
 0.009561889804899693,
 -0.011948537081480026,
 0.02574753202497959,
 -0.100343719124794,
 -0.06849749386310577,
 0.06089349091053009,
 -0.09545785933732986,
 -0.12708042562007904,
 -0.008842960931360722,
 0.0013119467766955495,
 -0.025083083659410477,
 -0.03771349415183067,
 0.06817325204610825,
 -0.006272147875279188,
 0.019774923101067543,
 -0.006198229733854532,
 0.02108805812895298,
 -0.03142925351858139,
 -0.045716773718595505,
 -0.022434985265135765,
 0.06367078423500061,
 0.0712144747376442,
 -0.03672400861978531,
 -0.0282735712826252,
 -0.009561496786773205,
 0.03462420031428337,
 0.004063514061272144,
 0.0007250744383782148,
 -0.03857341781258583,
 0.02384009212255478,
 -0.07178183645009995,
 -0.01787034049630165,
 -0.0052545042708

In [21]:
# Length of the stored vector; the index was created with vector.dimensions = 384
len(result[0]['m.taglineEmbedding'])

384

### Semantic Similarity

In [22]:
# Natural-language question to match against the movie taglines
question = "What movies are about love?"

In [None]:
# Reference only (not run): OpenAI variant that embeds the question inside Neo4j

# kg.query("""
#     WITH genai.vector.encode(
#         $question, 
#         "OpenAI", 
#         {
#           token: $openAiApiKey,
#           endpoint: $openAiEndpoint
#         }) AS question_embedding
#     CALL db.index.vector.queryNodes(
#         'movie_tagline_embeddings', 
#         $top_k, 
#         question_embedding
#         ) YIELD node AS movie, score
#     RETURN movie.title, movie.tagline, score
#     """, 
#     params={"openAiApiKey":OPENAI_API_KEY,
#             "openAiEndpoint": OPENAI_ENDPOINT,
#             "question": question,
#             "top_k": 5
#             })

In [24]:
# Embed the question with the same Hugging Face client used for the taglines
question_embedding = hf_embeddings.embed_query(question)

# Query the tagline vector index for the 5 nearest movies to the question
results = kg.query("""
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddings', 
        $top_k, 
        $question_embedding
    ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """,
    params=dict(top_k=5, question_embedding=question_embedding),
)

In [25]:
# Display the matched movies with their taglines and similarity scores
results

[{'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.7580915689468384},
 {'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.7532861232757568},
 {'movie.title': 'Snow Falling on Cedars',
  'movie.tagline': 'First loves last. Forever.',
  'score': 0.7330302000045776},
 {'movie.title': 'When Harry Met Sally',
  'movie.tagline': 'Can two friends sleep together and still love each other in the morning?',
  'score': 0.6664345264434814},
 {'movie.title': "You've Got Mail",
  'movie.tagline': 'At odds in life... in love on-line.',
  'score': 0.6634907722473145}]