# From Graph Basics to Vector Search: A Comprehensive Guide 

Import our usual suspects (and some more...)

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from neo4j import Query, GraphDatabase, RoutingControl, Result
import google.generativeai as genai  # Add this line for Gemini
# Remove or comment out the OpenAI imports
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.output_parsers import StrOutputParser
import matplotlib.pyplot as plt
import seaborn as sns


  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
env_file = '.env'

In [3]:
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)

    # Neo4j
    HOST = os.getenv('NEO4J_URI')
    USERNAME = os.getenv('NEO4J_USERNAME')
    PASSWORD = os.getenv('NEO4J_PASSWORD')
    DATABASE = os.getenv('NEO4J_DATABASE')

    # AI - Updated for Gemini
    GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
    if GOOGLE_API_KEY:
        import google.generativeai as genai
        genai.configure(api_key=GOOGLE_API_KEY)
    MODEL = os.getenv('MODEL', 'gemini-pro')
    EMBEDDINGS_MODEL = os.getenv('EMBEDDING_MODEL', 'models/embedding-001')
else:
    print(f"File {env_file} not found.")

In [4]:
url = "https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/data/expanded_skills.csv"

In [5]:
skills_df = pd.read_csv(url)

In [6]:
skills_df.describe()

Unnamed: 0,email,name,skills
count,100,100,100
unique,100,100,100
top,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
freq,1,1,1


In [7]:
skills_df.head(10)

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
1,lucy.clark@test.org,Lucy Clark,"WordPress, Scrum, Go, SQL, Linux"
2,richard.jackson@test.org,Richard Jackson,"System Design, PyTorch, Express.js, DevOps"
3,amelia.hall@test.org,Amelia Hall,"Agile, CSS3, R, Azure"
4,david.hill@test.org,David Hill,"Java, Scrum, Angular"
5,christopher.johnson@test.org,Christopher Johnson,"Tableau, Flask, API Design"
6,amelia.martin@test.org,Amelia Martin,"CI/CD, Kotlin, HTML5, TensorFlow"
7,daniel.hill@test.org,Daniel Hill,"System Design, Git, Cypher, Pandas, Spring Boot"
8,alice.white@test.org,Alice White,"Spark, Agile, JavaScript"
9,lucy.taylor@test.org,Lucy Taylor,"Flask, Tableau, CI/CD, Rust, System Design"


In [8]:
# Convert skills column from comma separated string to List
skills_df['skills'] = skills_df['skills'].str.split(', ')
skills_df.head()

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"[Security, Pandas, Go]"
1,lucy.clark@test.org,Lucy Clark,"[WordPress, Scrum, Go, SQL, Linux]"
2,richard.jackson@test.org,Richard Jackson,"[System Design, PyTorch, Express.js, DevOps]"
3,amelia.hall@test.org,Amelia Hall,"[Agile, CSS3, R, Azure]"
4,david.hill@test.org,David Hill,"[Java, Scrum, Angular]"


## Create the Graph in Neo4j

In [179]:
driver = GraphDatabase.driver(
    HOST,
    auth=(USERNAME, PASSWORD)
)

In case we want to split large files. 

In [180]:
def split_dataframe(df, chunk_size = 50_000):
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

Test the connection

In [181]:
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

Unnamed: 0,Count
0,154


### Set constraints

We know what we will be loading. Set some constrainst first. Documentation: [Constraints](https://neo4j.com/docs/cypher-manual/current/constraints/managing-constraints/)

Set the constraint on Person Nodes

In [182]:
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.email) IS NODE KEY',
    database_=DATABASE,
    routing_=RoutingControl.WRITE
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x15a397550>, keys=[])

Set the constraint on Skill Nodes

Fetch all constraints

In [183]:
schema_result_df  = driver.execute_query(
    'SHOW CONSTRAINTS',
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
schema_result_df.head()

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,5,constraint_ab43e77d,UNIQUENESS,NODE,[Skill],[name],constraint_ab43e77d,
1,7,constraint_d3bfd313,NODE_KEY,NODE,[Person],[email],constraint_d3bfd313,


### Load (:Person)-[:KNOWS]->(:Skill)

Create a Person and Skills nodes and create a relationship in between. Documentation: [MERGE](https://neo4j.com/docs/cypher-manual/current/clauses/merge/?utm_source=GSearch&utm_medium=PaidSearch&utm_campaign=Evergreen&utm_content=EMEA-Search-SEMCE-DSA-None-SEM-SEM-NonABM&utm_term=&utm_adgroup=DSA&gad_source=1&gclid=Cj0KCQjwm7q-BhDRARIsACD6-fXns_MSgSZ3_jQdYreKu3iOBQQU6bwddlNa4wD12oLr3rxKUlF4MMMaAnj1EALw_wcB)

In [184]:
for chunk in split_dataframe(skills_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MERGE (p:Person{email:row.email})
        SET p.name = row.name
        WITH p, row
        FOREACH(skill IN row.skills | MERGE (s:Skill{name:skill}) MERGE (p)-[:KNOWS]->(s) )
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

## Explore the Graph

Now to the database and observe what is there. 
Example queries: 
- MATCH (n:Person) RETURN n LIMIT 25;
- MATCH (n:Skill) RETURN n LIMIT 25;
- MATCH p=()-[:KNOWS]->() RETURN p LIMIT 25;

We can also run this via the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). Let's do so below

#### What persons are in the database?

In [185]:
persons_df = driver.execute_query(
    """
    MATCH (p:Person)
    RETURN p.name AS person_name
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [186]:
persons_df

Unnamed: 0,person_name
0,Ryan Jones
1,Matthew Miller
2,Andrew King
3,Amelia Davis
4,Emily Phillips
...,...
95,Andrew Martin
96,Harper Wright
97,William Rodriguez
98,Emily Garcia


#### What skills does each person know?

In [187]:
person_skills_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN p.email AS email, p.name AS person_name, collect(s.name) AS skills
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [188]:
person_skills_df

Unnamed: 0,email,person_name,skills
0,amelia.davis@test.org,Amelia Davis,"[Security, PyTorch, Java, HTML5, Docker]"
1,emily.phillips@test.org,Emily Phillips,"[Security, Vue.js, PHP, Kubernetes, Data Visua..."
2,john.garcia@test.org,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]"
3,james.anderson@test.org,James Anderson,"[Security, R, JavaScript, Node.js]"
4,matthew.moore@test.org,Matthew Moore,"[Security, TensorFlow, Spring Boot, Swift]"
...,...,...,...
95,alice.hill@test.org,Alice Hill,"[Spring Boot, Blockchain, Cloud Architecture]"
96,charles.carter@test.org,Charles Carter,"[Spark, JavaScript, Docker]"
97,james.carter@test.org,James Carter,"[TypeScript, Jenkins, Project Management]"
98,emily.garcia@test.org,Emily Garcia,"[TypeScript, Testing, Data Visualization]"


#### What are the most frequent skills?

In [189]:
skill_count_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN s.name, COUNT(DISTINCT p) AS knownByCount ORDER BY knownByCount DESC LIMIT 10
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [190]:
skill_count_df

Unnamed: 0,s.name,knownByCount
0,System Design,14
1,Agile,13
2,Security,13
3,Angular,13
4,Cloud Architecture,11
5,Scrum,11
6,Blockchain,11
7,Docker,10
8,TensorFlow,10
9,ReactJS,10


#### Multihop question

Run the following query in the database: 
- ```MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person) RETURN DISTINCT p;```
- ```MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill) RETURN DISTINCT p;```

In [191]:
person_name_1 = "Lucy Clark"

persons_with_shared_skills_df = driver.execute_query(
    """
    MATCH p=(p1:Person {name: $person_name_1})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)
    RETURN DISTINCT p2.name as person;
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [192]:
persons_with_shared_skills_df

Unnamed: 0,person
0,Natalie Brown
1,Thomas Nelson
2,Natalie Miller
3,Peter Martinez
4,Ryan Nelson
5,Robert Davis
6,John Johnson
7,Ryan Young
8,Mia Nelson
9,David Lopez


In [193]:
person_name_1 = "Lucy Clark"

skills_two_steps_df = driver.execute_query(
    """
    MATCH p=(p1:Person {name: $person_name_1})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill)
    RETURN DISTINCT s2.name as skill;
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [194]:
skills_two_steps_df

Unnamed: 0,skill
0,Pandas
1,CSS3
2,Project Management
3,Kubernetes
4,Security
5,Express.js
6,Azure
7,Testing
8,Machine Learning
9,Docker


## Person Similarity

We can define the similarity of persons based on the number of skills that are overlapping. 

In [195]:
person_name_1 = "Thomas Brown"

similar_skills_df = driver.execute_query(
    """
    MATCH path_1=(p1:Person{name: $person_name_1})-[:KNOWS]->(s1:Skill)
    MATCH path_2=(s1)<-[:KNOWS]-(p2:Person)
    WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count
    WHERE skill_count > 1 AND person_1 <> person_2
    RETURN * ORDER BY skill_count DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [196]:
similar_skills_df

Unnamed: 0,person_1,person_2,skill_count,skill_list
0,Thomas Brown,Amelia Davis,3,"[Security, Java, Docker]"
1,Thomas Brown,James Anderson,2,"[Security, R]"
2,Thomas Brown,Lucy Turner,2,"[Security, Docker]"
3,Thomas Brown,Andrew Martin,2,"[R, Java]"
4,Thomas Brown,Thomas Garcia,2,"[Java, Docker]"


In [197]:
similar_skills_all_df = driver.execute_query(
    """
    MATCH path_1=(p1:Person)-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)
    WHERE p1.name < p2.name
    WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count
    WHERE skill_count >= 1
    RETURN * ORDER BY skill_count DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [198]:
similar_skills_all_df

Unnamed: 0,person_1,person_2,skill_count,skill_list
0,Amelia Davis,John Garcia,3,"[Security, PyTorch, HTML5]"
1,Amelia Davis,Thomas Brown,3,"[Security, Java, Docker]"
2,Charles Jones,John Taylor,3,"[Pandas, CSS3, AWS]"
3,David Rodriguez,Matthew Scott,3,"[Scrum, Azure, Cypher]"
4,Joseph Martin,Kevin Young,3,"[Linux, Agile, ReactJS]"
...,...,...,...,...
1314,Emily Phillips,Lucy Roberts,1,[Data Visualization]
1315,Andrew Anderson,Lucy Roberts,1,[Data Visualization]
1316,Isabella Martin,Lucy Roberts,1,[Data Visualization]
1317,Emily Garcia,Lucy Roberts,1,[Data Visualization]


Load the skill count to the database in a new relationship

In [199]:
for chunk in split_dataframe(similar_skills_all_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MERGE (p1:Person{name:row.person_1})
        MERGE (p2:Person{name:row.person_2})
        MERGE (p1)-[s:SIMILAR_SKILLSET]->(p2)
        SET s.overlap = row.skill_count
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

Take a minute to explore the SIMILAR_SKILLSET network in the database. 

- ```MATCH p=()-[:SIMILAR_SKILLSET]->() RETURN p LIMIT 50```
- ```MATCH p=()-[s:SIMILAR_SKILLSET]->() WHERE s.overlap >= 2 RETURN p LIMIT 50```
- ```MATCH p=()-[s:SIMILAR_SKILLSET]->() WHERE s.overlap >= 3 RETURN p LIMIT 50```

## Semantic Similar skill

Since the communities don't really make sense (due to the randomness of the skills for persons) we can try the similarity based on the semantic meaning. 

In [200]:
skills_df = driver.execute_query(
    """
    MATCH (s:Skill)
    RETURN s.name AS skill
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [201]:
skills_df.head(5)

Unnamed: 0,skill
0,Security
1,Pandas
2,Go
3,WordPress
4,Scrum


### STOP STOP STOP - DO NOT PROCEED (YET)

-- Only to be run by instructor (or if you have your own api key). Skip the following two cells -- 

In [202]:
# skills_df['embedding'] = skills_df['description'].apply( lambda skill: embeddings.embed_documents([skill])[0])
# skills_df.head()

In [203]:
# gds.run_cypher('''
#     unwind $data as row
#     match (s:Skill{name: row.skill})
#     set s.embedding = row.embedding
#     ''',
#     params = { 'data': skills_df.to_dict(orient='records') }
# )

In [204]:
url = 'https://raw.githubusercontent.com/neo4j-product-examples/genai-workshop/refs/heads/main/talent/data/skills_embeddings.csv'

In [205]:
skills_embeddings_df = pd.read_csv(url)

In [206]:
skills_embeddings_df.head()

Unnamed: 0,Skill,Description,Embedding
0,API Design,API Design is the process of creating applicat...,"[0.007902550511062145, -0.006266295909881592, ..."
1,AWS,Amazon Web Services (AWS) is a comprehensive a...,"[-0.0029347320087254047, -0.015877487137913704..."
2,Agile,Agile is a dynamic and flexible project manage...,"[-0.023693757131695747, -0.012672649696469307,..."
3,Angular,"Angular is a powerful, open-source web applica...","[0.013188531622290611, 0.029791485518217087, -..."
4,Azure,Azure is Microsoft's cloud computing platform ...,"[-0.004470727406442165, -0.024176467210054398,..."


In [207]:
type(skills_embeddings_df['Embedding'].iloc[0][0])

str

In [208]:
skills_embeddings_df['Embedding'] = skills_embeddings_df['Embedding'].apply( lambda x: [ float(i) for i in x.strip("[]").split(", ")] )

In [209]:
type(skills_embeddings_df['Embedding'].iloc[0][0])

float

In [210]:
skills_embeddings_df.head()

Unnamed: 0,Skill,Description,Embedding
0,API Design,API Design is the process of creating applicat...,"[0.007902550511062145, -0.006266295909881592, ..."
1,AWS,Amazon Web Services (AWS) is a comprehensive a...,"[-0.0029347320087254047, -0.015877487137913704..."
2,Agile,Agile is a dynamic and flexible project manage...,"[-0.023693757131695747, -0.012672649696469307,..."
3,Angular,"Angular is a powerful, open-source web applica...","[0.013188531622290611, 0.029791485518217087, -..."
4,Azure,Azure is Microsoft's cloud computing platform ...,"[-0.004470727406442165, -0.024176467210054398,..."


Length of an embedding

In [211]:
len(skills_embeddings_df['Embedding'].iloc[0])

1536

### Add Embeddings to the database

Add embeddings with the description to Skill nodes in database

In [212]:
for chunk in split_dataframe(skills_embeddings_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MATCH (s:Skill{name: row.Skill})
        SET s.embedding = row.Embedding
        SET s.description = row.Description
        WITH s
        CALL db.create.setNodeVectorProperty(s, "embedding", s.embedding)
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

Let's have a look in the browser! 

### Vectors for Semantic Meaning

In [213]:
driver.execute_query(
    """
    CREATE VECTOR INDEX `skill-embeddings` IF NOT EXISTS
    FOR (s:Skill) ON (s.embedding)
    OPTIONS {
        indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'
        } 
    }
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE
)      

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x14083f340>, keys=[])

In [None]:
### MATCH p=()-[:SIMILAR_SEMANTIC] ->() RETURN p

In [214]:
indexes_result_df  = driver.execute_query(
    'SHOW INDEXES',
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
indexes_result_df

Unnamed: 0,id,name,state,populationPercent,type,entityType,labelsOrTypes,properties,indexProvider,owningConstraint,lastRead,readCount
0,4,constraint_ab43e77d,ONLINE,100.0,RANGE,NODE,[Skill],[name],range-1.0,constraint_ab43e77d,2025-08-01T17:10:37.019000000+00:00,2334
1,6,constraint_d3bfd313,ONLINE,100.0,RANGE,NODE,[Person],[email],range-1.0,constraint_d3bfd313,2025-08-01T17:10:32.845000000+00:00,500
2,0,index_343aff4e,ONLINE,100.0,LOOKUP,NODE,,,token-lookup-1.0,,2025-08-01T17:10:35.217000000+00:00,5492
3,1,index_f7700477,ONLINE,100.0,LOOKUP,RELATIONSHIP,,,token-lookup-1.0,,2025-08-01T15:11:26.211000000+00:00,2
4,2,skill-embeddings,ONLINE,100.0,VECTOR,NODE,[Skill],[embedding],vector-2.0,,2025-08-01T17:10:37.020000000+00:00,56


### Semantic Search

Take some Skill and find relevant other Skills: "Python", "Java", "Git", "CI/CD", "AWS", "Data Visualization", "Power BI", "R"". 

In [215]:
skill_search = "Python"

In [216]:
similar_skills_df  = driver.execute_query(
    """
    MATCH (s:Skill{name: $skill_search})
    CALL db.index.vector.queryNodes("skill-embeddings", 10, s.embedding) YIELD node, score
    WITH node as skill, score ORDER BY score DESC
    WHERE node.name <> s.name AND score > 0.9
    RETURN skill.name, score
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    skill_search = skill_search

)
similar_skills_df

Unnamed: 0,skill.name,score
0,Ruby,0.929993
1,Java,0.92485
2,PHP,0.924561
3,Pandas,0.923492
4,C++,0.920074
5,Django,0.918076
6,JavaScript,0.917648
7,PyTorch,0.911591
8,Scala,0.908844


We can also find similarity from other terms than the skills in the database now. 

Some suggestions to search for: 
- data visualizations and dashboards
- deployments
- API coding
- Machine Learning frameworks
- Cloud expertise

In [217]:
skill_search = "API coding"

In [223]:
''' driver.execute_query(
    CALL db.index.vector.queryNodes("skill-embeddings", 10, $query_vector) YIELD node, score
    WHERE score > 0.89
    RETURN node.name AS skill, score
    ,
    database_ = DATABASE,
    routing_ = RoutingControl.READ,
    result_transformer_ = lambda r: r.to_df(),
    query_vector = embeddings.embed_query(skill_search)
) ''' 

' driver.execute_query(\n    CALL db.index.vector.queryNodes("skill-embeddings", 10, $query_vector) YIELD node, score\n    WHERE score > 0.89\n    RETURN node.name AS skill, score\n    ,\n    database_ = DATABASE,\n    routing_ = RoutingControl.READ,\n    result_transformer_ = lambda r: r.to_df(),\n    query_vector = embeddings.embed_query(skill_search)\n) '

In [233]:
# --- Add this as a new cell after your Neo4j driver setup ---

# 1. First, drop the existing index if it exists
print("Dropping existing index...")
driver.execute_query(
    "DROP INDEX `skill-embeddings` IF EXISTS",
    database_=DATABASE,
    routing_=RoutingControl.WRITE
)

# 2. Create new vector index with 768 dimensions for Gemini
print("Creating new vector index...")
driver.execute_query("""
    CREATE VECTOR INDEX `skill-embeddings`
    FOR (s:Skill) ON (s.embedding)
    OPTIONS {
      indexConfig: {
        `vector.dimensions`: 768,
        `vector.similarity_function`: 'cosine'
      }
    }
""", database_=DATABASE, routing_=RoutingControl.WRITE)

print("Vector index is ready for 768-dimensional Gemini embeddings")

Dropping existing index...
Creating new vector index...
Vector index is ready for 768-dimensional Gemini embeddings


In [234]:
# --- Google Gemini Embedding Function ---
import google.generativeai as genai

# Configure your API key
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'models/embedding-001')

def embed_query_text(text_to_embed):
    """Generates an embedding for a text string using Gemini."""
    try:
        response = genai.embed_content(
            model=EMBEDDING_MODEL,
            content=text_to_embed,
            task_type="retrieval_query"
        )
        return response['embedding']
    except Exception as e:
        print(f"❌ Error generating embedding: {str(e)}")
        return None

In [235]:
### not working

skill_search = "API coding"

# Step 1: Generate the vector for your search query using Gemini
query_vector = embed_query_text(skill_search)

# Step 2: Pass that vector to the Cypher query
driver.execute_query(
    '''
    CALL db.index.vector.queryNodes("skill-embeddings", 10, $query_vector) YIELD node, score
    WHERE score > 0.89
    RETURN node.name AS skill, score
    ''',
    database_ = DATABASE,
    routing_ = RoutingControl.READ,
    result_transformer_ = lambda r: r.to_df(),
    query_vector = query_vector
)

Unnamed: 0,skill,score


Create relationship for similar sematic skills

In [236]:
driver.execute_query(
    """
    CALL apoc.periodic.iterate(
        "MATCH (skill1:Skill) RETURN skill1",
        "WITH skill1 
        CALL db.index.vector.queryNodes('skill-embeddings', 10, skill1.embedding) YIELD node, score
        WITH skill1, node as skill2, score ORDER BY score DESC
        WHERE skill1.name < skill2.name AND score > 0.92
        MERGE (skill1)-[s:SIMILAR_SEMANTIC]->(skill2)
        SET s.score = score   
        ",
        {batchSize: 1000}
    )
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE,
    result_transformer_= lambda r: r.to_df()
)

Unnamed: 0,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
0,1,54,0,0,54,1,0,"{'Index query vector has 1536 dimensions, but ...","{'total': 1, 'errors': {'org.neo4j.graphdb.Que...","{'total': 54, 'errors': {'Index query vector h...",False,{},"{'relationshipsDeleted': 0, 'relationshipsCrea..."


Let's look in the browser how these relationships look like. 

```MATCH p=()-[:SIMILAR_SEMANTIC]->() RETURN p```

What are similar skills in the database now? 

In [237]:
similar_skills_df  = driver.execute_query(
    """
    MATCH (s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)
    WHERE s1.name < s2.name
    RETURN s1.name AS skill1, r.score AS score, s2.name AS skill2
    ORDER BY score DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [238]:
similar_skills_df

Unnamed: 0,skill1,score,skill2
0,Data Visualization,0.951431,Tableau
1,Data Analysis,0.947662,Data Visualization
2,AWS,0.946609,Azure
3,Agile,0.942825,Scrum
4,CI/CD,0.941895,DevOps
5,ReactJS,0.939651,Vue.js
6,Power BI,0.937012,Tableau
7,Express.js,0.936569,Node.js
8,CI/CD,0.935562,Jenkins
9,Docker,0.935333,Kubernetes


## Now we can find more people with based on Semantic Similarity

Check the following in the browser:
```
MATCH (p1:Person {name: "John Garcia"})-[:KNOWS]->(s:Skill)
WITH p1, COLLECT(s.name) as skills_1
CALL (p1, p1){
  MATCH p=(p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)
  RETURN p
  UNION 
  MATCH (p1)-[r:SIMILAR_SKILLSET]->(p2:Person), p=(p2)-[:KNOWS]->(:Skill)
  RETURN p
}
RETURN p
```

The following persons give some interesting results: "Amelia Davis", "Victoria Thomas", "John Walker"

In [239]:
person_name_1 = "John Garcia"

In [240]:
similar_persons_df  = driver.execute_query(
    """
    MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)
    WITH p1, COLLECT(s.name) as skills_1
    CALL (p1){
      MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)
      RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score
      UNION 
      MATCH (p1)-[r:SIMILAR_SKILLSET]->(p2:Person)
      RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score
    }
    WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score
    WHERE score >= 1
    MATCH (person_2)-[:KNOWS]->(s:Skill)
    RETURN person_1, skills_1,  person_2.name as person_2, COLLECT(s.name) as skills_2, score
    ORDER BY score DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [241]:
similar_persons_df

Unnamed: 0,person_1,skills_1,person_2,skills_2,score
0,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Matthew Miller,"[TensorFlow, Ruby, AWS, ReactJS]",2.93338
1,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Matthew Mitchell,"[R, HTML5, Blockchain, Cloud Architecture, Ruby]",2.93161
2,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",John Johnson,"[WordPress, TensorFlow, AWS, Project Managemen...",2.863373
3,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",John Taylor,"[Pandas, Scrum, CSS3, Ruby, AWS]",2.0
4,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Mia Nelson,"[Security, WordPress, Big Data, Swift, AWS]",2.0
5,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Matthew Moore,"[Security, TensorFlow, Spring Boot, Swift]",1.93338
6,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Emily Thompson,"[Scrum, TensorFlow, Cloud Architecture, ReactJS]",1.86499
7,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Ryan Young,"[WordPress, Blockchain, Cloud Architecture, Py...",1.861603
8,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Steven Jones,"[HTML5, Big Data, Docker, Blockchain]",1.0
9,John Garcia,"[Security, PyTorch, HTML5, Ruby, AWS]",Thomas Miller,"[HTML5, Git, Big Data]",1.0


Check in the browser the following: 

```
MATCH p=(p1:Person {name: "John Garcia"})-[:KNOWS]->(s:Skill)-[:SIMILAR_SEMANTIC]->(:Skill)<-[:KNOWS]-(p2:Person{name:"Matthew Miller"})
RETURN p 
UNION 
MATCH p=(p1:Person {name: "John Garcia"})-[:KNOWS]->(s:Skill)<-[:KNOWS]-(p2:Person{name:"Matthew Miller"})
RETURN p
```

```
MATCH p=(p1:Person {name: "John Garcia"})-[:KNOWS]->(s:Skill)-[:SIMILAR_SEMANTIC*0..2]->(:Skill)<-[:KNOWS]-(p2:Person{name:"Matthew Miller"})
RETURN p 
UNION 
MATCH p=(p1:Person {name: "John Garcia"})-[:KNOWS]->(s:Skill)<-[:KNOWS]-(p2:Person{name:"Matthew Miller"})
RETURN p
```

Calculate for all of them with score > 3

In [246]:
similar_persons_df = driver.execute_query(
    """
    MATCH (p1:Person)-[:KNOWS]->(s:Skill)
    WITH p1, COLLECT(s.name) AS skills_1
    CALL {
      WITH p1
      MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)
      RETURN p1 AS person_1, p2 AS person_2, SUM(r.score) AS score
      
      UNION ALL
      
      WITH p1
      MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)
      RETURN p1 AS person_1, p2 AS person_2, SUM(r.overlap) AS score
    }
    WITH person_1, skills_1, person_2, SUM(score) AS score
    WHERE score > 3
    MATCH (person_2)-[:KNOWS]->(s:Skill)
    RETURN person_1.name AS person_1, skills_1, person_2.name AS person_2, 
           COLLECT(s.name) AS skills_2, score
    ORDER BY score DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_=lambda r: r.to_df()
)



In [247]:
similar_persons_df

Unnamed: 0,person_1,skills_1,person_2,skills_2,score
0,Natalie Thompson,"[System Design, Angular, Spark, TypeScript, Je...",Ryan Jones,"[Angular, Spark, Jenkins, PHP, Project Managem...",3.930054
1,Ryan Jones,"[Angular, Spark, Jenkins, PHP, Project Managem...",Natalie Thompson,"[System Design, Angular, Spark, TypeScript, Je...",3.930054
2,Joseph Martin,"[Linux, Agile, Java, Power BI, ReactJS]",Kevin Young,"[Linux, Agile, TensorFlow, ReactJS, C++]",3.925827
3,Kevin Young,"[Linux, Agile, TensorFlow, ReactJS, C++]",Joseph Martin,"[Linux, Agile, Java, Power BI, ReactJS]",3.925827
4,James Anderson,"[Security, R, JavaScript, Node.js]",Andrew Martin,"[R, Java, Cloud Architecture, Testing, Node.js]",3.866257
5,Oliver Bennett,"[R, JavaScript, Testing, Jenkins]",Andrew Martin,"[R, Java, Cloud Architecture, Testing, Node.js]",3.866257
6,Andrew Martin,"[R, Java, Cloud Architecture, Testing, Node.js]",James Anderson,"[Security, R, JavaScript, Node.js]",3.866257
7,Andrew Martin,"[R, Java, Cloud Architecture, Testing, Node.js]",Oliver Bennett,"[R, JavaScript, Testing, Jenkins]",3.866257
8,Natalie Thompson,"[System Design, Angular, Spark, TypeScript, Je...",Lucy Roberts,"[System Design, Angular, CI/CD, Swift, Data Vi...",3.865616
9,Lucy Roberts,"[System Design, Angular, CI/CD, Swift, Data Vi...",Natalie Thompson,"[System Design, Angular, Spark, TypeScript, Je...",3.865616
