In [1]:
!pip install neo4j faiss-cpu transformers torch numpy

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)


In [2]:
from neo4j import GraphDatabase
import numpy as np
import random
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

In [4]:
#neo4j configs
NEO4J_URI=""
NEO4J_USERNAME=""
NEO4J_PASSWORD=""

In [5]:
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [6]:
#get nodes and relationships
def fetch_graph_data():
    with driver.session() as session:
        result = session.run("MATCH (p:person) RETURN p.oid AS oid, p.name AS name")
        nodes = [{"oid": record["oid"], "name": record["name"]} for record in result]

    # Fetch relationships (e.g., 'USES_DEVICE' between person and device)
    with driver.session() as session:
        result = session.run("MATCH (p:person)-[r:USES_DEVICE]->(d:device) RETURN p.oid AS person_oid, d.oid AS device_oid, r.starttime AS starttime, r.endtime AS endtime")
        relationships = [{"person_oid": record["person_oid"], "device_oid": record["device_oid"], "starttime": record["starttime"], "endtime": record["endtime"]} for record in result]

    return nodes, relationships

nodes, relationships = fetch_graph_data()
print(f"Nodes: {len(nodes)}, Relationships: {len(relationships)}")


Nodes: 4, Relationships: 10000


In [7]:
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        return [record.data() for record in result]

# node labels
node_labels = run_query("CALL db.labels()")

# relationship types
rel_types = run_query("CALL db.relationshipTypes()")

# node property keys by label
node_props = run_query("""
MATCH (n)
WITH labels(n) AS label, keys(n) AS props
UNWIND label AS l
RETURN DISTINCT l AS node_label, collect(DISTINCT props) AS property_keys
LIMIT 100
""")

# relationship property keys
rel_props = run_query("""
MATCH ()-[r]->()
RETURN DISTINCT type(r) AS rel_type, collect(DISTINCT keys(r)) AS property_keys
LIMIT 100
""")

print("Node Labels:")
for item in node_labels:
    print(item)

print("\nRelationship Types:")
for item in rel_types:
    print(item)

print("\nNode Properties:")
for item in node_props:
    print(item)

print("\nRelationship Properties:")
for item in rel_props:
    print(item)


Node Labels:
{'label': 'person'}
{'label': 'device'}
{'label': 'phonenumber'}
{'label': 'celltower'}

Relationship Types:
{'relationshipType': 'USES_DEVICE'}
{'relationshipType': 'CALLS'}
{'relationshipType': 'CONNECTED_TO'}

Node Properties:
{'node_label': 'person', 'property_keys': [['oid', 'activities']]}
{'node_label': 'device', 'property_keys': [['oid']]}
{'node_label': 'phonenumber', 'property_keys': [['oid']]}
{'node_label': 'celltower', 'property_keys': [['oid']]}

Relationship Properties:
{'rel_type': 'USES_DEVICE', 'property_keys': [['endtime', 'starttime']]}
{'rel_type': 'CALLS', 'property_keys': [['endtime', 'starttime', 'duration']]}
{'rel_type': 'CONNECTED_TO', 'property_keys': [['endtime', 'starttime']]}


In [8]:
def classify_intent_with_llm(query):
    prompt = f"""
You are an intent classification model for a graph database with this schema:

Node labels:
- person(oid, activities)
- device(oid)
- phonenumber(oid)
- celltower(oid)

Relationship types:
- USES_DEVICE(starttime, endtime)
- CALLS(starttime, endtime, duration)
- CONNECTED_TO(starttime, endtime)

Identify the intent behind the query from the following categories:
- get_devices_by_person
- get_calls_by_person
- get_activities_by_person
- get_phonenumbers_by_person
- get_activities_by_celltower
- get_calls_between_people

Only return the intent name.

Query: "{query}"
Intent:
"""
    from transformers import pipeline
    classifier = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    result = classifier(prompt, max_new_tokens=10)[0]['generated_text']
    return result.strip().split("Intent:")[-1].strip()


In [9]:
query = "how many phone number did person 32 had"
intent = classify_intent_with_llm(query)
print(f"Identified intent: {intent}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Device set to use cuda:0


Identified intent: - get_devices_by_person(


In [10]:
#generating data for Spacy - custom NER training

persons = [f"person {i}" for i in range(1, 50)]
devices = [f"device {i}" for i in range(1, 20)]
phonenumbers = [f"phonenumber {i}" for i in range(100, 150)]
celltowers = [f"celltower {i}" for i in range(20, 40)]
dates = [f"2004-09-{str(i).zfill(2)}" for i in range(1, 30)]

templates = [
    ("Which devices did {person} use?", "PERSON"),
    ("What is the phone number used by {person}?", "PERSON"),
    ("Show me calls between {phone1} and {phone2}", ("PHONENUMBER", "PHONENUMBER")),
    ("List activities for {person}", "PERSON"),
    ("Which celltower was used at {date}?", "CELLTOWER"),
    ("Who used {device}?", "DEVICE"),
    ("Which phone numbers were connected to {celltower}?", "CELLTOWER"),
]

TRAIN_DATA = []

for _ in range(200):
    template, label = random.choice(templates)
    if label == "PERSON":
        p = random.choice(persons)
        sentence = template.format(person=p)
        start = sentence.find(p)
        end = start + len(p)
        TRAIN_DATA.append((sentence, {"entities": [(start, end, "PERSON")]}))

    elif label == "DEVICE":
        d = random.choice(devices)
        sentence = template.format(device=d)
        start = sentence.find(d)
        end = start + len(d)
        TRAIN_DATA.append((sentence, {"entities": [(start, end, "DEVICE")]}))

    elif label == "CELLTOWER":
        c = random.choice(celltowers)
        sentence = template.format(celltower=c, date=random.choice(dates))
        start = sentence.find(c)
        end = start + len(c)
        TRAIN_DATA.append((sentence, {"entities": [(start, end, "CELLTOWER")]}))

    elif isinstance(label, tuple) and label[0] == "PHONENUMBER":
        p1 = random.choice(phonenumbers)
        p2 = random.choice([x for x in phonenumbers if x != p1])
        sentence = template.format(phone1=p1, phone2=p2)
        s1, e1 = sentence.find(p1), sentence.find(p1) + len(p1)
        s2, e2 = sentence.find(p2), sentence.find(p2) + len(p2)
        TRAIN_DATA.append((sentence, {"entities": [(s1, e1, "PHONENUMBER"), (s2, e2, "PHONENUMBER")]}))

In [11]:
TRAIN_DATA[:3]

[('List activities for person 2', {'entities': [(20, 28, 'PERSON')]}),
 ('What is the phone number used by person 20?',
  {'entities': [(33, 42, 'PERSON')]}),
 ('Show me calls between phonenumber 105 and phonenumber 121',
  {'entities': [(22, 37, 'PHONENUMBER'), (42, 57, 'PHONENUMBER')]})]

In [16]:
#Training custom NER model
nlp = spacy.blank("en")

if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")


for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.3, losses=losses)
        print(f"Iteration {itn+1} - Losses: {losses}")

output_dir = "custom_ner_model"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Iteration 1 - Losses: {'ner': np.float32(270.77023)}
Iteration 2 - Losses: {'ner': np.float32(16.261087)}
Iteration 3 - Losses: {'ner': np.float32(0.25170982)}
Iteration 4 - Losses: {'ner': np.float32(0.17620875)}
Iteration 5 - Losses: {'ner': np.float32(0.0016202566)}
Iteration 6 - Losses: {'ner': np.float32(5.369323e-05)}
Iteration 7 - Losses: {'ner': np.float32(2.6668426e-06)}
Iteration 8 - Losses: {'ner': np.float32(9.05726)}
Iteration 9 - Losses: {'ner': np.float32(0.00040915608)}
Iteration 10 - Losses: {'ner': np.float32(0.02383003)}
Model saved to custom_ner_model


In [17]:
#Checking entity extraction from custom ner model
nlp = spacy.load("custom_ner_model")
text = "Which devices did person 123 use?"
doc = nlp(text)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


Entities: [('person 123', 'PERSON')]


In [18]:
text = "is phonenumber 2233 is from device nokia connected to celltower 26"
doc = nlp(text)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])

Entities: [('phonenumber 2233', 'PHONENUMBER'), ('device nokia', 'DEVICE'), ('celltower 26', 'CELLTOWER')]


In [19]:
#Embed graph data from neo4j using sentence transformer
#Triplets - (subject, predicate, object)
#neo4j - (node1)-[relationship]->(node2)
model = SentenceTransformer("all-MiniLM-L6-v2")

triplets = []
metadata = []

def build_sentence(row, rel_type):
    if rel_type == "USES_DEVICE":
        return f"person {row['person']} used device {row['device']} from {row['starttime']} to {row['endtime']}"
    elif rel_type == "CALLS":
        return f"phonenumber {row['src']} called phonenumber {row['dst']} from {row['starttime']} to {row['endtime']} lasting {row['duration']} seconds"
    elif rel_type == "CONNECTED_TO":
        return f"phonenumber {row['phonenumber']} connected to celltower {row['celltower']} from {row['starttime']} to {row['endtime']}"
    else:
        return ""

with driver.session() as session:

    query1 = """
        MATCH (p:person)-[r:USES_DEVICE]->(d:device)
        RETURN p.oid AS person, d.oid AS device, r.starttime AS starttime, r.endtime AS endtime
    """
    for row in session.run(query1):
        sent = build_sentence(row, "USES_DEVICE")
        triplets.append(sent)
        metadata.append(dict(row))

    query2 = """
        MATCH (a:phonenumber)-[r:CALLS]->(b:phonenumber)
        RETURN a.oid AS src, b.oid AS dst, r.starttime AS starttime, r.endtime AS endtime, r.duration AS duration
    """
    for row in session.run(query2):
        sent = build_sentence(row, "CALLS")
        triplets.append(sent)
        metadata.append(dict(row))

    query3 = """
        MATCH (p:phonenumber)-[r:CONNECTED_TO]->(c:celltower)
        RETURN p.oid AS phonenumber, c.oid AS celltower, r.starttime AS starttime, r.endtime AS endtime
    """
    for row in session.run(query3):
        sent = build_sentence(row, "CONNECTED_TO")
        triplets.append(sent)
        metadata.append(dict(row))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
embeddings = model.encode(triplets, convert_to_numpy=True)

#faiss index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

faiss.write_index(index, "graph_triplets.index")
np.save("triplet_texts.npy", triplets)

#Saving embeddings
np.save("graph_embeddings.npy", embeddings)
with open("graph_metadata.json", "w") as f:
    import json
    json.dump(metadata, f)

print("Embedding complete. Total triplets:", len(triplets))

Embedding complete. Total triplets: 10000


In [21]:
#Load TinyLlama
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

#Load SentenceTransformer for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#Load FAISS index and triplets
faiss_index = faiss.read_index("graph_triplets.index")
with open("triplet_texts.npy", "rb") as f:
    triplet_texts = np.load(f, allow_pickle=True).tolist()

In [22]:
def get_query_embedding(query):
    return embedding_model.encode([query]).astype("float32")

In [23]:
def search_faiss_index(query_embedding, top_k=10, entity_filter=None):
    distances, indices = faiss_index.search(query_embedding, top_k)
    triplets = [triplet_texts[i] for i in indices[0]]

    if entity_filter:
        extended_filters = set()
        for label, value in entity_filter.items() if isinstance(entity_filter, dict) else []:
            extended_filters.add(str(value).lower())
            extended_filters.add(f"{label.lower()} {value}".lower())
        if not extended_filters and isinstance(entity_filter, list):
            extended_filters = set(str(v).lower() for v in entity_filter)

        def relevance_score(triplet):
            score = 0
            triplet_lower = triplet.lower()
            for f in extended_filters:
                if f in triplet_lower:
                    score += 2 if ' ' in f else 1
            return score

        scored = sorted(triplets, key=relevance_score, reverse=True)
        return scored[:top_k]

    return triplets

In [24]:
def extract_entities(text):
    doc = nlp(text)
    entity_dict = {}
    for ent in doc.ents:
        if ent.label_ not in entity_dict:
            entity_dict[ent.label_] = ent.text.strip()
        else:
            if isinstance(entity_dict[ent.label_], list):
                entity_dict[ent.label_].append(ent.text.strip())
            else:
                entity_dict[ent.label_] = [entity_dict[ent.label_], ent.text.strip()]
    return entity_dict

In [37]:
#generate cypher query - using few shot examples
def generate_cypher_query(query, intent, entities, triplets):
    # Few-shot examples
    examples = """
Example 1:
Query: Which devices did person 123 use?
Intent: get_devices_by_person
Entities: PERSON=123
Graph context:
- person 123 used device 10 from 2004-09-01 to 2004-09-10
- person 123 used device 12 from 2004-09-11 to 2004-09-12
Cypher:
MATCH (p:person {oid: '123'})-[:USES_DEVICE]->(d:device)
RETURN d

Example 2:
Query: What phone numbers did person 456 call?
Intent: get_calls_by_person
Entities: PERSON=456
Graph context:
- person 456 called phonenumber 789 from 2004-09-01 to 2004-09-10
Cypher:
MATCH (p:person {oid: '456'})-[:CALLS]->(ph:phonenumber)
RETURN ph

Example 3:
Query: What activities are logged for person 789?
Intent: get_activities_by_person
Entities: PERSON=789
Graph context:
- person 789 used device 3 from 2004-09-05 to 2004-09-08
Cypher:
MATCH (p:person {oid: '789'})
RETURN p.activities

Example 4:
Query: How many devices did person 94 use?
Intent: count_devices_by_person
Entities: PERSON=94
Graph context:
- person 94 used device 5 from 2004-09-01 to 2004-09-10
Cypher:
MATCH (p:person {oid: 94})-[:USES_DEVICE]->(d:device)
RETURN COUNT(d) AS device_count
"""

    facts = "\n".join(f"- {t}" for t in triplets[:5])
    normalized_entities = {k: v.split()[-1] for k, v in entities.items()}
    entity_context = ", ".join([f"{k}={v}" for k, v in normalized_entities.items()])

    prompt = f"""
{examples}

### NEW QUERY ###
Query: {query}
Intent: {intent}
Entities: {entity_context}
Graph context:
{facts}
Cypher:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
        #do_sample=False,
        temperature=0.5,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "Cypher:" in decoded:
        decoded = decoded.split("Cypher:")[-1].strip()

    cypher_lines = decoded.splitlines()
    for i, line in enumerate(cypher_lines):
        if "MATCH" in line:
            return "\n".join(cypher_lines[i:]).strip()

    return decoded.strip()


In [28]:
query = "what devices were used by person 94"

In [29]:
intent = classify_intent_with_llm(query)
intent

Device set to use cuda:0


'- get_devices_by_person('

In [30]:
entities = extract_entities(query)
entities

{'PERSON': 'person 94'}

In [31]:
query_embedding = get_query_embedding(query)
#query_embedding

In [32]:
triplet_context = search_faiss_index(query_embedding, top_k=10,entity_filter=entities)
triplet_context

['person 94 used device 93 from 2004-11-01T12:42:42 to 2004-11-01T12:42:42',
 'person 94 used device 93 from 2004-07-23T16:12:42 to 2004-07-23T16:19:41',
 'person 94 used device 238 from 2004-10-11T18:32:29 to 2004-10-11T18:32:29',
 'person 94 used device 90 from 2004-11-16T22:26:18 to 2004-11-16T22:26:18',
 'person 94 used device 90 from 2004-09-29T19:38:16 to 2004-09-29T19:48:43',
 'person 94 used device 90 from 2004-11-18T19:09:22 to 2004-11-18T19:19:48',
 'person 94 used device 66 from 2004-11-18T17:37:22 to 2004-11-18T17:37:22',
 'person 94 used device 162 from 2004-12-20T09:19:20 to 2004-12-20T09:19:20',
 'person 94 used device 93 from 2004-07-23T16:40:11 to 2004-07-23T17:44:48',
 'person 94 used device 93 from 2004-07-23T14:24:11 to 2004-07-23T15:55:44']

In [38]:
cypher_query = generate_cypher_query(query, intent, entities, triplet_context)

In [39]:
print(f"\nGenerated Cypher:\n{cypher_query}")


Generated Cypher:
MATCH (p:person {oid: '94'})-[:USES_DEVICE]->(d:device)
RETURN d
```

In this example, the `get_devices_by_person` query is used to retrieve the devices used by person 94. The `get_devices_by_person` query returns a list of devices, which can be used to filter the results of the `get_calls_by
