In [1]:
# Step 2.3: Install required packages
%pip install neo4j
%pip install rdflib 
%pip install pandas
%pip install sentence-transformers 
%pip install faiss-cpu 
%pip install python-dotenv
%pip install -U langchain langchain-openai langchainhub langchain-community langchain-experimental 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[

In [2]:
# Step 3.2: Set up imports
import os
import rdflib
from rdflib.namespace import RDF, RDFS, OWL
import pandas as pd
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import textwrap

# Load env vars from the file used in previous chapters
_ = load_dotenv(dotenv_path='env.txt')
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
# Neo4j setup
NEO4J_URI = os.getenv('NEO4J_URI', 'neo4j://127.0.0.1:7687')
NEO4J_USER = os.getenv('NEO4J_USER', 'neo4j')
NEO4J_PASS =os.getenv('NEO4J_PASS', 'password')

# LLM setup
CHAT_MODEL = "gpt-4o-mini"
llm = ChatOpenAI(model=CHAT_MODEL, temperature=0.2)
# Turn off hosted LangSmith tracing (optional: silences that warning)
os.environ["LANGCHAIN_TRACING_V2"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 3.3: Load the ontology
g = rdflib.Graph()
g.parse('FinancialOntology.ttl', format='turtle')

# Helper to get first value of a given property
def get_first(subject, prop):
    for val in g.objects(subject, prop):
        return str(val)
    return None

# --- Collect nodes ---
nodes = []
for s in g.subjects(RDF.type, OWL.Class):
    nodes.append({
        'id': str(s),
        'label': get_first(s, RDFS.label) or s.split('#')[-1],
        'comment': get_first(s, RDFS.comment),
        'type': 'Class'
    })
for s in g.subjects(RDF.type, OWL.NamedIndividual):
    nodes.append({
        'id': str(s),
        'label': get_first(s, RDFS.label) or s.split('#')[-1],
        'comment': get_first(s, RDFS.comment),
        'type': 'Individual'
    })
nodes_df = pd.DataFrame(nodes)
nodes_df.to_csv('ontology_nodes.csv', index=False)

# --- Collect edges ---
edges = []
for s, p, o in g.triples((None, None, None)):
    if p in [RDF.type, RDFS.label, RDFS.comment]:
        continue
    if str(p).startswith('http://www.w3.org/2002/07/owl#'):
        continue
    if isinstance(o, rdflib.term.Identifier) and str(o).startswith('http'):
        edges.append({
            'source': str(s),
            'target': str(o),
            'type': p.split('#')[-1] if '#' in str(p) else str(p).split('/')[-1]
        })
edges_df = pd.DataFrame(edges)
edges_df.to_csv('ontology_edges.csv', index=False)

print("Created ontology_nodes.csv and ontology_edges.csv")

data_rows = []
for s, p, o in g.triples((None, None, None)):
    # Keep only literal values (data properties)
    if isinstance(o, rdflib.term.Literal):
        prop_name = p.split('#')[-1] if '#' in str(p) else str(p).rstrip('/').split('/')[-1]
        # capture datatype if present
        dtype = str(o.datatype) if o.datatype else None
        data_rows.append({
            'subject': str(s),
            'property': prop_name,
            'value': str(o),
            'datatype': dtype
        })

pd.DataFrame(data_rows).to_csv('ontology_data.csv', index=False)
print("Created ontology_data.csv")


Created ontology_nodes.csv and ontology_edges.csv
Created ontology_data.csv


In [4]:
# Step 4.1: Use credentials loaded in previous cells
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))

def run_tx(query, params=None):
    with driver.session() as session:
        return session.run(query, params or {}).consume()

print(f"Connected to Neo4j at {NEO4J_URI} as user {NEO4J_USER}")

Connected to Neo4j at neo4j://127.0.0.1:7687 as user neo4j


In [5]:
# Step 4.2: Create schema constraint
run_tx("""
CREATE CONSTRAINT resource_id_unique IF NOT EXISTS
FOR (n:Resource) REQUIRE n.id IS UNIQUE
""")
print("Constraint ensured: (:Resource {id}) is UNIQUE.")


Constraint ensured: (:Resource {id}) is UNIQUE.


In [6]:
# Step 4.3: Load nodes from ontology_nodes.csv
nodes_df = pd.read_csv("ontology_nodes.csv")
print(nodes_df.head())

# MERGE all nodes as :Resource; store label/comment/type for later use
node_query = """
MERGE (n:Resource {id: $id})
SET n.rdfs_label = $rdfs_label,
    n.comment    = $comment,
    n.kind       = $kind   // 'Class' or 'Individual'
"""

with driver.session() as session:
    for rec in nodes_df.to_dict(orient="records"):
        params = {
            "id": rec["id"],
            "rdfs_label": rec.get("label"),
            "comment": rec.get("comment"),
            "kind": rec.get("type")
        }
        session.run(node_query, params)
print(f"Imported {len(nodes_df)} nodes as :Resource.")

                                                  id  \
0  http://www.semanticweb.org/keithbourne/ontolog...   
1  http://www.semanticweb.org/keithbourne/ontolog...   
2  http://www.semanticweb.org/keithbourne/ontolog...   
3  http://www.semanticweb.org/keithbourne/ontolog...   
4  http://www.semanticweb.org/keithbourne/ontolog...   

                                               label  comment   type  
0  http://www.semanticweb.org/keithbourne/ontolog...      NaN  Class  
1                                               Bond      NaN  Class  
2  http://www.semanticweb.org/keithbourne/ontolog...      NaN  Class  
3  http://www.semanticweb.org/keithbourne/ontolog...      NaN  Class  
4  http://www.semanticweb.org/keithbourne/ontolog...      NaN  Class  
Imported 17 nodes as :Resource.


In [7]:
# Step 4.4: Load edges from ontology_edges.csv 

edges_df = pd.read_csv("ontology_edges.csv")
print(edges_df.head())

rel_map = {
    "issuedBy": "ISSUED_BY",
    "isRegulatedBy": "IS_REGULATED_BY",
    "ownedBy": "OWNED_BY",
}

query_issued = """
MATCH (a:Resource {id: $src}), (b:Resource {id: $tgt})
MERGE (a)-[:ISSUED_BY]->(b)
"""

query_regulated = """
MATCH (a:Resource {id: $src}), (b:Resource {id: $tgt})
MERGE (a)-[:IS_REGULATED_BY]->(b)
"""

query_owned = """
MATCH (a:Resource {id: $src}), (b:Resource {id: $tgt})
MERGE (a)-[:OWNED_BY]->(b)
"""

with driver.session() as session:
    count = 0
    for rec in edges_df.to_dict(orient="records"):
        t = str(rec.get("type", "")).strip()
        src = rec.get("source")
        tgt = rec.get("target")
        if t not in rel_map:
            continue  # skip edges we aren't modeling here
        if t == "issuedBy":
            session.run(query_issued, {"src": src, "tgt": tgt})
        elif t == "isRegulatedBy":
            session.run(query_regulated, {"src": src, "tgt": tgt})
        elif t == "ownedBy":
            session.run(query_owned, {"src": src, "tgt": tgt})
        count += 1
print(f"Imported {count} relationships (ISSUED_BY, IS_REGULATED_BY, OWNED_BY).")


                                              source  \
0  http://www.semanticweb.org/keithbourne/ontolog...   
1  http://www.semanticweb.org/keithbourne/ontolog...   
2  http://www.semanticweb.org/keithbourne/ontolog...   
3  http://www.semanticweb.org/keithbourne/ontolog...   
4  http://www.semanticweb.org/keithbourne/ontolog...   

                                              target        type  
0  http://www.semanticweb.org/keithbourne/ontolog...    issuedBy  
1  http://www.semanticweb.org/keithbourne/ontolog...    issuedBy  
2  http://www.semanticweb.org/keithbourne/ontolog...  subClassOf  
3  http://www.semanticweb.org/keithbourne/ontolog...      domain  
4  http://www.semanticweb.org/keithbourne/ontolog...       range  
Imported 5 relationships (ISSUED_BY, IS_REGULATED_BY, OWNED_BY).


In [8]:
# step 4.5: Load data properties from ontology_data.csv

try:
    data_df = pd.read_csv("ontology_data.csv")
except FileNotFoundError:
    data_df = pd.DataFrame(columns=["subject","property","value","datatype"])

print(data_df.head())

# Only apply properties we care about in this lab (hasTicker)
query_has_ticker = """
MATCH (n:Resource {id: $id})
SET n.hasTicker = $val
"""

with driver.session() as session:
    tick_count = 0
    for rec in data_df.to_dict(orient="records"):
        prop = str(rec.get("property", "")).strip()
        if prop != "hasTicker":
            continue
        session.run(query_has_ticker, {"id": rec.get("subject"), "val": rec.get("value")})
        tick_count += 1

print(f"Set hasTicker on {tick_count} nodes.")

                                             subject     property  \
0  http://www.semanticweb.org/keithbourne/ontolog...   definition   
1  http://www.semanticweb.org/keithbourne/ontolog...    scopeNote   
2  http://www.semanticweb.org/keithbourne/ontolog...     altLabel   
3  http://www.semanticweb.org/keithbourne/ontolog...   definition   
4  http://www.semanticweb.org/keithbourne/ontolog...  hiddenLabel   

                                               value  datatype  
0  A security representing equity ownership in a ...       NaN  
1  Includes fixed-income securities with defined ...       NaN  
2                                      Bond Security       NaN  
3  A debt security issued by governments or corpo...       NaN  
4                                       equity share       NaN  
Set hasTicker on 3 nodes.


In [9]:
# step 4.6: Find anything that looks like a stock-ish resource with a ticker
with driver.session() as session:
    result = session.run("""
        MATCH (n:Resource)
        WHERE n.hasTicker IS NOT NULL
        RETURN n.rdfs_label AS label, n.hasTicker AS ticker, n.id AS id
        ORDER BY label
    """)
    rows = result.data()
rows

[{'label': 'AAPL',
  'ticker': 'AAPL',
  'id': 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/AAPL'},
 {'label': 'MSFT',
  'ticker': 'MSFT',
  'id': 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/MSFT'},
 {'label': 'USTB',
  'ticker': 'USTB',
  'id': 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/USTB'}]

In [10]:
# Step 5.1: Build rdf:type (class membership) edges from the Turtle file and push to Neo4j
type_pairs = []  # (individual_iri, class_iri)

# We only want membership of individuals to classes (not "is a Class" or "is a NamedIndividual")
for s, _, o in g.triples((None, RDF.type, None)):
    # skip ontology meta (i.e., don't create type edges for the classes themselves)
    if o in (OWL.Class, OWL.NamedIndividual):
        continue
    # many ontologies also mark classes with RDF.type RDFS.Class
    if o == RDFS.Class:
        continue
    # keep only cases where subject looks like an IRI and object looks like a class IRI present in our graph
    if isinstance(s, rdflib.term.Identifier) and isinstance(o, rdflib.term.Identifier):
        type_pairs.append((str(s), str(o)))

len(type_pairs)

12

In [11]:
# Step 5.1 (second code section): Push [:IS_A] edges into Neo4j between the resource (individual) and the class node
with driver.session() as session:
    q = """
    MATCH (a:Resource {id: $sid}), (cls:Resource {id: $cid})
    MERGE (a)-[:IS_A]->(cls)
    """
    count = 0
    for sid, cid in type_pairs:
        session.run(q, {"sid": sid, "cid": cid})
        count += 1
count

12

In [12]:
# Step 5.2: Create “All X” concept nodes and wire members

anchors = [
    {"name": "All Stocks",  "class_label": "Stock"},
    {"name": "All Bonds",   "class_label": "Bond"},
    {"name": "All Orgs",    "class_label": "Organization"},
    {"name": "All Regulators", "class_label": "RegulatoryAuthority"},
    # add more as your ontology grows
]

with driver.session() as session:
    for a in anchors:
        # ensure the anchor concept exists
        session.run("""
            MERGE (c:Concept {name: $name})
            ON CREATE SET c.description = $desc
        """, {
            "name": a["name"],
            "desc": f"Anchor node that includes all {a['class_label']} members."
        })
        # clear old INCLUDES from this concept (idempotent refresh)
        session.run("""
            MATCH (c:Concept {name: $name})-[r:INCLUDES]->()
            DELETE r
        """, {"name": a["name"]})
        # connect anchor to all members of the class via IS_A
        session.run("""
            MATCH (c:Concept {name: $name})
            MATCH (cls:Resource {rdfs_label: $class_label})
            MATCH (n:Resource)-[:IS_A]->(cls)
            MERGE (c)-[:INCLUDES]->(n)
        """, {"name": a["name"], "class_label": a["class_label"]})

print("Anchors created/updated and INCLUDES relationships established.")

Anchors created/updated and INCLUDES relationships established.


In [13]:
# Step 5.3 (part 1): Count included members per anchor
with driver.session() as session:
    data = session.run("""
        MATCH (c:Concept)-[:INCLUDES]->(n:Resource)
        RETURN c.name AS anchor, count(n) AS members
        ORDER BY anchor
    """).data()
data

[{'anchor': 'All Bonds', 'members': 1}, {'anchor': 'All Stocks', 'members': 2}]

In [14]:
# Step 5.3 (part 2): Peek at a few Stock members
with driver.session() as session:
    data = session.run("""
        MATCH (:Concept {name:'All Stocks'})-[:INCLUDES]->(n:Resource)
        RETURN n.rdfs_label AS label, n.hasTicker AS ticker
        ORDER BY label
        LIMIT 10
    """).data()
data

[{'label': 'AAPL', 'ticker': 'AAPL'}, {'label': 'MSFT', 'ticker': 'MSFT'}]

In [15]:
# Step 6.1: Build enriched hybrid text (with multi-hop)
def fetch_hybrid_text_for_class(class_label: str):
    """
    Returns a DataFrame with: id, class_label, hybridText
    for all :Resource nodes that are IS_A the given class_label,
    enriched with multi-hop info.
    """
    cypher = """
    MATCH (n:Resource)-[:IS_A]->(cls:Resource {rdfs_label: $class_label})
    OPTIONAL MATCH (n)-[:ISSUED_BY]->(org:Resource)
    OPTIONAL MATCH (n)-[:IS_REGULATED_BY]->(reg:Resource)
    OPTIONAL MATCH (org)-[:IS_REGULATED_BY]->(orgreg:Resource)
    WITH n, cls, org, reg, orgreg
    RETURN
      n.id AS id,
      cls.rdfs_label AS class_label,
      (
        coalesce(cls.rdfs_label, '') + ' ' +
        coalesce(n.rdfs_label, '') +
        CASE WHEN n.hasTicker IS NOT NULL THEN ' [' + n.hasTicker + ']' ELSE '' END +
        CASE WHEN org.rdfs_label IS NOT NULL THEN ' issued by ' + org.rdfs_label ELSE '' END +
        CASE WHEN reg.rdfs_label IS NOT NULL THEN ' regulated by ' + reg.rdfs_label ELSE '' END +
        CASE WHEN n.comment    IS NOT NULL THEN ' — ' + n.comment ELSE '' END +
        CASE WHEN orgreg.rdfs_label IS NOT NULL THEN '. Issuer regulated by ' + orgreg.rdfs_label ELSE '' END
      ) AS hybridText
    ORDER BY n.rdfs_label
    """
    with driver.session() as session:
        rows = session.run(cypher, {"class_label": class_label}).data()
    return pd.DataFrame(rows)

# Example: build for Stocks and Bonds
stocks_df = fetch_hybrid_text_for_class("Stock")
bonds_df  = fetch_hybrid_text_for_class("Bond")

display(stocks_df.head())
display(bonds_df.head())

Unnamed: 0,id,class_label,hybridText
0,http://www.semanticweb.org/keithbourne/ontolog...,Stock,Stock AAPL [AAPL] issued by http://www.semanti...
1,http://www.semanticweb.org/keithbourne/ontolog...,Stock,Stock MSFT [MSFT] issued by http://www.semanti...


Unnamed: 0,id,class_label,hybridText
0,http://www.semanticweb.org/keithbourne/ontolog...,Bond,Bond USTB [USTB] issued by Victory Capital — NaN


In [16]:
# Step 6.2: Build enriched hybrid text (with multi-hop)
orgs_df  = fetch_hybrid_text_for_class("Organization")
regs_df  = fetch_hybrid_text_for_class("RegulatoryAuthority")

hybrid_df = pd.concat([stocks_df, bonds_df, orgs_df, regs_df], ignore_index=True)
print(f"Total hybrid text entries: {len(hybrid_df)}")
hybrid_df.head(10)

Total hybrid text entries: 3


Unnamed: 0,id,class_label,hybridText
0,http://www.semanticweb.org/keithbourne/ontolog...,Stock,Stock AAPL [AAPL] issued by http://www.semanti...
1,http://www.semanticweb.org/keithbourne/ontolog...,Stock,Stock MSFT [MSFT] issued by http://www.semanti...
2,http://www.semanticweb.org/keithbourne/ontolog...,Bond,Bond USTB [USTB] issued by Victory Capital — NaN


In [17]:
# Step 6.3: Export for embedding service
out_path = "hybrid_embeddings_input.csv"
hybrid_df.to_csv(out_path, index=False)
print(f"Wrote {len(hybrid_df)} rows to {out_path}")

Wrote 3 rows to hybrid_embeddings_input.csv


In [18]:
# Step 6.4: embeddings + vectorstore (one place only) ---
# local, no-API embedder that returns normalized vectors
class STEmbeddings(Embeddings):
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.st = SentenceTransformer(model_name)
    def embed_documents(self, texts):
        return self.st.encode(texts, convert_to_numpy=True, normalize_embeddings=True).tolist()
    def embed_query(self, text):
        return self.st.encode([text], convert_to_numpy=True, normalize_embeddings=True)[0].tolist()

embedder = STEmbeddings()

# Build documents from your hybrid_df (from Step 6.1 / 6.2)
docs = [
    Document(
        page_content=row.hybridText,
        metadata={"id": row.id, "class_label": row.class_label}
    )
    for row in hybrid_df.itertuples(index=False)
]

# Let LangChain build and hold FAISS; no direct `import faiss`
vectorstore = FAISS.from_documents(docs, embedder)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [19]:
# Step 6.5 (updated): semantic search helper + graph expansion using LangChain FAISS
def search_hybrid(query: str, top_k: int = 5) -> pd.DataFrame:
    """
    Use the LangChain FAISS vectorstore to search the hybrid text.
    Returns a DataFrame with: id, class_label, hybridText, score
    (score is the distance from FAISS; lower is better)
    """
    results = vectorstore.similarity_search_with_score(query, k=top_k)
    rows = []
    for doc, score in results:
        rows.append({
            "id": doc.metadata.get("id"),
            "class_label": doc.metadata.get("class_label"),
            "hybridText": doc.page_content,
            "score": score
        })
    return pd.DataFrame(rows)

def expand_in_graph(ids, depth: int = 2):
    """
    Given a list of Neo4j node IDs, fetch a small neighborhood.
    (Uses the 'driver' from earlier steps.)
    """
    with driver.session() as session:
        data = session.run(f"""
            MATCH (n:Resource)
            WHERE n.id IN $ids
            OPTIONAL MATCH p=(n)-[*1..{depth}]-(m:Resource)
            WITH n, collect(distinct m) AS nbrs
            RETURN n.id AS id,
                   n.rdfs_label AS label,
                   n.hasTicker AS ticker,
                   n.comment AS comment,
                   [x IN nbrs | coalesce(x.rdfs_label, x.id)] AS neighbors
            ORDER BY label
        """, {"ids": ids}).data()
    return data

In [20]:
# Step 6.6: Example natural-language queries to test:
queries = [
    "equities regulated by the SEC",
    "Apple stock",
    "government bonds",
]

for q in queries:
    print(f"\nQuery: {q}")
    hits = search_hybrid(q, top_k=5)
    display(hits[["class_label","id","hybridText","score"]].head(5))
    # Expand top 3 in the graph
    top_ids = hits["id"].head(3).tolist()
    ctx = expand_in_graph(top_ids, depth=2)
    print("Graph context (top 3):")
    for row in ctx:
        print(f"- {row['label']} [{row.get('ticker')}] :: neighbors={row['neighbors'][:6]}")


Query: equities regulated by the SEC


Unnamed: 0,class_label,id,hybridText,score
0,Stock,http://www.semanticweb.org/keithbourne/ontolog...,Stock MSFT [MSFT] issued by http://www.semanti...,1.010578
1,Stock,http://www.semanticweb.org/keithbourne/ontolog...,Stock AAPL [AAPL] issued by http://www.semanti...,1.057379
2,Bond,http://www.semanticweb.org/keithbourne/ontolog...,Bond USTB [USTB] issued by Victory Capital — NaN,1.634952


Graph context (top 3):
- AAPL [AAPL] :: neighbors=['http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Apple_Inc', 'Stock', 'MSFT', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority']
- MSFT [MSFT] :: neighbors=['http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Microsoft_Corp', 'Stock', 'AAPL', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority']
- USTB [USTB] :: neighbors=['Victory Capital', 'Bond', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization']

Query: Apple stoc

Unnamed: 0,class_label,id,hybridText,score
0,Stock,http://www.semanticweb.org/keithbourne/ontolog...,Stock AAPL [AAPL] issued by http://www.semanti...,0.890892
1,Stock,http://www.semanticweb.org/keithbourne/ontolog...,Stock MSFT [MSFT] issued by http://www.semanti...,1.267464
2,Bond,http://www.semanticweb.org/keithbourne/ontolog...,Bond USTB [USTB] issued by Victory Capital — NaN,1.641043


Graph context (top 3):
- AAPL [AAPL] :: neighbors=['http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Apple_Inc', 'Stock', 'MSFT', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority']
- MSFT [MSFT] :: neighbors=['http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Microsoft_Corp', 'Stock', 'AAPL', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority']
- USTB [USTB] :: neighbors=['Victory Capital', 'Bond', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization']

Query: government

Unnamed: 0,class_label,id,hybridText,score
0,Bond,http://www.semanticweb.org/keithbourne/ontolog...,Bond USTB [USTB] issued by Victory Capital — NaN,1.019285
1,Stock,http://www.semanticweb.org/keithbourne/ontolog...,Stock AAPL [AAPL] issued by http://www.semanti...,1.5522
2,Stock,http://www.semanticweb.org/keithbourne/ontolog...,Stock MSFT [MSFT] issued by http://www.semanti...,1.566432


Graph context (top 3):
- AAPL [AAPL] :: neighbors=['http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Apple_Inc', 'Stock', 'MSFT', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority']
- MSFT [MSFT] :: neighbors=['http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Microsoft_Corp', 'Stock', 'AAPL', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority']
- USTB [USTB] :: neighbors=['Victory Capital', 'Bond', 'http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization']


In [None]:
# Step 7.1: Helper: vector search + graph expansion (LangChain FAISS)

import pandas as pd
from typing import List, Dict, Any

def vector_search(query: str, top_k: int = 5) -> pd.DataFrame:
    """
    Semantic search over the LangChain FAISS vectorstore built in Step 6.
    Returns a DataFrame with columns: id, class_label, hybridText, score
    """
    results = vectorstore.similarity_search_with_score(query, k=top_k)
    rows = []
    for doc, score in results:
        rows.append({
            "id": doc.metadata.get("id"),
            "class_label": doc.metadata.get("class_label"),
            "hybridText": doc.page_content,
            "score": float(score),
        })
    return pd.DataFrame(rows)

def graph_expand(ids: List[str], depth: int = 2) -> List[Dict[str, Any]]:
    """
    For each node id, collect a compact neighborhood and key properties.
    """
    with driver.session() as session:
        data = session.run(f"""
            MATCH (n:Resource)
            WHERE n.id IN $ids
            OPTIONAL MATCH p=(n)-[*1..{depth}]-(m:Resource)
            WITH n, collect(distinct m) AS nbrs
            RETURN n.id AS id,
                   n.rdfs_label AS label,
                   n.hasTicker AS ticker,
                   n.comment AS comment,
                   [x IN nbrs | coalesce(x.rdfs_label, x.id)] AS neighbors
            ORDER BY label
        """, {"ids": ids}).data()
    return data

def build_llm_context(query: str, top_k: int = 5, depth: int = 2, max_neighbors: int = 8) -> Dict[str, Any]:
    """
    End-to-end: vector search -> graph expand -> promptable context block.
    Returns dict with 'context', 'hits', and 'citations'.
    """
    hits = vector_search(query, top_k=top_k)

    # Expand top hits in the graph
    ids = hits["id"].tolist()
    expanded = graph_expand(ids, depth=depth)
    exp_by_id = {row["id"]: row for row in expanded}

    # Build a readable context block with lightweight citations
    lines, citations = [], []
    for i, row in enumerate(hits.to_dict(orient="records"), start=1):
        rid = row["id"]
        meta = exp_by_id.get(rid, {})
        label = meta.get("label") or rid
        ticker = meta.get("ticker")
        comment = meta.get("comment")
        neighbors = (meta.get("neighbors") or [])[:max_neighbors]
        hybrid_text = row.get("hybridText", "").strip()

        cite = f"[E{i}]"
        citations.append({"key": cite, "id": rid, "label": label})

        entry = f"{cite} {hybrid_text}"
        if ticker:
            entry += f" (ticker: {ticker})"
        if comment:
            entry += f"\n    note: {comment}"
        if neighbors:
            entry += "\n    related: " + ", ".join(neighbors)
        lines.append(entry)

    context_header = f"Query: {query}\nTop {len(lines)} relevant entities and facts:\n"
    context_body = "\n\n".join(lines)
    context = context_header + context_body

    return {"context": context, "hits": hits, "citations": citations}


In [None]:
# Step 7.2: Example usage (show output)
query = "equities regulated by the SEC"
bundle = build_llm_context(query, top_k=5, depth=2, max_neighbors=8)

print(textwrap.dedent(bundle["context"]))
print("\nCitations:")
for c in bundle["citations"]:
    print(f"  {c['key']} -> {c['label']} ({c['id']})")


Query: equities regulated by the SEC
Top 3 relevant entities and facts:
[E1] Stock MSFT [MSFT] issued by http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Microsoft_Corp regulated by http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC — NaN (ticker: MSFT)
    note: nan
    related: http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC, http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Microsoft_Corp, Stock, AAPL, http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Organization, http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/RegulatoryAuthority

[E2] Stock AAPL [AAPL] issued by http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/Apple_Inc regulated by http://www.semanticweb.org/keithbourne/ontologies/2025/7/FinancialOntology/SEC — NaN (ticker: AAPL)
    note: nan
    related: http://www.semanticweb.org/keithbourne

In [23]:
# Step 7.3: Return a prompt template
chat_prompt = ChatPromptTemplate.from_template(
    "You are a precise assistant. Use only the context below. "
    "Cite entities by their bracketed keys (e.g., [E1]).\n\n"
    "Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer (with citations):"
)

In [24]:
# Step 8.2: Demo queries
def make_context(question: str):
    bundle = build_llm_context(question, top_k=5, depth=2, max_neighbors=8)
    return {"context": bundle["context"], "question": question}

rag_chain = RunnableLambda(make_context) | chat_prompt | llm | StrOutputParser()

user_queries = [
   "Is AAPL a stock or bond?",
   "What type of instrument is USTB?",
   "Which authority regulates MSFT?",
   "Which equities are regulated by the SEC, and who issues them?",
   "What stocks do you know?  What bonds?"
]

for i, q in enumerate(user_queries, start=1):
    print(f"\n=== Query {i}: {q}")
    try:
        ans = rag_chain.invoke(q)
        print(ans)
    except Exception as e:
        print(f"[error] {e}")


=== Query 1: Is AAPL a stock or bond?
AAPL is a stock, as indicated by the entity [E1] which describes Stock AAPL [AAPL] issued by Apple Inc.

=== Query 2: What type of instrument is USTB?
USTB is a bond issued by Victory Capital [E1].

=== Query 3: Which authority regulates MSFT?
The authority that regulates MSFT is the SEC [E1].

=== Query 4: Which equities are regulated by the SEC, and who issues them?
The equities regulated by the SEC include:

1. Stock MSFT [MSFT] issued by Microsoft Corp [E1].
2. Stock AAPL [AAPL] issued by Apple Inc [E2].

=== Query 5: What stocks do you know?  What bonds?
I know the following stocks and bonds:

**Stocks:**
1. Stock AAPL [AAPL] issued by [E1] Apple Inc, regulated by [E1] SEC.
2. Stock MSFT [MSFT] issued by [E3] Microsoft Corp, regulated by [E3] SEC.

**Bonds:**
1. Bond USTB [USTB] issued by Victory Capital [E2].
