In [1]:
import os, re
from dotenv import load_dotenv
from openai import OpenAI
from langchain_neo4j import Neo4jGraph

load_dotenv()

NEO4J_URI      = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")     # e.g., http://hanoi2.ucd.ie/v1  (or None for OpenAI)
OPENAI_API_KEY  = os.environ["OPENAI_API_KEY"]
EMBED_MODEL     = os.getenv("EMBED_MODEL", "text-embedding-3-small")
CHAT_MODEL      = os.getenv("CHAT_MODEL", "gpt-4o-mini")

kg = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)
client = OpenAI(base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)

VEC_INDEX = "searchable_embedding_idx"
CH_TAG    = "Ch1"   # used when embedding Chapter 1 below


In [None]:
def probe_dim() -> int:
    v = client.embeddings.create(model=EMBED_MODEL, input="dimension probe").data[0].embedding
    return len(v)

def ensure_vector_index(dim: int):
    kg.query(f"""
    CREATE VECTOR INDEX {VEC_INDEX} IF NOT EXISTS
    FOR (n:Searchable) ON (n.embedding)
    OPTIONS {{
      indexConfig: {{ `vector.dimensions`: {dim}, `vector.similarity_function`: 'cosine' }}
    }}
    """)

dim = probe_dim()
ensure_vector_index(dim)
print("Vector index ensured. dim =", dim)


In [3]:
cypher_ch1 = """
MERGE (ch1:Chapter:Searchable {entity_id:'Ch1', entity_type:'chapter'})
SET ch1.name='Technical Guidelines for High-Quality and Low-Emission Rice Production — Chapter 1: Purpose & Scope',
    ch1.chapter='Ch1',
    ch1.file_path='book_1.pdf',
    ch1.source_id='chunk-ch1-001',
    ch1.summary='Purpose and scope of the handbook under the One Million Hectares Project (Decision 1490/QĐ-TTg, 27 Nov 2023).',
    ch1.description='Chapter 1 frames the handbook within the national One Million Hectares Project for high-quality, low-emission rice in the Mekong Delta. It consolidates technical standards across cultivation, harvesting, post-harvest handling, and circular straw management, with support from IRRI and based on Vietnam 2022/2023 guidelines, mechanization protocols, research, field-tested practices, and expert inputs. It clarifies target users (farmers, cooperatives, farm operators, agribusiness) and the main application area (specialized high-quality rice zones in the Mekong Delta), with adoption elsewhere where appropriate.',
    ch1.keywords=['One Million Hectares','Mekong Delta','Green Growth','Low emission','IRRI','Technical guidelines','Purpose','Scope'],
    ch1.search_text=ch1.name+' | '+ch1.summary+' | '+ch1.description+' | One Million Hectares | Mekong Delta | green growth | low emission | cultivation | harvesting | post-harvest | straw management'
WITH ch1
MERGE (proj:Project:Searchable {entity_id:'One Million Hectares Project', entity_type:'project'})
SET proj.name='One Million Hectares Project (High-Quality, Low-Emission Rice, Mekong Delta, by 2030)',
    proj.chapter='Ch1',
    proj.decision='1490/QĐ-TTg',
    proj.decision_date=date('2023-11-27'),
    proj.summary='National project to develop one million hectares of high-quality, low-emission rice by 2030.',
    proj.description='Approved by the Prime Minister under Decision 1490/QĐ-TTg (27 Nov 2023). Aligns rice production with green growth via standards for water (AWD), mechanized sowing, balanced fertilization, IPM, and circular straw management.',
    proj.keywords=['1M ha','Green growth','Low emission','Decision 1490/QĐ-TTg','2030','Mekong Delta'],
    proj.search_text=proj.name+' | '+proj.summary+' | '+proj.description+' | decision 1490 | 2030 | low emission | Mekong Delta'
WITH ch1, proj
MERGE (gov:Organization:Searchable {entity_id:'Vietnam Government', entity_type:'organization', name:'Vietnam Government'})
SET gov.chapter='Ch1',
    gov.summary='Approving authority for the One Million Hectares Project.',
    gov.description='Central government authority that approved Decision 1490/QĐ-TTg enabling the national project.',
    gov.search_text=gov.name+' | approving authority | Decision 1490 | One Million Hectares Project'
WITH ch1, proj, gov
MERGE (irri:Organization:Searchable {entity_id:'International Rice Research Institute', entity_type:'organization', name:'International Rice Research Institute'})
SET irri.chapter='Ch1',
    irri.summary='Technical supporter of the handbook.',
    irri.description='IRRI provided technical support on cultivation standards, water-saving (AWD), mechanized sowing, and low-emission practices for the Mekong Delta.',
    irri.search_text=irri.name+' | IRRI | technical support | AWD | mechanization | low emission'
WITH ch1, proj, gov, irri
MERGE (dept:Organization:Searchable {entity_id:'Department of Crop Production', entity_type:'organization', name:'Department of Crop Production'})
SET dept.chapter='Ch1',
    dept.summary='Issued the technical handbook.',
    dept.description='The Department of Crop Production issued the Technical Guidelines to operationalize the One Million Hectares Project.',
    dept.search_text=dept.name+' | handbook issuer | technical guidelines | One Million Hectares'
WITH ch1, proj, gov, irri, dept
MERGE (proj)-[:APPROVED_BY {description:'Approved under Decision 1490/QĐ-TTg on 2023-11-27'}]->(gov)
MERGE (dept)-[:ISSUED]->(ch1)
MERGE (irri)-[:SUPPORTED]->(ch1)
MERGE (ch1)-[:FOCUSES_ON]->(proj)
WITH ch1
UNWIND [
  {id:'Individual Farmers',        desc:'Smallholder rice growers who adopt field practices.'},
  {id:'Agricultural Cooperatives', desc:'Groups coordinating inputs, machinery, aggregation, and training.'},
  {id:'Farm Operators',            desc:'Professional operators managing larger areas or providing services.'},
  {id:'Agribusiness Enterprises',  desc:'Commercial value-chain actors in mechanization, logistics, processing, markets.'}
] AS stakeholder
MERGE (s:StakeholderGroup:Searchable {entity_id:stakeholder.id, entity_type:'stakeholder', name:stakeholder.id})
SET s.chapter='Ch1',
    s.description=stakeholder.desc,
    s.search_text=s.name+' | '+s.description+' | stakeholder | users | adoption'
MERGE (ch1)-[:TARGETS]->(s)
WITH ch1
MERGE (region:Region:Searchable {entity_id:'Mekong Delta', name:'Mekong Delta'})
SET region.chapter='Ch1',
    region.description='Primary application area: specialized high-quality rice zones in the Mekong Delta. Adoption elsewhere where appropriate.',
    region.search_text=region.name+' | specialized high-quality rice zones | Mekong Delta | application area'
WITH ch1, region
UNWIND ['Cultivation Techniques','Harvesting and Post-Harvest Management','Straw Management'] AS domain
MERGE (d:Practice:Searchable {entity_id:domain, entity_type:'domain', name:domain})
SET d.chapter='Ch1',
    d.description='Technical domain addressed in Chapter 1.',
    d.search_text=d.name+' | technical domain | Chapter 1 | guidelines'
MERGE (ch1)-[:COVERS]->(d)
WITH ch1
MERGE (land:Practice:Searchable {entity_id:'Land Preparation', name:'Land Preparation'})
SET land.chapter='Ch1',
    land.description='Mechanized (e.g., laser) leveling and field preparation tailored to cropping systems; supports uniform germination, AWD readiness, and mechanized sowing.',
    land.search_text=land.name+' | mechanized leveling | laser leveling | AWD readiness | uniform germination | cropping system'
WITH ch1, land
MERGE (water:Practice:Searchable {entity_id:'Water Management', name:'Water Management'})
SET water.chapter='Ch1',
    water.description='Alternate Wetting and Drying (AWD) with decision support (water tubes/gauges). Limit continuous water stagnation to fewer than 30 days post-sowing to cut methane and save water.',
    water.search_text=water.name+' | AWD | Alternate Wetting and Drying | methane reduction | water saving | <30 days stagnation | tubes | gauges'
WITH ch1, land, water
MERGE (sowing:Practice:Searchable {entity_id:'Sowing Techniques', name:'Sowing Techniques'})
SET sowing.chapter='Ch1',
    sowing.description='Mechanized broadcast or cluster sowing integrated with fertilizer deep placement/side-banding to improve nutrient use efficiency and reduce emissions.',
    sowing.search_text=sowing.name+' | mechanized broadcast | cluster sowing | fertilizer placement | deep placement | side-banding | NUE'
WITH ch1, land, water, sowing
MERGE (fert:Practice:Searchable {entity_id:'Fertilization', name:'Fertilization'})
SET fert.chapter='Ch1',
    fert.description='Balanced, season-aligned NPK and supplements; emphasize organic and environmentally friendly fertilizers to improve soil health and reduce emissions.',
    fert.search_text=fert.name+' | balanced NPK | organic fertilizers | soil health | low emission | season-aligned'
WITH ch1, land, water, sowing, fert
MERGE (ipm:Practice:Searchable {entity_id:'Integrated Pest Management', name:'Integrated Pest Management'})
SET ipm.chapter='Ch1',
    ipm.description='Follow the 4 Rights: right time, right product, right dosage, right method. Encourage biological agents; minimize toxic synthetic pesticides; aim for zero pesticide residues at harvest.',
    ipm.search_text=ipm.name+' | IPM | 4 Rights | biological control | zero residue | minimize toxic pesticides'
WITH ch1, land, water, sowing, fert, ipm
MERGE (harvest:Practice:Searchable {entity_id:'Harvesting and Post-Harvest Handling', name:'Harvesting and Post-Harvest Handling'})
SET harvest.chapter='Ch1',
    harvest.description='Harvest at optimal maturity using combine harvesters. Use efficient drying, storage, and transport to reduce energy use and post-harvest losses.',
    harvest.search_text=harvest.name+' | combine harvester | drying | storage | transport | loss reduction | energy efficiency'
WITH ch1, land, water, sowing, fert, ipm, harvest
MERGE (straw:Practice:Searchable {entity_id:'Straw Management', name:'Straw Management'})
SET straw.chapter='Ch1',
    straw.description='Shred and incorporate straw with Trichoderma-based composting agents when feasible. Reuse collected straw for mushroom cultivation, cattle feed, organic fertilizer, and other high-value products; avoid burning or burying raw straw.',
    straw.search_text=straw.name+' | straw incorporation | Trichoderma | mushroom cultivation | cattle feed | organic fertilizer | circular model | avoid burning'
WITH ch1, water, straw
MERGE (awd:Acronym:Searchable {entity_id:'AWD', name:'Alternate Wetting and Drying'})
SET awd.chapter='Ch1',
    awd.description='Water-saving irrigation technique that reduces methane emissions by allowing non-flooded periods.',
    awd.search_text=awd.name+' | AWD | water-saving | methane reduction | irrigation'
WITH ch1, water, straw, awd
MERGE (mach:Concept:Searchable {entity_id:'Mechanization', name:'Mechanization'})
SET mach.chapter='Ch1',
    mach.description='Use of machinery in leveling, sowing, and harvesting to improve efficiency and reduce emissions.',
    mach.search_text=mach.name+' | leveling | sowing | harvesting | efficiency | low emission'
WITH ch1, water, straw, awd, mach
MERGE (bio:Input:Searchable {entity_id:'Biological Agents', name:'Biological Agents'})
SET bio.chapter='Ch1',
    bio.description='Eco-friendly pest control agents (e.g., fungi, bacteria, parasitoids) preferred in IPM.',
    bio.search_text=bio.name+' | biological control | eco-friendly | IPM'
WITH ch1, water, straw, awd, mach, bio
MERGE (tox:Chemical:Searchable {entity_id:'Toxic Synthetic Pesticides', name:'Toxic Synthetic Pesticides'})
SET tox.chapter='Ch1',
    tox.description='Hazardous pesticides minimized under IPM and avoided where possible.',
    tox.search_text=tox.name+' | avoid | hazardous | IPM'
WITH ch1, water, straw, awd, mach, bio, tox
MERGE (ghg:Concept:Searchable {entity_id:'Greenhouse Gas Emissions', name:'Greenhouse Gas Emissions'})
SET ghg.chapter='Ch1',
    ghg.description='Methane and other emissions from rice fields targeted for reduction through AWD and straw circularity.',
    ghg.search_text=ghg.name+' | methane | emission reduction | rice fields'
WITH ch1, water, straw, awd, mach, bio, tox, ghg
UNWIND [
  ['Mushroom Cultivation','Reuse straw as substrate for mushroom production.'],
  ['Cattle Feed','Use straw as livestock feed for ruminants.'],
  ['Organic Fertilizer','Compost straw into soil-enhancing organic fertilizer.'],
  ['Other High-Value Uses','Other economic and environmental value applications.']
] AS use
MERGE (u:UseCase:Searchable {entity_id:use[0], name:use[0]})
SET u.chapter='Ch1',
    u.description=use[1],
    u.search_text=u.name+' | straw reuse | circular economy'
WITH ch1, water, straw, awd, mach, bio, tox, ghg
MERGE (ch1)-[:COVERS]->(:Practice {entity_id:'Cultivation Techniques'})
MERGE (ch1)-[:COVERS]->(:Practice {entity_id:'Harvesting and Post-Harvest Management'})
MERGE (ch1)-[:COVERS]->(:Practice {entity_id:'Straw Management'})
MERGE (awd)-[:USED_IN {description:'Core method under water management for emission reduction'}]->(water)
MERGE (mach)-[:REDUCES]->(straw)
MERGE (mach)-[:REDUCES]->(water)
MERGE (mach)-[:USED_IN]->(land)
MERGE (mach)-[:USED_IN]->(harvest)
MERGE (ipm)-[:USES]->(bio)
MERGE (ipm)-[:AVOIDS]->(tox)
MERGE (water)-[:REDUCES]->(ghg)
MERGE (straw)-[:REDUCES]->(ghg)
MERGE (straw)-[:REUSED_AS]->(:UseCase {entity_id:'Mushroom Cultivation'})
MERGE (straw)-[:REUSED_AS]->(:UseCase {entity_id:'Cattle Feed'})
MERGE (straw)-[:REUSED_AS]->(:UseCase {entity_id:'Organic Fertilizer'})
MERGE (straw)-[:REUSED_AS]->(:UseCase {entity_id:'Other High-Value Uses'})
"""
kg.query(cypher_ch1)
print("Chapter 1 upserted.")


Chapter 1 upserted.


In [4]:
def fetch_to_embed_for_chapter(chapter_tag: str, limit: int = 256):
    return kg.query("""
        MATCH (n:Searchable)
        WHERE n.chapter = $chapter
          AND n.search_text IS NOT NULL
          AND (n.embedding IS NULL OR size(n.embedding) = 0)
        RETURN n.entity_id AS id, n.search_text AS text
        LIMIT $lim
    """, {"chapter": chapter_tag, "lim": limit})

def set_embeddings(rows):
    if not rows: return
    kg.query("""
        UNWIND $rows AS r
        MATCH (n:Searchable {entity_id: r.id})
        SET n.embedding = r.embedding
    """, {"rows": rows})

def embed_chapter(chapter_tag: str, batch: int = 256):
    count = 0
    while True:
        rows = fetch_to_embed_for_chapter(chapter_tag, batch)
        if not rows:
            break
        texts = [r["text"] for r in rows]
        embs  = client.embeddings.create(model=EMBED_MODEL, input=texts).data
        out   = [{"id": r["id"], "embedding": e.embedding} for r, e in zip(rows, embs)]
        set_embeddings(out)
        count += len(out)
    print(f"Embedded {count} nodes for {chapter_tag}")

embed_chapter(CH_TAG)


Embedded 22 nodes for Ch1


In [6]:
def embed_text(text: str):
    return client.embeddings.create(model=EMBED_MODEL, input=text).data[0].embedding

def expand_entities(ids: list[str], per_node_limit: int = 25):
    if not ids:
        return []
    return kg.query("""
        MATCH (n:Searchable) WHERE n.entity_id IN $ids
        MATCH (n)-[r]-(m)
        RETURN
           coalesce(n.entity_id, elementId(n))           AS src_id,
           coalesce(n.name, n.entity_id, labels(n)[0])   AS src_name,
           type(r)                                       AS rel_type,
           coalesce(r.description, r.search_text, '')    AS rel_desc,
           coalesce(m.entity_id, elementId(m))           AS dst_id,
           coalesce(m.name, m.entity_id, labels(m)[0])   AS dst_name,
           m.summary                                     AS dst_summary,
           m.description                                 AS dst_description
        LIMIT $lim
    """, {"ids": ids, "lim": per_node_limit * max(1, len(ids))})


In [7]:
# Cell 7 — robust fact builder + context packer

def to_fact(row):
    rel   = row.get("rel_type") or "RELATED_TO"
    rdesc = (row.get("rel_desc") or "").strip()
    src   = row.get("src_name") or row.get("src_id") or "Source"
    dst   = row.get("dst_name") or row.get("dst_id") or "Target"

    fact = f"{src} --{rel}--> {dst}"
    if rdesc:
        fact += f" :: {rdesc}"

    # Build citations, skip Nones, dedupe while preserving order
    cites = [row.get("src_id"), row.get("dst_id")]
    cites = [c for c in cites if c]
    seen = set()
    cites = [c for c in cites if not (c in seen or seen.add(c))]

    return {"text": fact, "cites": cites}

def build_context(hits, expansions, max_nodes=6, max_facts=20):
    # Node blurbs from top vector hits
    node_cards = []
    for h in hits[:max_nodes]:
        blurb = h.get("summary") or h.get("description") or ""
        if not blurb:
            continue
        node_cards.append({
            "id": h["id"],
            "name": h["name"],
            "blurb": blurb.strip()
        })

    # Clean expansion rows and build facts
    clean_rows = [r for r in expansions if (r.get("src_id") or r.get("dst_id"))]
    facts = [to_fact(r) for r in clean_rows][:max_facts]

    return {"nodes": node_cards, "facts": facts}


In [8]:
# Cell 8 — LLM answerer

def answer_with_llm(question: str, context: dict) -> str:
    fact_lines = [f"- {f['text']} [refs: {', '.join(f['cites'])}]" for f in context["facts"]]
    node_lines = [f"- ({n['id']}) {n['name']}: {n['blurb']}" for n in context["nodes"]]

    system = (
        "You are RiceAI Expert. Answer using ONLY the provided facts and notes.\n"
        "Cite using the provided entity_ids in square brackets like [Ch1] or [Straw Management].\n"
        "Be precise and concise."
    )
    user = (
        f"Question: {question}\n\n"
        f"Nodes:\n" + "\n".join(node_lines[:10]) + "\n\n"
        f"Facts:\n" + "\n".join(fact_lines[:25]) + "\n\n"
        "Write the answer with inline citations. If unsure, say so."
    )
    resp = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[{"role":"system","content":system},{"role":"user","content":user}],
        temperature=0.1
    )
    return resp.choices[0].message.content.strip()


In [12]:
# Cell 9 — zero-config vector search (compute similarity in Python)

import re

DEFAULT_LABEL_ALLOW = ["Practice","Concept","Acronym","UseCase","Project","Region","StakeholderGroup","Chapter"]

def _infer_chapter_tag_from_text(text: str) -> str | None:
    m = re.search(r"\bch(?:apter)?\s*([0-9]{1,2})\b", text, flags=re.IGNORECASE)
    return f"Ch{int(m.group(1))}" if m else None

def _score_to_similarity(score) -> float:
    try:
        s = float(score)
    except Exception:
        return 0.0
    # cosine distance -> similarity
    sim = 1.0 - s
    # clamp in Python (no greatest/least in Cypher)
    if sim < 0.0: sim = 0.0
    if sim > 1.0: sim = 1.0
    return sim

def vector_search_nodes_auto(question: str, top_k: int = 12):
    """
    Zero-config vector search:
      - infers chapter tag from the question if present
      - filters to useful labels by default
      - returns raw score (distance) and Python-computed similarity in [0,1]
      - fallback pass if chapter-filtered search is empty
      - de-duplicates by entity_id
    """
    qvec = embed_text(question)
    chapter = _infer_chapter_tag_from_text(question)

    def _run(pass_chapter: str | None):
        params = {
            "k": top_k,
            "qvec": qvec,
            "label_allow": DEFAULT_LABEL_ALLOW
        }
        where = [
            "node.search_text IS NOT NULL",
            "ANY(lbl IN labels(node) WHERE lbl IN $label_allow)"
        ]
        if pass_chapter:
            where.append("node.chapter = $chapter")
            params["chapter"] = pass_chapter

        where_clause = " AND ".join(where)

        cypher = f"""
        WITH $qvec AS qv
        CALL db.index.vector.queryNodes('{VEC_INDEX}', $k, qv)
        YIELD node, score
        WHERE {where_clause}
        RETURN labels(node) AS labels,
               node.entity_id AS id,
               coalesce(node.name, node.entity_id) AS name,
               node.summary AS summary,
               node.description AS description,
               node.chapter AS chapter,
               score
        ORDER BY score ASC
        """
        rows = kg.query(cypher, params)
        # add similarity in Python
        for r in rows:
            r["similarity"] = _score_to_similarity(r.get("score"))
        return rows

    hits = _run(chapter) or _run(None)

    # De-dup by entity_id (keep best)
    seen = set()
    dedup = []
    for h in hits:
        if h["id"] in seen:
            continue
        seen.add(h["id"])
        dedup.append(h)
    return dedup


In [13]:
# Cell 10 — single-call ask()

def ask(question: str, top_k: int = 12, expand: int = 5, per_node_limit: int = 25):
    """
    User-only input:
      1) vector search with auto label filter + chapter inference + fallback
      2) 1-hop expansion on top N nodes
      3) pack context + call LLM
    """
    hits = vector_search_nodes_auto(question, top_k=top_k)
    if not hits:
        return {
            "answer": "I couldn't find anything relevant in the knowledge graph for that query.",
            "hits": [],
            "facts": []
        }

    ids = [h["id"] for h in hits[:expand]]
    ex  = expand_entities(ids, per_node_limit=per_node_limit)
    context = build_context(hits, ex)
    answer  = answer_with_llm(question, context)
    return {"answer": answer, "hits": hits, "facts": context["facts"]}


In [14]:
# Cell 11 — run a query (no parameters besides the question)

res = ask("What does chapter 1 recommend for water and straw management?")
print(res["answer"])

print("\nTop hits:")
for h in res["hits"][:5]:
    print(f"- [{h['id']}] {h['name']} (sim={h['similarity']:.3f})")

print("\nFacts used:")
for f in res["facts"][:8]:
    print(f"- {f['text']}  [refs: {', '.join(f['cites'])}]")


Chapter 1 recommends the following for water and straw management:

1. **Water Management**: Implement Alternate Wetting and Drying (AWD) as a core method to reduce greenhouse gas emissions and save water. It is advised to limit continuous water stagnation to fewer than 30 days post-sowing to further cut methane emissions [Water Management][AWD].

2. **Straw Management**: Reuse straw by composting it into soil-enhancing organic fertilizer or utilizing it as a substrate for mushroom production [Organic Fertilizer][Mushroom Cultivation].

Top hits:
- [Sowing Techniques] Sowing Techniques (sim=0.299)
- [Water Management] Water Management (sim=0.295)
- [Land Preparation] Land Preparation (sim=0.292)
- [Mushroom Cultivation] Mushroom Cultivation (sim=0.292)
- [AWD] Alternate Wetting and Drying (sim=0.290)

Facts used:
- Water Management --USED_IN--> Alternate Wetting and Drying :: Core method under water management for emission reduction  [refs: Water Management, AWD]
- Water Management --R