In [1]:
from __future__ import annotations

import os
from typing import List, Optional, Dict, Type, Union

from dotenv import load_dotenv
from pydantic import BaseModel, Field, create_model, field_serializer
from neo4j import GraphDatabase

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [2]:
# Load variables from a .env file (python-dotenv) before reading the settings below.
load_dotenv()

# OpenAI settings; the model name falls back to "gpt-5-mini" when unset.
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5-mini")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Neo4j connection settings, with fallbacks pointing at a localhost instance.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4j")

STRUCTURED SCHEMA (Pydantic) — only allowed entities/edges appear

In [14]:
from entities import Entity_Collector

# Map each entity model's class name to the class itself, so a model can be
# looked up by its type name when the dynamic schema is built.
ENTITY_MODEL_REGISTRY: Dict[str, Type[BaseModel]] = dict(
    (model_cls.__name__, model_cls) for model_cls in Entity_Collector()
)

# Number of registered entity models.
len(ENTITY_MODEL_REGISTRY)

36

In [4]:
# ---------- 2) (Toy) prefilter that decides which types to include ----------
def prefilter_entity_types(
    all_types: List[str],
    allowed: Optional[List[str]] = None,
) -> List[str]:
    """
    Toy prefilter: keep only the entity type names that are allowed.

    Replace with your real selection logic when available.

    Args:
        all_types: Candidate entity type names.
        allowed: Type names to keep. When None, the original toy selection
            ("Protein", "Gene", "Drug") is used.

    Returns:
        The entries of `all_types` that are in the allowed set, in their
        original order (duplicates in `all_types` are kept).
    """
    # `is None` (not truthiness) so an explicit empty list means "allow nothing".
    allowed_set = {"Protein", "Gene", "Drug"} if allowed is None else set(allowed)
    return [t for t in all_types if t in allowed_set]

# ---------- 3) Dynamic Entities model factory ----------
def build_entities_model(selected_types: List[str]) -> Type[BaseModel]:
    """
    Build a Pydantic model class whose fields are exactly the selected entity types.

    For every name in `selected_types` the model gets a field called
    "<name>s", typed as an optional list of the matching class from
    ENTITY_MODEL_REGISTRY and defaulting to None. A name that is missing
    from the registry raises KeyError.
    """
    list_fields = {
        type_name + "s": (
            Optional[List[ENTITY_MODEL_REGISTRY[type_name]]],
            Field(default=None),
        )
        for type_name in selected_types
    }

    # The class docstring is passed along so it becomes part of the model.
    return create_model(
        "ExtractedEntities",
        __doc__="Holds the lists of extracted entities divided by type",
        **list_fields,
    )

In [5]:
# Smoke-test the functions above: registry -> prefilter -> dynamic model.
all_types = list(ENTITY_MODEL_REGISTRY)
selected_types = prefilter_entity_types(all_types)
EntitiesModel = build_entities_model(selected_types)

# The factory returns a model class, not an instance, so print its type.
print(type(EntitiesModel))
# Show the generated fields (one per selected type).
EntitiesModel.model_fields

<class 'pydantic._internal._model_construction.ModelMetaclass'>


{'Proteins': FieldInfo(annotation=Union[List[Protein], NoneType], required=False, default=None),
 'Genes': FieldInfo(annotation=Union[List[Gene], NoneType], required=False, default=None),
 'Drugs': FieldInfo(annotation=Union[List[Drug], NoneType], required=False, default=None)}

In [6]:
# Inspect the JSON schema generated for the dynamic model: the nested entity
# classes show up under "$defs", and their docstrings are included as descriptions.
EntitiesModel.model_json_schema()

{'$defs': {'Drug': {'description': 'A chemical or biologic agent used to treat, cure, prevent, or diagnose disease.\nExamples: "Imatinib", "Aspirin", "Trastuzumab".',
   'properties': {'name': {'description': 'Name of the drug as it appears in the text.',
     'title': 'Name',
     'type': 'string'}},
   'required': ['name'],
   'title': 'Drug',
   'type': 'object'},
  'Gene': {'description': 'A stretch of DNA (or locus) that encodes (or is associated with) a functional product, such as an RNA or protein.\nExamples: "BRCA1", "TP53", "EGFR gene".',
   'properties': {'name': {'description': 'Name of the gene as it appears in the text.',
     'title': 'Name',
     'type': 'string'}},
   'required': ['name'],
   'title': 'Gene',
   'type': 'object'},
  'Protein': {'description': 'A polypeptide chain (or set of chains) that folds into a functional 3D structure, often acting as an enzyme, receptor, structural molecule, transporter, etc.\nExamples: "Hemoglobin", "EGFR", "p53" (TP53 protein).'

SYSTEM PROMPT

In [6]:
# ---------- 4) Prompt builder that reflects the selected types ----------
def build_system_prompt(selected_types: List[str]) -> str:
    """
    Render the NER system prompt for the given entity types.

    The type names are joined with ", " and inserted into the
    "Return ONLY these entity types" line of the prompt text.
    """
    allowed_types = ", ".join(selected_types)
    system_prompt = f"""You are a state-of-the-art biological information extraction model that performs Named Entity Recognition (NER);
    
Return ONLY these entity types: {allowed_types}.

INSTRUCTIONS:
- Extract entities only if they match the allowed entity types.
- Use the exact surface form from the text for names (do not normalize or invent).
- Do not repeat/duplicate entities even if they appear multiple times in the text.
- If a field isn’t stated in the text, set it to None (do not guess).
- If nothing is found for a type, it should be None (so it can be dropped from the JSON).
- - Keep concise entities names, no need to specify their types in the name (e.g from "... protein pt42 ..." extract "pt42" as name not "protein pt42").

Output MUST conform to the provided schema you were given via the structured output tool.
"""
    return system_prompt


LLM CHAIN

In [7]:
# ---------- 5) Chain factory ----------
def build_chain(selected_types: List[str], OPENAI_MODEL: str = "gpt-5-mini"):
    """
    Assemble the extraction chain for the given entity types.

    Builds the dynamic entities model and the matching system prompt, then
    pipes a chat prompt (system message + user message carrying
    `input_text`) into a ChatOpenAI model set up for structured output
    with that entities model.

    NOTE: the `OPENAI_MODEL` parameter shadows the module-level constant of
    the same name; its default is the literal "gpt-5-mini", not the value
    read from the environment.

    Returns:
        A `(chain, entities_model)` tuple.
    """
    entities_model = build_entities_model(selected_types)
    system_message = build_system_prompt(selected_types)

    # `{input_text}` is the template variable filled in at invoke time.
    user_message = (
        "<Input Text Start>\n\n{input_text}\n\n<Input Text End>\n\n"
        "Return the structured extraction."
    )
    chat_prompt = ChatPromptTemplate.from_messages(
        [("system", system_message), ("user", user_message)]
    )

    chat_model = ChatOpenAI(
        model=OPENAI_MODEL,
        temperature=0,
        reasoning={"effort": "low"},
    )
    structured_model = chat_model.with_structured_output(entities_model)
    return chat_prompt | structured_model, entities_model

In [8]:
# ---------- 6) Public API ----------
ALL_TYPES = ["Protein", "Gene", "Disease", "Drug", "Cell"]


def extract(text: str, OPENAI_MODEL: str = "gpt-5-mini") -> Dict:
    """
    Run entity extraction on `text` and return the non-empty results as a dict.

    The entity types come from running the prefilter over ALL_TYPES; a chain
    is built for them and invoked with `text`. Fields left as None are
    excluded from the dump, and empty lists/dicts are removed afterwards.
    """
    selected_types = prefilter_entity_types(ALL_TYPES)  # <- your real prefilter goes here
    chain, _entities_model = build_chain(selected_types, OPENAI_MODEL)
    extracted = chain.invoke({"input_text": text})  # instance of the dynamic entities model

    # exclude_none drops every field that was left as None.
    dumped = extracted.model_dump(exclude_none=True)

    # Also drop empty containers, in case the LLM returned [] instead of None.
    cleaned = {}
    for key, value in dumped.items():
        if value in (None, [], {}):
            continue
        cleaned[key] = value
    return cleaned

In [9]:
# Sample passage mentioning genes, proteins and a drug. Adjacent string
# literals are concatenated as-is, so every fragment except the last ends
# with a space to keep the sentences separated in the text sent to the LLM.
sample_text = (
    "In Human cells, the TP53 gene is translated into the p53 protein. "
    "For years, we considered p53 as a major factor in causing cancer. "
    "But recent findings made by Harvard BioLab in 2020 proved that it was not the case, and p53 has no role in cancer. "
    "They found out that also gene 'CGN123' and protein p32 have causality implication for cancer, often by loss of tumor suppression. "
    "They can be inhibited by giving Imatinib to the patients twice a day."
)

print("== Extracting NER/RE with schema constraints ==")
extraction = extract(sample_text)
print(extraction)

== Extracting NER/RE with schema constraints ==
{'Proteins': [{'name': 'p53'}, {'name': 'p32'}], 'Genes': [{'name': 'TP53'}, {'name': 'CGN123'}], 'Drugs': [{'name': 'Imatinib'}]}
