In [1]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import openai
import json
from neo4j import GraphDatabase
import glob
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [40]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
neo4j_url = os.getenv("NEO4J_URL")
neo4j_user = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))
gds.verify_connectivity()

**Constructing knowledge graph**

In [10]:
def gpt_call(text):
    extraction_template = """
    From the weekly report below, extract (as many as possible) relationships between environmental events in Thai by following these steps:
    0. Check the top of the document for the date of the report, as you might need it
    1. Identify environmental events that are associated with other resulting events and generate as comma-separated format:
        id:string, name:string, date:string, place:string 
        # 'id' must be unique among events; include the date from the 0. step to 'id'
        # 'name' of the event should be a comprehensive description of the event
        # 'name' of the event should not include causation or date
        # if the event is not associated with numerical date, then assign the date from the 0. step

    2. Generate each relationship as triples of head, relationship, and tail. To refer the head and tail entity, use their respective 'id' property.
    Only generate casuation relationships.
    Follow the format: event|CAUSES|event

    3. The output should look like:  # put a set of curly brackets around the final output
    "events": ["id":string, "name":string, "date":string, "place":string], # for each event, put a set of curly brackets around
    "relationships": ["eventid|CAUSES|eventid"] 

    Weekly report:
    {report}
    """

    extraction_prompt = PromptTemplate.from_template(extraction_template) 
    llm = ChatOpenAI(model="gpt-4o",
                     max_tokens=15000)
    prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful environmental, water expert who extracts information from reports."),
        HumanMessagePromptTemplate.from_template("{report}")
    ]
)

    chain = {"report": RunnablePassthrough() } | extraction_prompt | prompt | llm | StrOutputParser()
    return chain.invoke(text)

def extract_entities_relationships():
    files = glob.glob("data/*.md")
    
    result = []
    for i, file in enumerate(files):
        print(f"Processing file {i + 1}")
        try:
            with open(file, "r") as f:
                report = f.read()
                res = gpt_call(report)

                clean = res.replace('```json', '').replace('```', '').strip()
                j = json.loads(clean)

                result.append(j)
        except Exception as e:
            print(f"Error processing file {i + 1}: {str(e)}")
    return result

In [33]:
def generate_cypher(json_obj):
    label = "Event"
    event_statements = []
    r_statements = []
    label_map = {}

    for i, obj in enumerate(json_obj):
        for event in obj["events"]:
            id = event["id"]
            id = id.replace("_", "").replace("-", "")
            properties = {k:v for k,v in event.items() if k not in ["id"]}

            cypher = f'MERGE (n:{label} {{id: "{id}"}})'
            if properties:
                props_str = ", ".join(
                    [f'n.{key} = "{val}"' for key, val in properties.items()]
                )
                cypher += f" ON CREATE SET {props_str}"
            event_statements.append(cypher)
            label_map[id] = label
        
        for rs in obj["relationships"]:
            src_id, rs_type, tgt_id = rs.split("|")
            src_id = src_id.replace("-", "").replace("_", "")
            tgt_id = tgt_id.replace("-", "").replace("_", "")

            src_label = label_map[src_id]
            tgt_label = label_map[tgt_id]

            cypher = f'MERGE (a:{src_label} {{id: "{src_id}"}}) MERGE (b:{tgt_label} {{id: "{tgt_id}"}}) MERGE (a)-[:{rs_type}]->(b)'
            r_statements.append(cypher)
        
    with open("cyphers.txt", "w") as outfile:
            outfile.write("\n".join(event_statements + r_statements))

    return event_statements + r_statements

In [38]:
def ingestion_pipeline():
    with open("test.json", "r", encoding="utf-8") as f:
        data = f.read()
        obj = json.loads(data)
    
    cypher = generate_cypher(obj)
    for i, st in enumerate(cypher):
        print(f"Executing cypher statement {i + 1}")
        try:
            gds.execute_query(st)
        except Exception as e:
            print(f"Error executing cypher statement {i + 1}: {str(e)}")

In [67]:
# results = extract_entities_relationships()
# with open("test.json", "w", encoding="utf-8") as f:
#     json.dump(results, f, ensure_ascii=False, indent=4)


# run to persist to neo4j
# ingestion_pipeline()

**GraphRag**

In [9]:
import os
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings

In [94]:
from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(
    url=neo4j_url,
    username=neo4j_user,
    password=neo4j_password)

In [29]:
# vector index search
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    url=neo4j_url,
    username=neo4j_user,
    password=neo4j_password,
    index_name="id",
    node_label="Event",
    text_node_properties=["date", "name", "place"],
    embedding_node_property="embedding"
)

In [33]:
d = vector_index.similarity_search("พายุฝน")[0]

In [120]:
from langchain.chains import GraphCypherQAChain
from langchain.agents import initialize_agent, AgentType, Tool

def get_cypher_chain():
    graph.refresh_schema()

    cypher_chain = GraphCypherQAChain.from_llm(
    cypher_llm = ChatOpenAI(model="gpt-4"),
    qa_llm = ChatOpenAI(model="gpt-4"),
    graph=graph,
    verbose=True

    )
    return cypher_chain
cypher_chain = get_cypher_chain()
tools = [
    Tool(
        name="Causation",
        func= cypher_chain.run,
        description="Useful for finding consequences of an event.",
        
    )
]

llm = ChatOpenAI(model_name="gpt-4o")
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

agent.run("พายุฤดูร้อนก่อให้เกิดอะไร")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo determine the consequences of a summer storm, I will use the Causation tool to find out what typically results from such an event.

Action: Causation
Action Input: พายุฤดูร้อนก่อให้เกิดอะไร[0m

[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (e:Event {name: 'พายุฤดูร้อน'})-[:CAUSES]->(result:Event) RETURN result.name[0m
Full Context:
[32;1m[1;3m[{'result.name': 'ฝนฟ้าคะนอง'}, {'result.name': 'ลมกระโชกแรง'}, {'result.name': 'น้ำท่วมฉับพลัน'}, {'result.name': 'น้ำป่าไหลหลาก'}][0m

[1m> Finished chain.[0m

Observation: [36;1m[1;3mพายุฤดูร้อนสามารถก่อให้เกิดฝนฟ้าคะนอง, ลมกระโชกแรง, น้ำท่วมฉับพลัน และน้ำป่าไหลหลาก.[0m
Thought:[32;1m[1;3mI now know the final answer.

Final Answer: พายุฤดูร้อนสามารถก่อให้เกิดฝนฟ้าคะนอง, ลมกระโชกแรง, น้ำท่วมฉับพลัน และน้ำป่าไหลหลาก.[0m

[1m> Finished chain.[0m


'พายุฤดูร้อนสามารถก่อให้เกิดฝนฟ้าคะนอง, ลมกระโชกแรง, น้ำท่วมฉับพลัน และน้ำป่าไหลหลาก.'