In [102]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep

In [103]:
# Load variables from a local .env file into the process environment; the cells
# below read their API and database credentials with os.getenv.
load_dotenv()

True

In [104]:
# Azure OpenAI connection settings, taken from environment variables.
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
api_version = os.environ.get("OPENAI_API_VERSION")

# Name of the model deployment to talk to.
deployment_name = "gpt-4"

# Shared Azure OpenAI client used by the functions defined further down.
client = openai.AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=api_key,
)

In [105]:
# response = client.chat.completions.create(
#     model=deployment_name, 
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "I am going to Paris, what should I see?"},
#     ],
#     max_tokens=350,
#     temperature=1.0,
#     top_p=1.0
# )
# print(response.choices[0].message.content)

In [106]:
# Neo4j connection settings, taken from environment variables.
neo4j_uri = os.environ.get("NEO4J_URI")
neo4j_user = os.environ.get("NEO4J_USERNAME")
neo4j_password = os.environ.get("NEO4J_PASSWORD")

# Driver handle used later to execute the generated Cypher statements.
gds = GraphDatabase.driver(
    neo4j_uri,
    auth=(neo4j_user, neo4j_password),
)

In [107]:
# Function to call the OpenAI API
def process_gpt(
    file_prompt,
    system_msg,
    *,
    model="gpt-4",
    max_tokens=350,
    temperature=0,
    delay=8,
    api_client=None,
):
    """Send one system/user message pair to the chat-completions API.

    Args:
        file_prompt: Text sent as the ``user`` message.
        system_msg: Text sent as the ``system`` message.
        model: Model (deployment) name passed to the API. Defaults to "gpt-4".
        max_tokens: Upper bound on the number of tokens in the reply.
        temperature: Sampling temperature passed to the API.
        delay: Seconds to sleep after the call returns (presumably to stay
            under API rate limits -- confirm). ``0`` or ``None`` skips the pause.
        api_client: Object exposing ``chat.completions.create``. When omitted,
            the module-level ``client`` is used.

    Returns:
        The ``message.content`` of the first choice in the API response.
    """
    # Resolve lazily so the module-level `client` is only required when the
    # caller did not inject one (useful for tests and alternative backends).
    active_client = client if api_client is None else api_client

    completion = active_client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    nlp_results = completion.choices[0].message.content

    if delay:
        sleep(delay)
    return nlp_results


# Function to take folder of files and a prompt template, and return a json-object of all the entities and relationships
def extract_entities_relationships(prompt_template, file_pattern="../hp_text/hp_test.txt"):
    """Run the extraction prompt over every matching text file and collect the parsed JSON.

    For each file matched by ``file_pattern`` the file text is substituted into
    ``prompt_template`` (placeholder ``$ctext``), sent through ``process_gpt``,
    printed, and parsed with ``json.loads``.

    Args:
        prompt_template: ``string.Template``-style prompt containing ``$ctext``.
        file_pattern: Glob pattern of the input files. Defaults to the single
            test file the notebook originally hard-coded.

    Returns:
        A list with one parsed JSON object per file that was processed
        successfully. Files that fail (unreadable file, API error, reply that
        is not valid JSON, ...) are reported on stdout and skipped.
    """
    start = timer()
    # Sorted so the result order does not depend on filesystem listing order.
    files = sorted(glob.glob(file_pattern))
    system_msg = "You are an expert in knowledge graph construction and the Harry Potter series. Your task is to generate structured data suitable for a Neo4j graph database, representing key entities (characters, locations, magical_objects, spells, potions, creatures) and their relationships."
    # The template does not change between files, so build it once.
    template = Template(prompt_template)
    results = []
    for file in files:
        print(f"Extracting entities and relationships for {file}")
        try:
            # Read with an explicit encoding, and close the file before the
            # slow API call instead of holding the handle open during it.
            with open(file, "r", encoding="utf-8") as f:
                text = f.read().rstrip()
            prompt = template.substitute(ctext=text)
            result = process_gpt(prompt, system_msg=system_msg)
            print(result)
            results.append(json.loads(result))
        except Exception as e:
            # Best effort: one bad file must not abort the whole run.
            print(f"Error processing {file}: {e}")
    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results


# Function to take a JSON object of entities and relationships and generate Cypher queries for creating them
def _cypher_string(value):
    """Return ``value`` as text safe to place inside a double-quoted Cypher string."""
    return str(value).replace("\\", "\\\\").replace('"', '\\"')


def _cypher_name(name):
    """Return a label / relationship type / property key, backtick-quoted if it is not a plain identifier."""
    name = str(name)
    if name.isidentifier():
        return name
    return "`" + name.replace("`", "``") + "`"


def _normalize_id(raw_id):
    """Return the id as a string with ``-`` and ``_`` removed (ids may arrive as ints)."""
    return str(raw_id).replace("-", "").replace("_", "")


def generate_cypher(json_obj):
    """Build Cypher MERGE statements from extracted entities and relationships.

    Args:
        json_obj: List of dicts, each with an ``"entities"`` list (dicts with
            ``"label"``, ``"id"`` and optional extra properties) and a
            ``"relationships"`` list of ``"source_id|TYPE|target_id"`` strings.

    Returns:
        All entity statements followed by all relationship statements. The same
        statements are also written, one per line, to ``cyphers.txt``.

    Raises:
        KeyError: If an object lacks ``"entities"``/``"relationships"`` or an
            entity lacks ``"label"``/``"id"``.

    Relationships that do not have exactly three ``|``-separated parts, or that
    reference an id with no matching entity, are reported on stdout and skipped
    rather than aborting the whole run.
    """
    e_statements = []
    r_statements = []

    # Normalized entity id -> label, needed to MATCH relationship endpoints.
    e_label_map = {}

    for i, obj in enumerate(json_obj):
        print(f"Generating cypher for file {i+1} of {len(json_obj)}")
        for entity in obj["entities"]:
            label = entity["label"]
            entity_id = _normalize_id(entity["id"])
            properties = {k: v for k, v in entity.items() if k not in ["label", "id"]}

            cypher = f'MERGE (n:{_cypher_name(label)} {{id: "{_cypher_string(entity_id)}"}})'
            if properties:
                props_str = ", ".join(
                    f'n.{_cypher_name(key)} = "{_cypher_string(val)}"'
                    for key, val in properties.items()
                )
                cypher += f" ON CREATE SET {props_str}"
            e_statements.append(cypher)
            e_label_map[entity_id] = label

        for rs in obj["relationships"]:
            parts = rs.split("|")
            if len(parts) != 3:
                print(f"Skipping malformed relationship: {rs!r}")
                continue
            src_id, rs_type, tgt_id = parts
            src_id = _normalize_id(src_id)
            tgt_id = _normalize_id(tgt_id)

            # The model sometimes references things it never declared as entities.
            if src_id not in e_label_map or tgt_id not in e_label_map:
                print(f"Skipping relationship with unknown entity: {rs!r}")
                continue

            src_label = _cypher_name(e_label_map[src_id])
            tgt_label = _cypher_name(e_label_map[tgt_id])

            cypher = (
                f'MERGE (a:{src_label} {{id: "{_cypher_string(src_id)}"}}) '
                f'MERGE (b:{tgt_label} {{id: "{_cypher_string(tgt_id)}"}}) '
                f"MERGE (a)-[:{_cypher_name(rs_type)}]->(b)"
            )
            r_statements.append(cypher)

    statements = e_statements + r_statements
    with open("cyphers.txt", "w", encoding="utf-8") as outfile:
        outfile.write("\n".join(statements))

    return statements


# Final function to bring all the steps together
def ingestion_pipeline(folders):
    """Extract entities/relationships, turn them into Cypher, and run it against Neo4j.

    Args:
        folders: Mapping whose values are prompt templates (keys are assumed to
            be folder names -- TODO confirm with the caller). One extraction
            run is performed per item.

    Statements that fail to execute are written to ``failed_statements.txt``
    together with the exception; the remaining statements are still executed.
    """
    # Extracting the entities and relationships for each item, appended into one list
    entities_relationships = []
    for _folder, prompt_template in folders.items():
        # extract_entities_relationships takes the prompt template as its only
        # required argument; passing the folder key as well raised a TypeError.
        entities_relationships.extend(extract_entities_relationships(prompt_template))

    # Generate and execute cypher statements
    cypher_statements = generate_cypher(entities_relationships)
    failed_count = 0
    for i, stmt in enumerate(cypher_statements):
        print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stmt)
        except Exception as e:
            # Start a fresh log on the first failure of this run, then append,
            # so earlier failures are no longer overwritten by later ones.
            mode = "a" if failed_count else "w"
            with open("failed_statements.txt", mode, encoding="utf-8") as f:
                f.write(f"{stmt} - Exception: {e}\n")
            failed_count += 1

In [108]:
# Prompt template for the extraction step; `$ctext` is replaced with the file text.
# The examples are valid JSON because the model's reply is parsed with json.loads.
prompt = """Extract characters and relationships from the following text to construct a knowledge graph for Neo4j.
    
    Instructions:
    
    1. Identify Characters (nodes)
    
    2. Identify relationships (edges) between characters.
    
    3. The output should look like :
    {
        "char_obj": [{"id":string, "name":string}],
        "char_sub": [{"id":string, "name":string}],
        "action": ["char_obj['id']|HAS_ACTION|char_sub['id']"]
    }
    
    Example Output:
    {
        "char_obj": [{"id": "1", "name": "Harry"}],
        "char_sub": [{"id": "2", "name": "Ron"}],
        "action": ["1|make friend with|2"]
    }
    
    Example Output:
    {
        "char_obj": [{"id": "7", "name": "McGonagall"}],
        "char_sub": [{"id": "5", "name": "Hagrid"}],
        "action": ["7|whispered, patting|5"]
    }
    
    
    Text:
    $ctext
    """

In [109]:
# Run the extraction with the prompt defined above and print the list of parsed
# JSON objects. This calls the OpenAI API and sleeps after each call, so it is slow.
print(extract_entities_relationships(prompt))

Extracting entities and relationships for ../hp_text/hp_test.txt
{
    "char_obj": [
        {"id": "1", "name": "Mr. Dursley"},
        {"id": "2", "name": "Mrs. Dursley"},
        {"id": "3", "name": "Dudley"}
    ],
    "char_sub": [
        {"id": "4", "name": "Grunnings"}
    ],
    "action": [
        "1|is director of|4",
        "2|spied on|neighbors",
        "1|married to|2",
        "1|parent of|3",
        "2|parent of|3"
    ]
}
Pipeline completed in 65.1079055830487 seconds
[{'char_obj': [{'id': '1', 'name': 'Mr. Dursley'}, {'id': '2', 'name': 'Mrs. Dursley'}, {'id': '3', 'name': 'Dudley'}], 'char_sub': [{'id': '4', 'name': 'Grunnings'}], 'action': ['1|is director of|4', '2|spied on|neighbors', '1|married to|2', '1|parent of|3', '2|parent of|3']}]
