# Moonshot AI Knowledge Graph -> Graph DB

### Outline
1. Configuration
2. Helper Functions
3. Defining Prompts
4. Running the pipeline

In [57]:
# pip install openai
# pip install neo4j

In [58]:
# pip install python-dotenv

In [59]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep

### 1. Configuration

In [60]:
# Load environment variables via python-dotenv so that the os.getenv() calls
# in the configuration cells can find the OpenAI and Neo4j settings.
load_dotenv()

True

In [61]:
# OpenAI API configuration (Azure OpenAI through the openai>=1.0 module-level
# client, which is the interface `openai.chat.completions.create` belongs to).
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
# The 1.x client takes the Azure resource URL from `azure_endpoint`. The
# pre-1.0 attribute `api_base` is not read by it, so assigning it had no
# effect and the endpoint was only found implicitly via the environment.
openai.azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Azure deployment name; sent as `model` with every completion request.
openai_deployment = "sdgi-gpt-35-turbo-16k"

# For the non-Azure OpenAI service, remove the api_type / azure_endpoint /
# api_version lines above and set only:
# openai.api_key = os.getenv("OPENAI_KEY")


In [62]:
# Neo4j connection settings, all read from the environment.
neo4j_url, neo4j_user, neo4j_password = (
    os.getenv(setting)
    for setting in ("NEO4J_CONNECTION_URL", "NEO4J_USER", "NEO4J_PASSWORD")
)

# Shared driver handle used to run the generated Cypher statements.
gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

### 2. Helper Functions

In [63]:
# Function to call the OpenAI API
def process_gpt(file_prompt, system_msg, max_tokens=15000, temperature=0, delay=8):
    """Send one prompt to the configured chat deployment and return the reply text.

    Args:
        file_prompt: Content of the user-role message (the filled-in prompt).
        system_msg: Content of the system-role message.
        max_tokens: Completion token limit passed to the API. Defaults to
            15000, the value that used to be hard-coded here.
        temperature: Sampling temperature passed to the API. Defaults to 0.
        delay: Seconds to sleep after the request returns. Defaults to 8,
            the previous fixed pause (presumably a rate-limit throttle --
            confirm against the deployment's quota). Pass 0 to skip it.

    Returns:
        The `content` of the first choice's message.
    """
    completion = openai.chat.completions.create(
        model=openai_deployment,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    nlp_results = completion.choices[0].message.content
    if delay:
        sleep(delay)
    return nlp_results


# Function to take folder of files and a prompt template, and return a json-object of all the entities and relationships
def extract_entities_relationships(folder, prompt_template):
    """Run the extraction prompt over every file in ``./Data/<folder>/``.

    Each file's text is substituted for ``$ctext`` in ``prompt_template``,
    sent through ``process_gpt`` and the reply is parsed as JSON.

    Args:
        folder: Name of the sub-folder of ``./Data`` to scan.
        prompt_template: ``string.Template`` source containing ``$ctext``.

    Returns:
        A list with one parsed JSON object per successfully processed file.
        Files that fail at any step (read, API call, JSON parsing) are
        reported on stdout and left out of the list.
    """
    start = timer()
    # Sorted so repeated runs process files in the same order; glob's own
    # ordering depends on the filesystem.
    files = sorted(glob.glob(f"./Data/{folder}/*"))
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(files)} files in {folder} folder")
    template = Template(prompt_template)
    results = []
    for file in files:
        print(f"Extracting entities and relationships for {file}")
        try:
            # Close the file before the (slow) API call instead of holding
            # the handle open for its whole duration.
            with open(file, "r", encoding="utf-8") as f:
                text = f.read().rstrip()
            prompt = template.substitute(ctext=text)
            result = process_gpt(prompt, system_msg=system_msg)
            results.append(json.loads(result))
        except Exception as e:
            # Best-effort per file: one bad document must not stop the batch.
            print(f"Error processing {file}: {e}")
    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results


def _cypher_string(value):
    """Render *value* as a double-quoted Cypher string literal.

    JSON string escaping (quotes, backslashes, control characters) is used
    so that free text such as summaries cannot terminate the literal early.
    """
    return json.dumps(str(value), ensure_ascii=False)


def _normalize_id(raw_id):
    """Strip hyphens and underscores from an entity id."""
    return str(raw_id).replace("-", "").replace("_", "")


# Function to take a json-object of entities and relationships and generate cypher query for creating those entities
def generate_cypher(json_obj):
    """Build Cypher MERGE statements from extracted entities and relationships.

    Args:
        json_obj: List of dicts, one per processed file. Each dict may hold
            ``"entities"`` (dicts with ``label``, ``id`` and any further
            properties) and ``"relationships"`` (strings of the form
            ``source_id|TYPE|target_id``).

    Returns:
        All entity statements followed by all relationship statements. The
        same statements are also written, one per line, to ``cyphers.txt``.

    Entries that cannot be turned into a statement (entity without label or
    id, relationship that is not a three-part string, relationship naming an
    id with no known entity) are reported on stdout and skipped rather than
    aborting the whole run.
    """
    e_statements = []
    r_statements = []

    # Normalized entity id -> label, shared across files so relationships
    # can refer to entities seen in earlier documents.
    e_label_map = {}

    for i, obj in enumerate(json_obj):
        print(f"Generating cypher for file {i+1} of {len(json_obj)}")
        if not isinstance(obj, dict):
            print(f"Skipping result {i+1}: expected a JSON object, got {type(obj).__name__}")
            continue

        for entity in obj.get("entities") or []:
            try:
                label = entity["label"]
                entity_id = _normalize_id(entity["id"])
            except (KeyError, TypeError):
                print(f"Skipping entity without label/id: {entity}")
                continue
            properties = {k: v for k, v in entity.items() if k not in ("label", "id")}

            # NOTE: labels are interpolated verbatim; they come from the
            # model's output and are not escaped here.
            cypher = f"MERGE (n:{label} {{id: {_cypher_string(entity_id)}}})"
            if properties:
                props_str = ", ".join(
                    f"n.{key} = {_cypher_string(val)}" for key, val in properties.items()
                )
                cypher += f" ON CREATE SET {props_str}"
            e_statements.append(cypher)
            e_label_map[entity_id] = label

        for rs in obj.get("relationships") or []:
            try:
                src_id, rs_type, tgt_id = (part.strip() for part in rs.split("|"))
            except (ValueError, AttributeError):
                print(f"Skipping malformed relationship: {rs}")
                continue
            src_id = _normalize_id(src_id)
            tgt_id = _normalize_id(tgt_id)

            src_label = e_label_map.get(src_id)
            tgt_label = e_label_map.get(tgt_id)
            if src_label is None or tgt_label is None:
                print(f"Skipping relationship with unknown entity id: {rs}")
                continue

            cypher = (
                f"MERGE (a:{src_label} {{id: {_cypher_string(src_id)}}}) "
                f"MERGE (b:{tgt_label} {{id: {_cypher_string(tgt_id)}}}) "
                f"MERGE (a)-[:{rs_type}]->(b)"
            )
            r_statements.append(cypher)

    statements = e_statements + r_statements
    with open("cyphers.txt", "w", encoding="utf-8") as outfile:
        outfile.write("\n".join(statements))

    return statements


# Final function to bring all the steps together
def ingestion_pipeline(folders):
    """Extract entities from every folder, then load them into the graph DB.

    Args:
        folders: Mapping of folder name to the prompt template used for the
            files in that folder.

    Every generated Cypher statement is executed through the module-level
    ``gds`` driver. Statements that raise are written to
    ``failed_statements.txt`` together with the exception text; execution
    continues with the next statement.
    """
    # Extracting the entities and relationships from each folder, append into one json_object
    entities_relationships = []
    for folder, template in folders.items():
        entities_relationships.extend(extract_entities_relationships(folder, template))

    # Generate and execute cypher statements
    cypher_statements = generate_cypher(entities_relationships)
    failures = 0
    for i, stmt in enumerate(cypher_statements):
        print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stmt)
        except Exception as e:
            # The first failure of a run starts a fresh log; later failures
            # append, so every failed statement is kept instead of only the
            # last one.
            mode = "a" if failures else "w"
            with open("failed_statements.txt", mode, encoding="utf-8") as f:
                f.write(f"{stmt} - Exception: {e}\n")
            failures += 1

    if failures:
        print(f"{failures} of {len(cypher_statements)} statements failed; see failed_statements.txt")

### 3. Defining Prompts

In [90]:
# Prompt for processing country policies briefs
# country_prompt_template = """
# From the Country Brief below, extract the following Entities & relationships described in the mentioned format 
# 0. ALWAYS FINISH THE OUTPUT. Never send partial responses
# 1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
#    `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Document must be summarized and stored inside Country entity under `summary` property. You will have to generate as many entities as needed as per the types below:
#     Entity Types:
#     label:'Country',id:string,name:string;summary:string //Country mentioned in the brief; `id` property is the full name of the Country, in lowercase, with no capital letters, special characters, spaces or hyphens; Contents of original document must be summarized inside 'summary' property
#     label:'UNDP',id:string,name:string //UNDP Entity; `id` property is the name of the UNDP, in camel-case. Identify as many of the technologies used as possible
#     label:'Support',id:string,name:string;industry:string //Support that the Country was done for; `id` property is the name of the Support, in camel-case; 'industry' is the industry that the Support operates in, as mentioned in the Country brief.
    
# 2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
#     Relationship types:
#     Country|USES_POLICY|UNDP 
#     Country|HAS_LEGAL_SUPPORT|UNDP


# 3. The output should look like :
# {
#     "entities": [{"label":"Country","id":string,"name":string,"summary":string}],
#     "relationships": ["Countryid|USES_POLICY|UNDPid"]
# }

# Case Sheet:
# $ctext
# """


# Generic extraction prompt; `$ctext` is replaced with the document text via
# string.Template. Wording is aligned with what the pipeline can consume:
# - the summary instruction refers to the 'Entity' type defined here (the
#   earlier text pointed at a 'Country' entity that this prompt never defines);
# - relationships are plain "head|TYPE|tail" strings, because generate_cypher
#   splits each one on "|" into exactly three parts;
# - the reply must be bare JSON, because it is passed straight to json.loads.
prompt_template = """
From the Brief below, extract the following Entities & relationships described in the mentioned format
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Each entity must carry a short summary of what the document says about it under its `summary` property. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Entity',id:string,name:string,summary:string

2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property.
   Each triple must be a single string of the form head|RELATIONSHIP_TYPE|tail.
   RELATIONSHIP_TYPE must contain only uppercase letters, digits and underscores.
   You will have to generate as many relationships as needed as defined below:
    Relationship types:
    Entity|RELATIONSHIP_TYPE|Entity


3. The output must be only the following JSON object, with no extra text and no code fences:
{
    "entities": [{"label":"Entity","id":string,"name":string,"summary":string}],
    "relationships": ["Entityid|RELATIONSHIP_TYPE|AnotherEntityid"]
}

Entityid, RELATIONSHIP_TYPE and AnotherEntityid are to be generated by you based on the brief.

Brief:
$ctext
"""



### 4. Running the pipeline


In [95]:
# Folder name (looked up under ./Data) -> prompt template applied to its files.
countries = dict(albanian_2=prompt_template)

ingestion_pipeline(countries)

Running pipeline for 1 files in albanian_2 folder
Extracting entities and relationships for ./Data/albanian_2/alb-1-en.txt
Pipeline completed in 10.373755831999006 seconds
Generating cypher for file 1 of 1
Executing cypher statement 1 of 1
