# Moonshot  AI Knowledge Graph Cypher Query neo4j Approach

### Outline
1. Configuration
2. Helper Functions
3. Prompts
4. Running the pipeline

In [1]:
# pip install openai
# pip install neo4j

In [2]:
# pip install python-dotenv

In [3]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document


### 1. Configuration

In [4]:
# Load environment variables
load_dotenv()

True

In [5]:
# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"

# openai.api_key = os.getenv("OPENAI_KEY")


# print(openai.api_key)
# print(openai.api_base)
# print(openai.api_version)


In [6]:
# Neo4j configuration & constraints
neo4j_url = os.getenv("NEO4J_CONNECTION_URL")
neo4j_user = os.getenv("NEO4J_USER")
neo4j_password = os.getenv("NEO4J_PASSWORD")
# print(f" neo4j_url == {neo4j_url}")
# print(f" neo4j_user == {neo4j_user}")
# print(f" neo4j_password == {neo4j_password}")

gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

### 2. Helper Functions

In [7]:
# Function to call the OpenAI API
def process_gpt(file_prompt, system_msg):
    # sleep(35)
    completion = openai.chat.completions.create(
                    model=openai_deployment,
                    max_tokens=15000,
                    temperature=0,
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": file_prompt},
                    ]
                )
    nlp_results = completion.choices[0].message.content
    # print(f"NLP Result:::  {nlp_results}")
    sleep(8)
    return nlp_results


# Function to take folder of files and a prompt template, and return a json-object of all the entities and relationships
def extract_entities_relationships(folder, prompt_template):
    start = timer()
    files = glob.glob(f"./Data/{folder}/*")
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(files)} files in {folder} folder")
    results = []
    for i, file in enumerate(files):
        print(f"Extracting entities and relationships for {file}")
        try:
            with open(file, "r") as f:
                text = f.read().rstrip()
                prompt = Template(prompt_template).substitute(ctext=text)
                result = process_gpt(prompt, system_msg=system_msg)
                results.append(json.loads(result))
        except Exception as e:
            print(f"Error processing {file}: {e}")
    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results

# Function to take a json-object of entitites and relationships and generate cypher query for creating those entities
def generate_cypher(json_obj):
    e_statements = []
    r_statements = []

    e_label_map = {}

    # print(f" generating cypher for {json_obj}")
    # loop through our json object
    for i, obj in enumerate(json_obj):
        print(f"Generating cypher for file {i+1} of {len(json_obj)}")
        for entity in obj["entities"]:
            label = entity["label"]
            id = entity["id"]
            id = id.replace("-", "").replace("_", "")
            properties = {k: v for k, v in entity.items() if k not in ["label", "id"]}

            cypher = f'MERGE (n:{label} {{id: "{id}"}})'
            if properties:
                props_str = ", ".join(
                    [f'n.{key} = "{val}"' for key, val in properties.items()]
                )
                cypher += f" ON CREATE SET {props_str}"
            e_statements.append(cypher)
            e_label_map[id] = label

        for rs in obj["relationships"]:
            src_id, rs_type, tgt_id = rs.split("|")
            src_id = src_id.replace("-", "").replace("_", "")
            tgt_id = tgt_id.replace("-", "").replace("_", "")

            src_label = e_label_map[src_id]
            tgt_label = e_label_map[tgt_id]

            cypher = f'MERGE (a:{src_label} {{id: "{src_id}"}}) MERGE (b:{tgt_label} {{id: "{tgt_id}"}}) MERGE (a)-[:{rs_type}]->(b)'
            r_statements.append(cypher)

    with open("cyphers.txt", "w") as outfile:
        outfile.write("\n".join(e_statements + r_statements))

    return e_statements + r_statements


# Final function to bring all the steps together
def ingestion_pipeline(folders):
    # Extrating the entites and relationships from each folder, append into one json_object
    entities_relationships = []
    for key, value in folders.items():
        entities_relationships.extend(extract_entities_relationships(key, value))

    # Generate and execute cypher statements
    cypher_statements = generate_cypher(entities_relationships)
    for i, stmt in enumerate(cypher_statements):
        print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stmt)
        except Exception as e:
            with open("failed_statements.txt", "w") as f:
                f.write(f"{stmt} - Exception: {e}\n")

### 3. Defining Prompts

In [8]:

prompt_template = """
From the Brief below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for  Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Document must be summarized and stored inside Country entity under `summary` property. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Entity',id:string,name:string;summary:string 
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. 
   Relationship property should be mentioned within brackets as comma-separated. 
   You will have to generate as many relationships as needed as defined below:
    Relationship types:
    Entity|RELATIONSHIP_TYPE|Entity 
3. The output should look like :
{
    "entities": [{"label":"Entity","id":string,"name":string,"summary":string}],
    "relationships": ["Entityid|RELATIONSHIP_TYPE|AnotherEntityid"]
}
Entity, RELATIONSHIP_TYPE and AnotherEntityid are to be generate by you based on the brief.
Case Sheet:
$ctext
"""



### 4. Running the pipeline


In [9]:
countries = {
    "cleaned_text_manually": prompt_template,
}

ingestion_pipeline(countries)

Running pipeline for 1 files in cleaned_text_manually folder
Extracting entities and relationships for ./Data/cleaned_text_manually/AFG-CPD-2014-EN.txt
Error processing ./Data/cleaned_text_manually/AFG-CPD-2014-EN.txt: Error code: 400 - {'error': {'message': "This model's maximum context length is 16384 tokens. However, you requested 23962 tokens (8962 in the messages, 15000 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Pipeline completed in 1.1828097130000002 seconds


### Test the result


In [None]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain
from langchain.chat_models import ChatOpenAI
from langchain.graphs import Neo4jGraph
import os
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

os.environ["OPENAI_API_KEY"] =  os.getenv("OPENAI_KEY")
 

graph = Neo4jGraph(
    url=neo4j_url,
    username=neo4j_user,
    password=neo4j_password
)

graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

In [None]:
cypher_chain.run("When did the Transformation Decade start and what even caused it to start?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (e1:Entity)-[r:TRANSFORMS]->(e2:Entity)
WHERE e2.name = "Transformation Decade"
RETURN e1.summary, r.summary[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I'm sorry, but I don't have any information on the Transformation Decade or what caused it to start."

### . Token limit each file  (optional)


In [None]:
import os
import nltk
nltk.download('punkt')  # Download NLTK tokenizer data if not already downloaded

# Function to split file into subfiles based on sentences
def split_file_sentences(filename, sentence_limit=5):
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
        sentences = nltk.sent_tokenize(content)  # Tokenize into sentences
        
        num_sentences = len(sentences)
        num_files = (num_sentences // sentence_limit) + 1
        
        for i in range(num_files):
            start = i * sentence_limit
            end = (i + 1) * sentence_limit
            sub_sentences = sentences[start:end]  # Extract sentences for subfile
            
            # Create subfile name (e.g., original_filename_part1.txt, original_filename_part2.txt, ...)
            subfile_name = f"{os.path.splitext(filename)[0]}_part{i + 1}.txt"
            
            with open(subfile_name, 'w', encoding='utf-8') as subfile:
                subfile.write('\n'.join(sub_sentences))  # Write sentences to subfile
        
        os.remove(filename)  # Remove the original file after splitting

# Directory containing the .txt files
directory = 'Data/cleaned_text_manually'

# Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        split_file_sentences(file_path)



# import os
# import nltk
# from nltk.tokenize import word_tokenize

# nltk.download('punkt')  # Download NLTK tokenizer data if not already downloaded

# # Function to read a file, tokenize its content, and split it into subfiles
# def split_file(filename, token_limit=500):
#     with open(filename, 'r', encoding='utf-8') as file:
#         content = file.read()
#         tokens = word_tokenize(content)  # Tokenize the content
        
#         num_tokens = len(tokens)
#         num_files = (num_tokens // token_limit) + 1
        
#         for i in range(num_files):
#             start = i * token_limit
#             end = (i + 1) * token_limit
#             sub_tokens = tokens[start:end]  # Extract tokens for subfile
            
#             # Create subfile name (e.g., original_filename_part1.txt, original_filename_part2.txt, ...)
#             subfile_name = f"{os.path.splitext(filename)[0]}_part{i + 1}.txt"
            
#             with open(subfile_name, 'w', encoding='utf-8') as subfile:
#                 subfile.write(' '.join(sub_tokens))  # Write tokens to subfile
        
#         os.remove(filename)  # Remove the original file after splitting

# # Directory containing the .txt files
# directory = 'Data/cleaned_text_manually'

# # Loop through each file in the directory
# for filename in os.listdir(directory):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(directory, filename)
#         split_file(file_path)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davidoluyalegbenga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 517 kB/s eta 0:00:01
[?25hCollecting joblib
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[K     |████████████████████████████████| 302 kB 3.0 MB/s eta 0:00:01
Collecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 2.2 MB/s eta 0:00:01
[?25hInstalling collected packages: joblib, click, nltk
Successfully installed click-8.1.7 joblib-1.3.2 nltk-3.8.1
You should consider upgrading via the '/Users/davidoluyalegbenga/.pyenv/versions/3.9.6/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
