In [189]:
import os
import openai
from string import Template
import json
from timeit import default_timer as timer
import time
from dotenv import load_dotenv
from time import sleep
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import shutil
from datetime import datetime
import glob




### Load environment files

In [171]:
# Load environment variables (python-dotenv); the OpenAI configuration cell reads them with os.getenv
load_dotenv()

True

In [172]:
# OpenAI API configuration (Azure OpenAI). Key, endpoint and API version are read from
# environment variables; os.getenv returns None for any variable that is not set.
# NOTE(review): generateSubgraph calls openai.chat.completions.create (the openai>=1.0 interface),
# while openai.api_base looks like the pre-1.0 setting name; the 1.x client presumably takes the
# endpoint from openai.azure_endpoint or the AZURE_OPENAI_ENDPOINT variable instead - confirm.
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"  # passed as `model` in generateSubgraph

# Debugging helpers, intentionally left commented out.
# Caution: printing openai.api_key would store the secret in the notebook output.
# openai.api_key = os.getenv("OPENAI_KEY")
# print(openai.api_key)
# print(openai.api_base)
# print(openai.api_version)


<h3> Functions to expand a knowledge graph using parallel calls to the OpenAI API </h3>

In [173]:
#function to make a call to OpenAI for subgraph generation
#called in the expandKG function for all current entities not yet processed, in parallel
def generateSubgraph(entityObj, count):
    """Ask the chat model for entities related to one entity and return its raw reply.

    Args:
        entityObj: dict describing the entity; only its "entity" key (the name) is read.
        count: number of related entities to request in the prompt.

    Returns:
        The message content of the first completion choice, unparsed. The prompt asks
        for JSON with "entities" and "relations" sections, but the reply is not
        validated here; the caller is responsible for parsing it.
    """
    entity_name = entityObj['entity']

    # Build the example with json.dumps so it is guaranteed to be valid JSON: the
    # hand-written version contained typographic quotes and a trailing comma, which
    # the model can copy into its answer and which json.loads then rejects.
    example_output = json.dumps(
        {
            "entities": [
                {
                    "entity": "ENTITY2",
                    "category": "Category"
                }
            ],
            "relations": {
                entity_name: [
                    {
                        "Relation": "RELATIONSHIP",
                        "Object": "ENTITY2",
                        "Description": "CONTENT DESCRIPTION"
                    }
                ]
            }
        },
        indent=4,
        ensure_ascii=False,  # keep non-ASCII entity names readable in the prompt
    )

    prompt = f"""
    The entity is {entity_name}
    Give me {count} entities and their relationship to {entity_name}
    Output should be in example json format:
{example_output}
    Any ENTITY2 used in the relations must be part of the entities array section
    """
    response_entities = openai.chat.completions.create(
                    model=openai_deployment,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
    new_subgraph = response_entities.choices[0].message.content

    return new_subgraph

##example usage
##generateSubgraph({"entity": "sustainable energy", "category": "concept"},3)

In [174]:
#function to update the Knowledge Graph by merging in the new subgraph
#called in the expandKG function once all futures are complete to merge each result
def updateKG(knowledge_graph, new_subgraph):
    """Merge a generated subgraph into the knowledge graph, in place.

    Args:
        knowledge_graph: dict with a "knowledge graph" section holding an "entities"
            list and a "relations" dict; it is modified in place.
        new_subgraph: dict that may hold an "entities" list and a "relations" dict
            (subject name -> list of relations). A missing section is treated as empty.

    Returns:
        The same knowledge_graph object, after merging.
    """
    graph = knowledge_graph["knowledge graph"]

    # Merge entities. An entity is identified by its (name, category) pair; the pair is
    # recorded as soon as it is appended so a repeat inside the same subgraph is skipped.
    known_entities = {(item["entity"], item.get("category")) for item in graph["entities"]}
    for new_entity in new_subgraph.get("entities", []):
        entity_key = (new_entity["entity"], new_entity.get("category"))
        if entity_key not in known_entities:
            graph["entities"].append(new_entity)
            known_entities.add(entity_key)

    # Merge relations. Extending a list owned by the graph (instead of storing the
    # subgraph's own list) keeps later merges from mutating the subgraph.
    for subject, new_relations in new_subgraph.get("relations", {}).items():
        graph["relations"].setdefault(subject, []).extend(new_relations)

    return knowledge_graph

In [175]:
#function to expand the knowledge graph by calling the openai in parallel for all current entities
#called in the generateKG function once for each expansion iteration
def expandKG(knowledge_graph, kg_parameters):
    """Run one expansion round: request a subgraph for every entity without relations.

    Args:
        knowledge_graph: dict with a "knowledge graph" section ("entities" list and
            "relations" dict keyed by entity name).
        kg_parameters: dict providing "max_workers" (thread pool size) and
            "new_relation_count" (number of related entities requested per call).

    Returns:
        The knowledge graph returned by updateKG after all parsable replies were merged.
    """
    graph = knowledge_graph["knowledge graph"]
    current_entities = graph["entities"]
    print([entity["entity"] for entity in current_entities])

    # Relations are keyed by entity name only, so a name that is listed more than once
    # (e.g. with different categories) must be requested once; otherwise its relations
    # would be generated and merged twice.
    pending_entities = []
    queued_names = set()
    for entityObj in current_entities:
        entity = entityObj["entity"]
        if entity in graph["relations"] or entity in queued_names:
            continue
        queued_names.add(entity)
        pending_entities.append(entityObj)

    with ThreadPoolExecutor(max_workers=kg_parameters["max_workers"]) as executor:
        futures = [
            (entityObj["entity"], executor.submit(generateSubgraph, entityObj, kg_parameters["new_relation_count"]))
            for entityObj in pending_entities
        ]

        # Wait for all futures to complete and merge the results in submission order
        for entity, future in futures:
            result = future.result()
            if not result:
                continue
            try:
                new_subgraph = json.loads(result)
            except json.JSONDecodeError as error:
                # One unparsable reply should not discard the other results. The entity
                # still has no relations, so the next round requests it again.
                print(f"Skipping entity '{entity}': response is not valid JSON ({error})")
                continue
            knowledge_graph = updateKG(knowledge_graph, new_subgraph)
    return knowledge_graph

In [176]:
#function to generate an expanded KG based on an initial KG and parameters
#called only once as primary function to run the full pipeline
def generateKG(knowledge_graph, kg_parameters, output_file='../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json'):
    """Expand the knowledge graph for a number of rounds and save it as JSON.

    Args:
        knowledge_graph: initial graph passed to expandKG.
        kg_parameters: dict providing "iterations" (number of expansion rounds) plus
            the settings that expandKG reads.
        output_file: path of the JSON file to write; defaults to the location of the
            current auto-generated KG version.

    Returns:
        The expanded knowledge graph (the same data that is written to output_file).
    """
    startTime = time.time()

    for i in range(kg_parameters["iterations"]):
        knowledge_graph = expandKG(knowledge_graph, kg_parameters)
        print("Iteration:", i+1, ", Entity Count:", len(knowledge_graph["knowledge graph"]["entities"]), ", Elapsed Time:", round(time.time() - startTime, 2), "s")

    # Make sure the target folder exists so the generated graph is not lost at the very end
    output_folder = os.path.dirname(output_file)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Write the knowledge graph to a JSON file. The graph built by this call is dumped,
    # not a notebook-level variable that may be undefined or left over from an earlier run.
    with open(output_file, 'w') as json_file:
        json.dump(knowledge_graph, json_file, indent=4)
    print(f"Knowledge graph saved to {output_file}")

    return knowledge_graph


<h4> Run pipeline to expand Knowledge Graph </h4>

In [177]:
# define initial KG and parameters

initial_knowledge_graph = {
        "metadata": {
            "Data": "Knowledge Graph",
        },
        "knowledge graph": {
            "entities": [
                {"entity": "sustainable energy","category": "concept"}
            ],
            "relations": {}
        }
    }

# iterations: number of expansion rounds; new_relation_count: related entities requested per call;
# max_workers: size of the thread pool used for the parallel API calls
kg_parameters={"iterations":3,"new_relation_count":4,"max_workers":10}


#######run pipeline
# Keep the result under a name so that later cells can use it instead of relying on Out[...].
# The graph is expanded in place, so this is the same object as initial_knowledge_graph;
# re-running the cell starts again from the seed defined above.
expanded_knowledge_graph = generateKG(initial_knowledge_graph,kg_parameters)
expanded_knowledge_graph


['sustainable energy']
Iteration: 1 , Entity Count: 5 , Elapsed Time: 3.46 s
['sustainable energy', 'Solar power', 'Wind power', 'Hydroelectric power', 'Geothermal energy']
Iteration: 2 , Entity Count: 20 , Elapsed Time: 7.11 s
['sustainable energy', 'Solar power', 'Wind power', 'Hydroelectric power', 'Geothermal energy', 'Solar panel', 'Solar energy', 'Solar farm', 'Solar radiation', 'Turbine', 'Grid integration', 'Offshore wind farm', 'Water', 'Turbine', 'Generator', 'Dam', 'Geothermal power plant', 'Geothermal heat pump', 'Geothermal reservoir', 'Geothermal energy association']
Iteration: 3 , Entity Count: 67 , Elapsed Time: 13.71 s
Knowledge graph saved to ../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json


{'metadata': {'Data': 'Knowledge Graph'},
 'knowledge graph': {'entities': [{'entity': 'sustainable energy',
    'category': 'concept'},
   {'entity': 'Solar power', 'category': 'Renewable energy'},
   {'entity': 'Wind power', 'category': 'Renewable energy'},
   {'entity': 'Hydroelectric power', 'category': 'Renewable energy'},
   {'entity': 'Geothermal energy', 'category': 'Renewable energy'},
   {'entity': 'Solar panel', 'category': 'Technology'},
   {'entity': 'Solar energy', 'category': 'Renewable energy'},
   {'entity': 'Solar farm', 'category': 'Infrastructure'},
   {'entity': 'Solar radiation', 'category': 'Natural phenomenon'},
   {'entity': 'Turbine', 'category': 'Wind power technology'},
   {'entity': 'Grid integration', 'category': 'Power systems'},
   {'entity': 'Offshore wind farm', 'category': 'Wind power projects'},
   {'entity': 'Water', 'category': 'Natural Resource'},
   {'entity': 'Turbine', 'category': 'Mechanical Device'},
   {'entity': 'Generator', 'category': 'El

<h4> Extract JSON files for each entity </h4>

In [178]:
# Build a compact timestamp of the form 'MMMDD_HHMM' (abbreviated month name, day, hour, minute),
# taken from the current local date and time.

def generateDatetime():
    """Return the current date and time formatted with the pattern '%b%d_%H%M'."""
    return f"{datetime.now():%b%d_%H%M}"

# # Example usage
# filename_datetime = generateDatetime()
# filename_datetime

In [198]:
##iterate through the entities of the merged KG and collect the relations recorded for each one
##write one JSON file per entity containing that entity's relations
##(the "metadata" and "entities" sections of each exported file are currently written empty)

# Folder path with JSON file of merged KG
kg_folder_path = '../00_API/00_Merged'
kg_file_path = "knowledge_graph.json"

# ## Option to copy GPT-KG export to this path (enable these two lines only to replace the active KG version on the API)
# shutil.copy(os.path.join(kg_folder_path, kg_file_path), os.path.join(kg_folder_path, "backup/KG-backup-"+generateDatetime()+".json"))
# ## This copies the existing file into the backup folder
# shutil.copy('../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json', os.path.join(kg_folder_path, kg_file_path))

# Generate dictionary of entity-wise relations
entity_relations = defaultdict(list) # Initialize

with open(os.path.join(kg_folder_path, kg_file_path), 'r') as kg_file:
    data = json.load(kg_file)

entities = data['knowledge graph']['entities']
relations = data['knowledge graph']['relations']

for entity_info in entities:
    entity = entity_info['entity']

    # The entity list can contain the same name more than once (e.g. with different
    # categories) while relations are keyed by name only; copy each name's relations
    # once so they are not duplicated in the exported file.
    if entity in relations and entity not in entity_relations:
        entity_relations[entity].extend(relations[entity])


# Define output path and delete all existing by-entity exports
output_folder_path = '../00_API/01_By-Entity'
os.makedirs(output_folder_path, exist_ok=True)
for stale_file in glob.glob(output_folder_path+"/*.json"):
    print(stale_file)
    try:
        os.remove(stale_file)
    except OSError as e:
        print(f"Error: {e.strerror}. File: {stale_file}")

# Create separate JSON files for each entity's relations
# Entity names come from generated data and may contain a path separator, which would
# point the write at a sub-folder (or outside the output folder); such characters are
# replaced in the file name only. Two names that differ only in that character end up
# in the same file.
path_separators = [separator for separator in (os.sep, os.altsep) if separator]
for entity, relations_of_entity in entity_relations.items():
    safe_entity_name = entity
    for separator in path_separators:
        safe_entity_name = safe_entity_name.replace(separator, "_")
    entity_file = f'{safe_entity_name}.json'
    entity_data = {"metadata": {}, "knowledge graph": {"entities": [], "relations": {entity: relations_of_entity}}}

    with open(os.path.join(output_folder_path, entity_file), 'w') as outfile:
        json.dump(entity_data, outfile, indent=4)

print("Entity files created.")


Entity files created.
