In [1]:
import os
import openai
from string import Template
import json
from timeit import default_timer as timer
import time
from dotenv import load_dotenv
from time import sleep
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import shutil
from datetime import datetime
import glob




### Load Enviroment files

In [2]:
# Load environment variables
load_dotenv()

True

In [3]:
# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"

# openai.api_key = os.getenv("OPENAI_KEY")
# print(openai.api_key)
# print(openai.api_base)
# print(openai.api_version)


<h3> Functions to expand a knowledge graph using parallel call to OpenAI API </h3>

In [4]:
#function to make call to openAI for subgraph generation
#called in the expandKG function for all current entities not yet processed, in parallel
def generateSubgraph(entityObj, count):
    # Use the openai api to generate a list (of length 'count') of the most related entities
    
    prompt = f"""
    The entity is {entityObj['entity']} 
    Give me {count} entities and their relationship to {entityObj['entity']}
    Output should be in example json format: 
        {{
          "entities": [
            {{
                "entity": "ENTITY2",
                "category": "CATEGORY"
            }}
            ],
        "relations": {{
                    "{entityObj['entity']}": [
                        {{
                            "Relation": “RELATIONSHIP”,
                            "Object": “ENTITY2”,
                            "Strength": "1-10 Estimate of how important the relation is in the field of sustainable energy"
                            "Description": “CONTENT DESCRIPTION”
                        }}
                    ],
        }}
        }}
    Any ENTITY2 used in the relations must be listed as on of the entities. If no entities are found, then just leave the lists blank.
    """
    response_entities = openai.chat.completions.create(
                    model=openai_deployment,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
    new_subgraph = response_entities.choices[0].message.content

    return new_subgraph

##example usage
##generate_subgraph({"entity": "sustainable energy", "category": "concept"},3)

In [5]:
#function to update the Knowledge Graph my merging in the new subgraph
#called in the expandKG function once all futures are complete to merge each result
def updateKG(knowledge_graph, new_subgraph):
    # Merge entities
    existing_entities = set((entity["entity"], entity["category"]) for entity in knowledge_graph["knowledge graph"]["entities"])
    for new_entity in new_subgraph["entities"]:
        if (new_entity["entity"], new_entity["category"]) not in existing_entities:
            knowledge_graph["knowledge graph"]["entities"].append(new_entity)

    # Merge relations
    for new_entity, new_relations in new_subgraph["relations"].items():
        if new_entity in knowledge_graph["knowledge graph"]["relations"]:
            knowledge_graph["knowledge graph"]["relations"][new_entity].extend(new_relations)
        else:
            knowledge_graph["knowledge graph"]["relations"][new_entity] = new_relations

    return knowledge_graph

In [6]:
#function to expand the knowledge graph by calling the openai in parallel for all current entities
#called in the generateKG function once for each expansion iteration
def expandKG(knowledge_graph, kg_parameters):
    with ThreadPoolExecutor(max_workers=kg_parameters["max_workers"]) as executor:
        futures = []

        current_entities = knowledge_graph["knowledge graph"]["entities"]
        print([entity["entity"] for entity in current_entities])

        for entityObj in current_entities:
            entity=entityObj["entity"]
            if entity not in knowledge_graph["knowledge graph"]["relations"].keys():
                future = executor.submit(generateSubgraph, entityObj, kg_parameters["new_relation_count"])#, knowledge_graph)
                futures.append(future)

        # Wait for all futures to complete
        for future in futures:
            result = future.result()
            if result:
                new_subgraph = json.loads(result)
                knowledge_graph = updateKG(knowledge_graph, new_subgraph)
    return knowledge_graph

In [7]:
#function to generate an expanded KG based on an initial KG and parameters
#called only once as primary function to run the full pipeline
def generateKG(knowledge_graph, kg_parameters):

    startTime = time.time()

    for i in range(kg_parameters["iterations"]):
        knowledge_graph=expandKG(knowledge_graph,kg_parameters)
        print("Iteration:", i+1, ", Entity Count:", len(knowledge_graph["knowledge graph"]["entities"]), ", Elapsed Time:", round(time.time() - startTime, 2), "s")
        
    # Write the knowledge graph to a JSON file
    output_file = '../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json'
    with open(output_file, 'w') as json_file:
        json.dump(knowledge_graph , json_file, indent=4)
    print(f"Knowledge graph saved to {output_file}")
    
    return knowledge_graph


<h4> Run pipeline to expand Knowledge Graph </h4>

In [13]:
# define initial KG and parameters

initial_knowledge_graph = {
        "metadata": {
            "Data": "Knowledge Graph",
        },
        "knowledge graph": {
            "entities": [
                {"entity": "UNDP","category": "concept"},
                {"entity": "renewable energy systems","category": "concept"},
                {"entity": "just energy transition","category": "concept"},
                {"entity": "equitable energy access","category": "concept"},
                {"entity": "clean cooking","category": "concept"},
                {"entity": "gender equality in energy","category": "concept"}
            ],
            "relations": {}
        }
    }
            
kg_parameters={"iterations":4,"new_relation_count":4,"max_workers":10}


#######run pipeline
generateKG(initial_knowledge_graph,kg_parameters)


['UNDP', 'renewable energy systems', 'just energy transition', 'equitable energy access', 'clean cooking', 'gender equality in energy']
Iteration: 1 , Entity Count: 29 , Elapsed Time: 15.88 s
['UNDP', 'renewable energy systems', 'just energy transition', 'equitable energy access', 'clean cooking', 'gender equality in energy', 'United Nations', 'World Bank', 'Government of Norway', 'European Union', 'Solar panels', 'Wind turbines', 'Hydroelectric power', 'Geothermal energy', 'Renewable energy', 'Energy efficiency', 'Clean technology', 'Climate change', 'Renewable Energy', 'Energy Efficiency', 'Rural Electrification', 'Policy and Regulation', 'biomass', 'improved cookstoves', 'solar energy', 'carbon credits', 'Women', 'Access to energy', 'Policy makers']
Iteration: 2 , Entity Count: 118 , Elapsed Time: 37.98 s
['UNDP', 'renewable energy systems', 'just energy transition', 'equitable energy access', 'clean cooking', 'gender equality in energy', 'United Nations', 'World Bank', 'Government 

{'metadata': {'Data': 'Knowledge Graph'},
 'knowledge graph': {'entities': [{'entity': 'UNDP', 'category': 'concept'},
   {'entity': 'renewable energy systems', 'category': 'concept'},
   {'entity': 'just energy transition', 'category': 'concept'},
   {'entity': 'equitable energy access', 'category': 'concept'},
   {'entity': 'clean cooking', 'category': 'concept'},
   {'entity': 'gender equality in energy', 'category': 'concept'},
   {'entity': 'United Nations', 'category': 'International Organization'},
   {'entity': 'World Bank', 'category': 'Financial Institution'},
   {'entity': 'Government of Norway', 'category': 'Government'},
   {'entity': 'European Union', 'category': 'International Organization'},
   {'entity': 'Solar panels', 'category': 'Technology'},
   {'entity': 'Wind turbines', 'category': 'Technology'},
   {'entity': 'Hydroelectric power', 'category': 'Energy source'},
   {'entity': 'Geothermal energy', 'category': 'Energy source'},
   {'entity': 'Renewable energy', 'c

<h4> extract jsons for each entity </h4>

In [10]:
# Function to generate a simplified current date and time string formatted as 'MMMDD_HHMM', with the month as the first three letters of its name.

def generateDatetime():
    current_datetime = datetime.now()
    return current_datetime.strftime("%b%d_%H%M")

# # Example usage
# filename_datetime = generateDatetime()
# filename_datetime

In [12]:
##iterate through keys in entities, and identify all relations for that entity
##generate json including all info about that entity, its relations, all related entitites, and document ids
 
# Folder path with JSON file of merged KG 
#kg_folder_path = '../00_API/00_Merged'##only enable if the KG has been already committed/copied to API

kg_folder_path = "../03_Output/01_Auto KGs/00_Current Versions/"
kg_file_path = "knowledge_graph.json"

# ## Option to copy GPT-KG export to this path (enable these two lines only to replace the active KG version on the API)
# shutil.copy(os.path.join(folder_path, kg_file_path), os.path.join(kg_folder_path, "backup/KG-backup-"+generateDatetime()+".json"))
# ## This copies the existing file into the backup folder
# shutil.copy('../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json', os.path.join(folder_path, kg_file_path))

# Generate dictionary of entity-wise relations
entity_relations = defaultdict(list) # Initialize 

with open(os.path.join(kg_folder_path, kg_file_path), 'r') as file:

    data = json.load(file)

    entities = data['knowledge graph']['entities']
    relations = data['knowledge graph']['relations']

    for entity_info in entities:
        entity = entity_info['entity']

        if entity in relations:
            entity_relations[entity].extend(relations[entity])

            
# Define output path and delete all existing by-entity exports
output_folder_path = kg_folder_path+'01_By-Entity'
for file in glob.glob(output_folder_path+"/*.json"):
    print(file)
    try:
        os.remove(file)
    except OSError as e:
        print(f"Error: {e.strerror}. File: {file}")

# Create separate JSON files for each entity's relations
for entity, relations in entity_relations.items():
    entity_file = f'{entity}.json'
    entity_data = {"metadata": {}, "knowledge graph": {"entities": [], "relations": {entity: relations}}}
    
    with open(os.path.join(output_folder_path, entity_file), 'w') as outfile:
        json.dump(entity_data, outfile, indent=4)

print("Entity files created.")


../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Wind speed.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Generator.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Hazardous waste.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Universities.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Nitrous oxide.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Photovoltaic cells.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Natural Gas.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Isobutane.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/renewable energy certificate.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Internet of Things.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Employee.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/algae.json
../03_Output/01_Auto KGs/00_Current Versions/01_By-Entity/Temperature gradie