In [11]:
import os
import openai
from string import Template
import json
from timeit import default_timer as timer
import time
from dotenv import load_dotenv
from time import sleep
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor



### Load Enviroment files

In [2]:
# Load environment variables
load_dotenv()

True

In [3]:
# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"

# openai.api_key = os.getenv("OPENAI_KEY")


# print(openai.api_key)
# print(openai.api_base)
# print(openai.api_version)


In [4]:
#Generate KG by for loop iterations. 
def generate_KG(initial_entities, iterations):
    knowledge_graph = {
        "metadata": {
            "Data": "Knowledge Graph",
        },
        "knowledge graph": {
            "entities": initial_entities,
            "relations": {}
        }
    }
    startTime = time.time()
    processed_entities = set()
    for i in range(iterations):
        print("Iteration:",i,", Entity Count:",len(processed_entities), ", Elapsed Time:", round(time.time()-startTime,2),"s")
        current_entities = knowledge_graph["knowledge graph"]["entities"]
        print([entity["entity"] for entity in current_entities])
        for entity in current_entities:
            if entity["entity"] not in processed_entities:
                new_subgraph = generate_subgraph(entity, 4) 
               # print(new_subgraph)
                processed_entities.add(entity["entity"])
                knowledge_graph = updateKG(knowledge_graph, new_subgraph)
                print(knowledge_graph)
                
    return knowledge_graph


In [13]:
#Make call to openAI for subgraph generation
def process_entity(entity, processed_entities, knowledge_graph):
    if entity["entity"] not in processed_entities:
        new_subgraph = generate_subgraph(entity, 4)
        processed_entities.add(entity["entity"])
        return updateKG(knowledge_graph, new_subgraph)


In [15]:
#Generate KG running in parallel
def generate_KG_parallel(initial_entities, iterations, max_workers=4):
    knowledge_graph = {
        "metadata": {
            "Data": "Knowledge Graph",
        },
        "knowledge graph": {
            "entities": initial_entities,
            "relations": {}
        }
    }
    startTime = time.time()
    processed_entities = set()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []

        for i in range(iterations):
            print("Iteration:", i, ", Entity Count:", len(processed_entities), ", Elapsed Time:", round(time.time() - startTime, 2), "s")
            current_entities = knowledge_graph["knowledge graph"]["entities"]
            print([entity["entity"] for entity in current_entities])

            for entity in current_entities:
                future = executor.submit(process_entity, entity, processed_entities, knowledge_graph)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                result = future.result()
                if result:
                    knowledge_graph = result

    return knowledge_graph

# Example usage
# initial_entities = [
#                 {
#                     "entity": "sustainable energy",
#                     "category": "concept"
#                 }
#             ]  
# knowledge_graph_parallel = generate_KG_parallel(initial_entities, iterations=3, max_workers=10)


Iteration: 0 , Entity Count: 0 , Elapsed Time: 0.0 s
['sustainable energy']
Iteration: 1 , Entity Count: 1 , Elapsed Time: 3.69 s
['Solar power', 'Wind power', 'Hydroelectric power', 'Geothermal energy']
Iteration: 2 , Entity Count: 5 , Elapsed Time: 8.93 s
['Solar power', 'Turbine', 'Grid integration', 'Offshore wind farm']


In [5]:
def generate_subgraph(entity, count):
    # Use the openai api to generate a list (of length 'count') of the most related entities
    
    prompt = f"""
    The entity is {entity['entity']} 
    Give me {count} entities and their relationship to {entity['entity']}
    Output should be in example json format: 
        {{
          "entities": [
            {{
                "entity": "ENTITY2",
                "category": "Catetory"
            }}
            ],
        "relations": {{
                    "{entity['entity']}": [
                        {{
                            "Relation": “RELATIONSHIP”,
                            "Object": “ENTITY2”,
                            "Description": “CONTENT DESCRIPTION”
                        }}
                    ],
        }}
        }}
    Any ENTITY2 used in the relations must be part of the entities array section
    """
    response_entities = openai.chat.completions.create(
                    model=openai_deployment,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
    new_subgraph = response_entities.choices[0].message.content

    return new_subgraph

##example usage
##generate_subgraph() ###explanations of inputs / outputs

In [9]:
# Function to Update the Knowledge Graph
def updateKG(knowledge_graph, new_subgraph_str):
    if isinstance(new_subgraph_str, dict):
        new_subgraph = new_subgraph_str
    else:
        new_subgraph = json.loads(new_subgraph_str)
        
    # Merge entities
    knowledge_graph_entities = knowledge_graph.get("entities", [])
    new_entities = new_subgraph.get("entities", [])
    
    for new_entity in new_entities:
        if new_entity not in knowledge_graph_entities:
            knowledge_graph_entities.append(new_entity)
    
    # Merge relations
    knowledge_graph_relations = knowledge_graph.get("relations", {})
    new_relations = new_subgraph.get("relations", {})
    
    for key, value in new_relations.items():
        if key in knowledge_graph_relations:
            for relation in value:
                if relation not in knowledge_graph_relations[key]:
                    knowledge_graph_relations[key].append(relation)
        else:
            knowledge_graph_relations[key] = value

    # Update the knowledge graph
    knowledge_graph["knowledge graph"]["entities"] = knowledge_graph_entities
    knowledge_graph["knowledge graph"]["relations"] = knowledge_graph_relations


    return knowledge_graph

In [16]:
# Example usage
# Modify initial_entity to allow multiple entities 
entities_arr = [
                {
                    "entity": "sustainable energy",
                    "category": "concept"
                }
            ]
# knowledge_graph = generate_KG(entities_arr,iterations=3)
knowledge_graph = generate_KG_parallel(entities_arr,iterations=3,max_workers=10)
# Write the knowledge graph to a JSON file
output_file = '../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json'
with open(output_file, 'w') as json_file:
    json.dump(knowledge_graph, json_file, indent=4)

print(f"Knowledge graph saved to {output_file}")


Iteration: 0 , Entity Count: 0 , Elapsed Time: 0.0 s
['sustainable energy']
Iteration: 1 , Entity Count: 1 , Elapsed Time: 3.2 s
['Solar power', 'Wind power', 'Hydroelectric power', 'Geothermal energy']
Iteration: 2 , Entity Count: 5 , Elapsed Time: 6.19 s
['Geothermal power plant', 'Geothermal heat pump', 'Geothermal reservoir', 'Geothermal energy association']
Knowledge graph saved to ../03_Output/01_Auto KGs/00_Current Versions/knowledge_graph.json


<h4> extract jsons for each entity </h4>

In [23]:
##iterate through keys in entities, and identify all relations for that entity
##generate json including all info about that entity, its relations, all related entitites, and document ids
 

# Folder path containing JSON files

# Folder path containing JSON files
folder_path = '../00_API/00_Merged'

# Dictionary to hold entity-wise relations
entity_relations = defaultdict(list)

# Loop through JSON files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".json"):
        with open(os.path.join(folder_path, file_name), 'r') as file:
            data = json.load(file)
            
            entities = data['knowledge graph']['entities']
            relations = data['knowledge graph']['relations']
            
            for entity_info in entities:
                entity = entity_info['entity']
                
                if entity in relations:
                    entity_relations[entity].extend(relations[entity])

output_folder_path = '../00_API/01_By-Entity'
# Create separate JSON files for each entity's relations
for entity, relations in entity_relations.items():
    entity_file = f'{entity}.json'
    entity_data = {"metadata": {}, "knowledge graph": {"entities": [], "relations": {entity: relations}}}
    
    with open(os.path.join(output_folder_path, entity_file), 'w') as outfile:
        json.dump(entity_data, outfile, indent=4)

print("Entity files created.")




Entity files created.
