# This notebook creates the entity JSON files for subgraphs and stores them in 00_API

In [1]:
import pandas as pd
import os
from openai import AzureOpenAI
import openai
import json
from dotenv import load_dotenv

import csv
from collections import defaultdict


In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Mirror the project-specific variable names onto the names set below.
# Assigning None to os.environ raises TypeError, so every variable read here
# must be defined in the environment / .env file.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = os.getenv("api_version")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("api_key_azure")

# OpenAI API configuration
# Module-level settings; get_answer() calls openai.chat.completions.create directly.
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
# NOTE(review): `api_base` looks like the pre-1.0 SDK setting name — confirm it
# has any effect with the installed SDK version.
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Deployment name passed as `model` in get_answer().
openai_deployment = "sdgi-gpt-35-turbo-16k"
# Explicit Azure client built from the same settings.
# NOTE(review): `client` is not referenced anywhere else in the visible notebook.
client = AzureOpenAI(
    api_key=os.getenv("api_key_azure"),
    api_version=os.getenv("api_version"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# NOTE(review): `embedding_model` is not referenced anywhere else in the visible notebook.
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE")


 Initialize the functions

In [52]:
def get_answer(prompt):
    """Send ``prompt`` as a single user message and return the reply parsed as JSON.

    The request goes to the deployment named by the module-level
    ``openai_deployment`` with ``temperature=0``.

    Args:
        prompt: Full prompt text; it must instruct the model to answer in JSON.

    Returns:
        The decoded JSON value (for the prompts in this notebook, a dict).

    Raises:
        ValueError: If the reply carries no text content, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the text is
            not valid JSON.
    """
    response_entities = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    content = response_entities.choices[0].message.content
    if content is None:
        raise ValueError("The model returned no message content to parse as JSON.")

    text = content.strip()
    if text.startswith("```"):
        # Chat models frequently wrap JSON in a Markdown code fence
        # (``` or ```json); drop the opening fence line and the closing fence.
        first_newline = text.find("\n")
        text = text[first_newline + 1:] if first_newline != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return json.loads(text)

def find_max_scores(lst, threshold=10):
    """Return the positions of every score in ``lst`` strictly above ``threshold``.

    Despite the name, this does not return only the maximum: every element
    greater than the cut-off is reported.

    Args:
        lst: Iterable of numeric scores.
        threshold: Exclusive lower bound a score must exceed to be kept.
            Defaults to 10, the cut-off previously hard-coded here.

    Returns:
        List of integer positions, in ascending order; empty if nothing qualifies.
    """
    return [position for position, score in enumerate(lst) if score > threshold]


In [58]:
# Prompt to create subelements for each entity

def create_subelements(entity):
    """Ask the model for up to five sub-elements of ``entity`` and their relations.

    Builds the sub-element extraction prompt around ``entity`` and passes it to
    ``get_answer``.

    Args:
        entity: Name of the entity to decompose.

    Returns:
        Whatever ``get_answer`` returns for the prompt; the prompt requests a
        JSON object with the arrays 'sub-elements' and 'relations'.
    """
    # The worked example inside the prompt must itself be valid JSON (each
    # relation is an object inside the 'relations' array), because the model
    # imitates the example. Literal braces are doubled for the f-string.
    subelement_relations_prompt = f"""
    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy. Your expertise in sustainable energy and knowledge extraction is critical for this task.

    [Task]
    You will be given an entity related to sustainable energy. Your task is to identify if the given entity has any sub-elements. A sub-element can be a sub-component, part, type, category, or example of the parent entity.

    1. If no sub-elements exist, return an empty JSON array for both 'sub-elements' and 'relations'.
    2. If sub-elements exist, extract a maximum of 5 relevant sub-elements. For each sub-element, create a relationship with the parent entity.

    Each relationship should have the following attributes:
    1. 'Sub-element': The sub-component, part, type, category, or example of the entity.
    2. 'Relation': The fixed phrase 'is a sub-element of'.
    3. 'Parent': The given entity (subject) of the relation.
    4. 'Description': A explanation of the relationship, including how the sub-element fits or functions as part of the parent entity.
    5. 'Importance': A score between 1 to 4 indicating the importance of this relationship in the context of sustainable energy, with 4 being the highest score.

    [Example]
    For the entity "UNDP accelerator labs":
    {{
        "sub-elements": [
            "solutions mapping"
        ],
        "relations": [
            {{
                "Sub-element": "solutions mapping",
                "Relation": "is a sub-element of",
                "Parent": "UNDP accelerator labs",
                "Description": "Solutions mapping is a fundamental aspect of the UNDP Accelerator Labs, involving the identification, mapping, and analysis of grassroots solutions and innovations within the community. It aims to understand local contexts and leverage indigenous knowledge to address development challenges.",
                "Importance": 4
            }}
        ]
    }}

    [Output Format]
    Return a JSON object with two arrays: 'sub-elements' and 'relations'. Each element in the 'relations' array should be a JSON object with the specified attributes.

    Now, create a JSON object for the following entity:
    [Input]
    >>>>>> Entity <<<<<<
    {entity}
"""

    relations = get_answer(subelement_relations_prompt)
    return relations


In [12]:
# Read the knowledge-graph nodes from knowledge_graph_2.json. The with-block
# closes the handle on exit, so no explicit close() call is needed.
with open('../03_Output/01_Auto KGs/02_Replaced Relations/knowledge_graph_2.json', "r") as file:
    data = file.read()
kg_nodes = json.loads(data)

# Semicolon-separated relation table used to look up level 2 / level 3 relations.
file_path = '../03_Output/00_GPT KGs/Relations_replaced.csv'
rel_lst = pd.read_csv(file_path, sep=';')





In [8]:
# Function to get nested level of relations for a parent node
def get_level_relations(parent_score, child_df):
    """Keep the child relations whose combined importance passes ``find_max_scores``.

    Each child's 'Importance' is multiplied by ``parent_score``; the rows whose
    product is selected by ``find_max_scores`` are returned.

    Args:
        parent_score: Importance score of the parent relation.
        child_df: DataFrame of candidate child relations; must contain an
            'Importance' column.

    Returns:
        A two-element list ``[relations_js, relations_df]``: the selected rows
        as JSON-style record dicts, and the same rows as a DataFrame indexed by
        their position within ``child_df``.
    """
    # Positional index so the positions from find_max_scores match .iloc and `rows`.
    child_df = child_df.reset_index(drop=True)
    # Round-trip through JSON so records hold plain Python values (NaN -> None).
    rows = json.loads(child_df.to_json(orient="records"))
    combined_scores = [parent_score * val for val in child_df['Importance'].tolist()]
    indexes = find_max_scores(combined_scores)
    relations_df = child_df.iloc[indexes]
    relations_js = [rows[val] for val in indexes]
    return [relations_js, relations_df]

In [9]:
def unique_subjects(input_list, n, level):
    """Trim ``input_list`` to at most ``n`` relations, limiting repeats per subject.

    Lists that already fit (``len(input_list) <= n``) are returned unchanged.
    Otherwise items are taken in order until ``n`` are collected, allowing at
    most two items per 'Subject' for level 2 and one per 'Subject' for level 3.
    Any other level yields an empty list.

    Args:
        input_list: Relation dicts, each with a 'Subject' key.
        n: Maximum number of relations to keep.
        level: Nesting level (2 or 3) that determines the per-subject cap.

    Returns:
        ``input_list`` itself when it is short enough, else a new list.
    """
    if len(input_list) <= n:
        return input_list

    if level == 2:
        per_subject_cap = 2
    elif level == 3:
        per_subject_cap = 1
    else:
        return []

    picked = []
    taken_per_subject = {}
    for entry in input_list:
        key = entry['Subject']
        if len(picked) >= n:
            break
        already_taken = taken_per_subject.get(key, 0)
        if already_taken < per_subject_cap:
            picked.append(entry)
            taken_per_subject[key] = already_taken + 1
    return picked

Run the cell below to generate the nested relation levels and sub-elements for all entities.
Tip: to process only a subset of the entities, slice `kg_nodes` in the loop (e.g. `kg_nodes[:20]` for the first 20 entities).

In [38]:
# Accumulator for the enriched entity nodes appended by the loop cell below.
# Re-run this cell before re-running the loop, otherwise nodes are appended twice.
output = []

In [None]:

# Lower-cased 'Subject' column, computed once instead of on every lookup
# (non-string cells such as NaN are left untouched and never match).
subjects_lower = rel_lst['Subject'].apply(lambda x: x.lower() if isinstance(x, str) else x)

for item in kg_nodes: # Setting kg_nodes[:20] will generate json files for first 20 entities
    relations_2 = []        # set of derived level 2 relations
    relations_3 = []        # set of derived level 3 relations
    new_node = {}
    subject = item['metadata']['Entity']
    relations = item['knowledge graph']['relations'][subject]
    new_node['metadata'] = item['metadata']
    new_node['knowledge graph'] = {}
    new_node['knowledge graph']['entities'] = item['knowledge graph']['entities']

    for rel in relations:
        obj = rel['Object']
        # Level 2: relations whose subject is the object of a level 1 relation.
        level2 = rel_lst.loc[subjects_lower == obj.lower()]
        if len(level2) != 0:
            lst2 = get_level_relations(rel['Importance'], level2)
            relations_2.extend(lst2[0])

            # Level 3: expand only the level 2 relations found for THIS level 1
            # relation. Walking the whole accumulated relations_2 list here
            # would re-add earlier level 3 relations on every pass and score
            # them against an unrelated level 1 relation.
            for rel2 in lst2[0]:
                level3 = rel_lst.loc[subjects_lower == rel2['Object'].lower()]
                if len(level3) != 0:
                    # NOTE(review): the parent score passed here is the level 1
                    # importance, not rel2['Importance'] — confirm this is intended.
                    lst3 = get_level_relations(rel['Importance'], level3)
                    relations_3.extend(lst3[0])

    # select different relations in levels
    selected_relations_2 = unique_subjects(relations_2, 8, 2)
    selected_relations_3 = unique_subjects(relations_3, 4, 3)

    new_node['knowledge graph']['relations'] = {}
    new_node['knowledge graph']['relations']['level 1'] = relations
    new_node['knowledge graph']['relations']['level 2'] = selected_relations_2
    new_node['knowledge graph']['relations']['level 3'] = selected_relations_3

    # get subgroups for the entity (one model call per entity)
    gpt_output = create_subelements(subject)
    subelements = gpt_output['sub-elements']
    print (gpt_output)
    subelement_relations = gpt_output['relations']
    new_node['knowledge graph']['sub-elements'] = subelements
    new_node['knowledge graph']['subelement_relations'] = subelement_relations
    output.append(new_node)


In [64]:
# Sanity check: number of entity nodes collected in `output`.
print (len(output))

202


In [67]:
# Serialise all enriched nodes into a single pretty-printed JSON document.
kg_json = json.dumps(output, indent=2)

# The with-block closes the handle on exit; no explicit close() is required.
with open('../03_Output/01_Auto KGs/02_Replaced Relations/Nested Relations_3.json', 'w') as file:
    file.write(kg_json)

In [68]:
# Write one JSON file per entity into ../00_API, named after the entity.
for item in output:
    name = item['metadata']['Entity']
    # A path separator inside an entity name would be read as a directory
    # boundary and break (or misplace) the write, so replace it in the file
    # name only; the entity name stored inside the JSON is left untouched.
    safe_name = name
    for separator in (os.sep, os.altsep):
        if separator:
            safe_name = safe_name.replace(separator, '_')
    ent_json = json.dumps(item, indent = 2)
    # The with-block closes the file; no explicit close() is needed.
    with open('../00_API/'+ safe_name +'.json', 'w') as file:
        file.write(ent_json)


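# Replacing fuzzy matched entities

Run this section before the nested-relations cells above: it writes `Relations_replaced.csv`, which those cells read.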

In [5]:
# Semicolon-separated entity table; extract_synonyms_dict() below reads the
# 'Entity' and 'Synonyms' columns from this same path.
ent_path = '../03_Output/00_GPT KGs/Entities.csv'
entities = pd.read_csv(ent_path, delimiter=';')

In [6]:
# Relation table before synonym replacement.
# NOTE(review): this cell parses the file with ';' while
# update_csv_and_create_json() reads the same path with ',' by default —
# confirm which delimiter the file actually uses.
# NOTE(review): the name `relations` is also assigned inside the nested-relations
# loop cell, so its value depends on which cell ran last.
rel_path = '../03_Output/00_GPT KGs/Relations.csv'
relations = pd.read_csv(rel_path, delimiter=';')

In [7]:

def extract_synonyms_dict(csv_file_path):
    """Build a synonym -> entity lookup from a semicolon-delimited CSV file.

    The file must have the columns 'Entity' and 'Synonyms'. The 'Synonyms'
    cell holds zero or more synonyms separated by ';'. Synonyms are stripped
    of surrounding whitespace and otherwise kept verbatim (no case folding);
    blank entries, e.g. from a trailing ';', are ignored. If a synonym occurs
    for several entities, the last row wins.

    Args:
        csv_file_path: Path to the UTF-8 encoded CSV file.

    Returns:
        Dict mapping each synonym to the value of its row's 'Entity' column.

    Raises:
        KeyError: If a row lacks the 'Entity' or 'Synonyms' column.
    """
    synonyms_dict = {}

    # newline='' lets the csv module handle line endings itself, as its docs require.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for row in reader:
            entity = row['Entity']
            synonyms = row['Synonyms']
            if not synonyms:
                continue
            # Split the synonyms by semicolon and strip any leading/trailing whitespace
            for synonym in synonyms.split(';'):
                synonym = synonym.strip()
                if synonym:  # skip empty fragments so '' never becomes a key
                    synonyms_dict[synonym] = entity

    return synonyms_dict



In [8]:
# Build the synonym -> entity lookup from the entities CSV.
synonyms_dict = extract_synonyms_dict(ent_path)
# Prints the complete mapping; the output can be very long.
print(synonyms_dict)

{'data security': 'data equity', 'data quality': 'data equity', 'internet of things': 'internet of energy', 'net zero energy': 'internet of energy', 'geographic information system': 'geospatial information systems', 'energy planning': 'energy modeling', 'inclusive finance': 'digital inclusive finance', 'renewable energy system': 'decentralized energy systems', 'energy efficiency policy': 'energy efficiency', 'energy efficiency programs': 'energy efficiency', 'energy efficiency goals': 'energy efficiency', 'energy efficiency performance': 'energy efficiency', 'energy efficiency initiatives': 'energy efficiency', 'energy efficiency measures': 'energy efficiency', 'energy efficiency improvements': 'energy efficiency', 'energy efficiency targets': 'energy efficiency', 'carbon footprint': 'carbon pricing', 'carbon pricing mechanisms': 'carbon pricing', 'circular economy principles': 'circular economy', 'circular economy solution': 'circular economy', 'circular economy goals': 'circular econ

In [9]:
def update_csv_and_create_json(csv_file_path, updated_csv_file_path, json_file_path, synonyms_dict,
                               input_delimiter=',', output_delimiter=';'):
    """Normalise the 'Object' column of a relations CSV and export CSV + JSON copies.

    Every 'Object' value is lower-cased; if the lower-cased value is a key of
    ``synonyms_dict`` it is replaced by the mapped entity. All other columns
    are copied unchanged. The rows are written to ``updated_csv_file_path``
    and, as a list of objects, to ``json_file_path``.

    Args:
        csv_file_path: Path of the UTF-8 source CSV; must have an 'Object' column.
        updated_csv_file_path: Path of the CSV file to write.
        json_file_path: Path of the JSON file to write.
        synonyms_dict: Mapping from (lower-case) synonym to canonical entity.
        input_delimiter: Field delimiter of the source CSV. Defaults to ','.
        output_delimiter: Field delimiter of the written CSV. Defaults to ';'.

    Returns:
        None.

    Raises:
        KeyError: If a row has no 'Object' column (for example when
            ``input_delimiter`` does not match the source file).
    """
    updated_entities = []

    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csvfile, \
         open(updated_csv_file_path, mode='w', encoding='utf-8', newline='') as updated_csvfile:

        reader = csv.DictReader(csvfile, delimiter=input_delimiter)
        fieldnames = reader.fieldnames
        writer = csv.DictWriter(updated_csvfile, fieldnames=fieldnames, delimiter=output_delimiter)
        writer.writeheader()

        for row in reader:
            raw_object = row['Object']
            # DictReader fills missing trailing fields with None; leave those as-is.
            if raw_object is not None:
                entity = raw_object.lower()
                row['Object'] = synonyms_dict.get(entity, entity)

            writer.writerow(row)
            updated_entities.append(row)

    with open(json_file_path, mode='w', encoding='utf-8') as jsonfile:
        json.dump(updated_entities, jsonfile, indent=4)



In [10]:
# Example usage:
# Writes the synonym-replaced relations as CSV and JSON; the CSV written here
# is the file the nested-relations load cell reads into `rel_lst`.

updated_csv_file_path = '../03_Output/00_GPT KGs/Relations_replaced.csv'
json_file_path = '../03_Output/00_GPT KGs/Relations_replaced.json'

update_csv_and_create_json(rel_path, updated_csv_file_path, json_file_path, synonyms_dict)

print("CSV updated and JSON file created successfully.")


CSV updated and JSON file created successfully.
