# This notebook creates and updates the KG based on provided entity list in 02_Input
--> Run all the initialization cells at the start
--> Run the 01: Pipeline to update the entities and relations files in '03_Output/00_GPT KGs'
--> Run the 02: Optional pipeline to identify any relations missing from the created KG files due to model errors, and fill them in

In [1]:
import pandas as pd
import os
from openai import AzureOpenAI
import openai
import json
from dotenv import load_dotenv
from fuzzywuzzy import fuzz
import Levenshtein as lev
import time
import csv


 Set up the variables

In [10]:
# Load variables from a local .env file so the os.getenv() lookups in the following cells can resolve them.
load_dotenv()

True

In [11]:
# Copy the project-specific settings into the environment variable names used below.
# os.getenv() returns None for a missing variable, and assigning None into
# os.environ raises TypeError, so every variable read here must be defined.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = os.getenv("api_version")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("api_key_azure")

# OpenAI API configuration
# Module-level settings; get_answer() calls openai.chat.completions.create directly.
# NOTE(review): `api_base` looks like the pre-1.0 SDK attribute name — confirm the
# installed SDK reads it (newer releases appear to use `azure_endpoint`).
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Deployment name passed as `model` in get_answer().
openai_deployment = "sdgi-gpt-35-turbo-16k"

In [12]:
# Explicit Azure OpenAI client built from the same environment settings.
# NOTE(review): `client` is not referenced by any other cell visible in this notebook
# (get_answer() goes through the module-level `openai` API) — confirm it is still needed.
client = AzureOpenAI(
    api_key = os.getenv("api_key_azure"),
    api_version = os.getenv("api_version"),
    azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Embedding engine name read from the environment; not used by the visible cells.
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE")


 Define Labels for KG

In [None]:
# Allowed values for an entity's 'Category'; get_metadata() passes this list to
# get_entity_metadata(), which embeds it in the metadata prompt.
categories = [
    "Person",
    "Location",
    "Organization",
    "Event",
    "Product",
    "Project",
    "Skill",
    "Strategy",
    "Technology",
    "Crisis",
    "Infrastructure"
]

In [None]:
# Allowed values for an entity's 'Dimension'; get_metadata() passes this list to
# get_entity_metadata(), which embeds it in the metadata prompt.
dimensions = [
    "Policy",
    "Technology",
    "Social",
    "Economic",
    "Finance",
    "Intersectionality"
]

In [2]:
# Relation labels offered to the model; passed to create_relations(), which
# embeds the list in the relation-extraction prompt.
relation_labels = [
    "implements",
    "funds",
    "focuses_on",
    "in",
    "partners_with",
    "contributes_to",
    "monitors",
    "targets",
    "addresses",
    "employs",
    "collaborates_with",
    "supports",
    "administers",
    "measures",
    "aligns_with",
    "an_instance_of"
]

In [13]:
def get_answer(prompt):
    """
    Send a single-turn prompt to the configured chat deployment and decode the reply.

    Args:
        prompt (str): Text sent as the only user message.

    Returns:
        dict | list: The reply content decoded with ``json.loads``; the concrete
        type depends on the JSON document the model produced.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=chat_messages,
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)


In [None]:

def get_metadata(start, end, raw_entities):
    """
    Retrieve metadata for a list of entities in chunks.

    The first chunk is ``raw_entities[start:end]``; every following chunk holds
    up to 10 entities. Processing continues until the end of the list.

    Args:
        start (int): The starting index of the first chunk.
        end (int): The (exclusive) ending index of the first chunk.
        raw_entities (list): The list of raw entities to process.

    Returns:
        list: A list of metadata records for the entities.

    Raises:
        KeyError: If the model returns a JSON object without an "entities" key.
    """
    metadata = []
    total = len(raw_entities)
    while start < total:
        print(start)  # progress indicator: index of the chunk being requested
        ls = raw_entities[start:end]
        result = get_entity_metadata(ls, categories, dimensions)
        # The prompt asks for a bare JSON array, but the reply may also arrive
        # wrapped as {"entities": [...]}; accept both shapes.
        if isinstance(result, dict):
            result = result["entities"]
        metadata.extend(result)
        start = end
        end = min(end + 10, total)
    return metadata

In [None]:
def get_relations(start, end, raw_entities, relation_labels=None):
    """
    Retrieve relations for a list of entities in chunks.

    The first chunk is ``raw_entities[start:end]``; every following chunk holds
    up to 2 entities. Processing continues until the end of the list, and the
    elapsed time is printed when done.

    Args:
        start (int): The starting index of the first chunk.
        end (int): The (exclusive) ending index of the first chunk.
        raw_entities (list): The list of raw entities to process.
        relation_labels (list, optional): The list of relation labels to use for
            creating relations. Defaults to the notebook-level ``relation_labels``.

    Returns:
        list: A list of relations for the entities.
    """
    if relation_labels is None:
        # The parameter shadows the notebook-level list, so look it up explicitly.
        relation_labels = globals()["relation_labels"]

    relations = []
    total = len(raw_entities)
    start_time = time.time()
    while start < total:
        ls = raw_entities[start:end]
        result = create_relations(ls, relation_labels)
        relations.extend(result)
        start = end
        end = min(end + 2, total)

    print("--- %s seconds ---" % (time.time() - start_time))
    return relations


In [None]:
def extract_synonyms_dict(csv_file_path):
    """
    Extract a dictionary of synonyms from a CSV file. This is used to replace the
    non-entity objects with matched entities.

    The file is read as a semicolon-delimited CSV with 'Entity' and 'Synonyms'
    columns; the 'Synonyms' cell itself holds a semicolon-separated list.

    Args:
        csv_file_path (str): The path to the CSV file containing the synonyms.

    Returns:
        dict: A dictionary where each key is a synonym and the corresponding
        value is the entity. Blank synonyms are ignored.
    """
    synonyms_dict = {}
    # newline='' lets the csv module handle line endings itself, which it needs
    # to parse quoted fields that contain line breaks.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for row in reader:
            entity = row['Entity']
            synonyms = row['Synonyms']
            if not synonyms:
                continue
            for synonym in synonyms.split(';'):
                synonym = synonym.strip()
                # A trailing or doubled ';' yields an empty piece; do not
                # register '' as a synonym.
                if synonym:
                    synonyms_dict[synonym] = entity

    return synonyms_dict

 Set up the prompts

In [None]:
def get_entity_metadata(entities, category_list, dimension_list):
    """
    Ask the model to generate knowledge-graph metadata for a batch of entities.

    Args:
        entities (list): Entity names to describe.
        category_list (list): Allowed values for the 'Category' attribute.
        dimension_list (list): Allowed values for the 'Dimension' attribute.

    Returns:
        The parsed JSON reply from get_answer() for the metadata prompt.
    """
    # The dimension list is introduced with the >>>>>DIMENSIONS<<<<< marker the
    # task text refers to (it was previously labelled as a second CATEGORIES list).
    metadata_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH).
    You will be given an >>>>>EntityList<<<<<, a list of >>>>>CATEGORIES<<<<< and a list of >>>>>DIMENSIONS<<<<<.

   [Task]
   Your task is to create a metadata for each given entity so they can be added to knowledge graph and return a json array.
   You have to add the following attributes to each entity:
    1. 'Entity': The name of given entity.
    2. 'Description': containing summary of the entity.
    3. 'Category': from the given >>>>>CATEGORIES<<<<<.
    4. 'Tags': Multiple tags from a broader taxonomy suitable for entity.
    5. 'Dimension': from the given >>>>>DIMENSIONS<<<<<.
    6. 'Acronym': for the entity if it exists.
    7. 'Importance': Score between 1 to 4 to each entity based on its importance in context of sustainable energy, with 4 being the highest score.

    [Output Format]
    Return only a JSON array for entities with metadata.

    Now create a json array with metadata for the following entities:

    [Input]
    >>>>>EntityList<<<<<
    {entities}

    >>>>>CATEGORIES<<<<<
    {category_list}

    >>>>>DIMENSIONS<<<<<
    {dimension_list}

"""

    metadata = get_answer(metadata_prompt)
    return metadata


In [32]:
def create_relations(entities, relation_labels):
    """
    Ask the model to generate knowledge-graph relations for the given entities.

    Args:
        entities: Entity name or list of entity names to build relations for.
        relation_labels (list): Labels the model may use for the 'Relation' attribute.

    Returns:
        The parsed JSON reply from get_answer() for the relation prompt. The
        reply is also printed.
    """
    # The task text uses the same >>>>>RelationLabels<<<<< marker as the intro
    # and the input section, so the model sees one consistent name for the list.
    relation_extraction_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy.
    You will be given an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   Your task is to extract a set of 8-9 relations for each given entity. These relations will be added to the knowledge graph. Each relation will create following attributed:
    1. 'Subject': subject entity of the relation.
    2. 'Object': object entity of the relation.
    3. 'Relation': label from the given >>>>>RelationLabels<<<<<.
    4. 'Description': description of the relation.
    5. 'Importance': score between 1 to 4 to each relation based on its importance in context of sustainable energy, with 4 being the highest score.

    [Example]
    "Subject":"energy efficiency policy",
    "Relation": "implements",
    "Object": "building standards",
    "Description": "Energy efficiency policy implements rigorous building standards that promote the use of energy-saving designs and materials.",
    "Importance": 4

    [Output Format]
    Return only a JSON array for relations.

    Now create a json array of relations for the following entities. Do not create more than 9 relations for each entity:

    [Input]
    >>>>>EntityList<<<<<
    {entities}

    >>>>>RelationLabels<<<<<
    {relation_labels}

"""

    relations = get_answer(relation_extraction_prompt)
    print(relations)
    return relations


# 01: Pipeline to add the new entities and relations to the KG

In [44]:
# Entity names supplied in the semicolon-delimited input list
file_path = '../02_Input/00_Manual-KGs/Entity List.csv'
raw_data = pd.read_csv(file_path, sep=';')
raw_entities = raw_data['Entity'].str.strip()

# Entity names already present in the generated KG output
file_path = '../03_Output/00_GPT KGs/Entities.csv'
ent_lst = pd.read_csv(file_path, sep=';')
entities = ent_lst['Entity'].str.strip().tolist()

Check if there are new entities, and generate metadata and relations

In [None]:
# Entities from the input list that are not yet part of the KG
known_entities = set(entities)
new_entities = [item for item in raw_entities if item not in known_entities]

new_metadata = []
new_relations = []
# Later cells read `metadata_json` / `relations_json`; keep them defined (as
# lists of records) even when there is nothing new to process.
metadata_json = []
relations_json = []

# Call the pipeline functions.
# get_metadata / get_relations walk the whole list in chunks themselves, starting
# with new_entities[start:end], so a single call to each is enough. Their results
# are already-parsed Python lists and must not be passed through json.loads again.
start = 0
end = 5
if new_entities:
    metadata_json = get_metadata(start, end, new_entities)
    relations_json = get_relations(start, end, new_entities)
    new_metadata.extend(metadata_json)
    new_relations.extend(relations_json)

Fuzzy matching to identify synonyms for the new objects in the KG

In [46]:
# Load the stored entity records; the `with` block closes the file on exit
with open('../03_Output/00_GPT KGs/Entities.json', "r") as file:
    ent = file.read()
all_entities = json.loads(ent)

In [76]:
# Load the stored relation records; the `with` block closes the file on exit
with open('../03_Output/00_GPT KGs/Relations.json', "r") as file:
    rel = file.read()
all_relations = json.loads(rel)

In [106]:
# Tabular views of the loaded KG records (one row per relation / per entity)
relations_df = pd.DataFrame(all_relations)
metadata_df = pd.DataFrame(all_entities)

In [107]:
# Compare each object in the relations with the existing entities.
# If the object is not itself an entity, store it as a synonym for the closest matching entity.

def find_synonyms(objects, entity_names, min_score=75):
    """
    Group relation objects under the entity they most closely resemble.

    Args:
        objects (iterable): Object names taken from the relations.
        entity_names (iterable): Known entity names.
        min_score (int): An object is kept only when its best fuzz.ratio score is
            strictly above this value and below 100.

    Returns:
        dict: Maps an entity name to the list of lower-cased objects treated as
        its synonyms, in order of first appearance and without duplicates.
    """
    # Lower-case every entity once instead of once per object; skip non-string
    # values (e.g. NaN from empty cells), which have no .lower().
    candidates = [(name, name.lower()) for name in entity_names if isinstance(name, str)]
    found = {}
    if not candidates:
        return found

    known_lowered = {lowered for _, lowered in candidates}
    already_scored = set()
    for obj in objects:
        if not isinstance(obj, str):
            continue
        obj_lowered = obj.lower()
        # An exact (case-insensitive) entity name is not a synonym, and an object
        # seen before would only reproduce the same result.
        if obj_lowered in known_lowered or obj_lowered in already_scored:
            continue
        already_scored.add(obj_lowered)

        scores = [fuzz.ratio(obj_lowered, lowered) for _, lowered in candidates]
        score = max(scores)
        match = candidates[scores.index(score)][0]  # first entity with the best score

        if min_score < score < 100:
            found.setdefault(match, []).append(obj_lowered)
    return found


objects = relations_df['Object']
synonyms = find_synonyms(objects, entities)

print(synonyms)


{'energy efficiency': ['energy efficiency policy', 'energy efficiency programs', 'energy efficiency goals', 'energy efficiency measures', 'energy efficiency targets'], 'renewable energy agencies': ['renewable energy projects', 'renewable energy generation', 'renewable energy mapping', 'renewable energy access', 'renewable energy research', 'renewable energy technologies', 'renewable energy capacity', 'renewable energy sources', 'renewable energy transition', 'renewable energy market', 'renewable energy companies', 'renewable energy adoption', 'renewable energy providers', 'renewable energy standards', 'renewable energy incentives', 'renewable energy performance', 'renewable energy developers', 'renewable energy goals', 'renewable energy experts', 'renewable energy targets'], 'energy markets': ['energy startups', 'energy market', 'energy market barriers'], 'energy poverty indices': ['energy poverty'], 'energy infrastructure services': ['energy infrastructure planning'], 'geospatial info

In [108]:
# Attach the synonym list found for each entity ("" when there is none).
# The lookup goes through metadata_df's own 'Entity' column, stripped like the keys
# of `synonyms`, so each row gets its own entity's synonyms without depending on
# Entities.csv and Entities.json having identical row order.
metadata_df['Synonyms'] = metadata_df['Entity'].str.strip().map(synonyms).fillna("")

In [110]:
def _join_values(value):
    """Render a list-like cell as '; '-separated text; return any other value unchanged."""
    if isinstance(value, (list, tuple, set)):
        return '; '.join(str(item) for item in value)
    # Already-flat cells (e.g. text joined by a previous run, or '') must be left
    # alone: '; '.join on a string would split it into single characters.
    return value


metadata_df = metadata_df.fillna('')
for column in ('Tags', 'Synonyms'):
    metadata_df[column] = metadata_df[column].apply(_join_values)
print (metadata_df)

                             Entity  \
0                    digital cities   
1                   data ecosystems   
2                       data equity   
3    AI for smart grid optimization   
4                  digital literacy   
..                              ...   
198               carbon neutrality   
199                     blue carbon   
200            passive solar design   
201                vertical funding   
202            energy project cycle   

                                           Description    Category  \
0    A concept that refers to the use of digital te...  Technology   
1    Refers to the interconnected network of indivi...  Technology   
2    The principle of ensuring fair and equal acces...      Social   
3    The application of artificial intelligence tec...  Technology   
4    The ability to use digital technologies and to...       Skill   
..                                                 ...         ...   
198  Carbon neutrality refers to achievin

In [112]:
# Persist the entity table as a semicolon-delimited CSV
metadata_df.to_csv('../03_Output/00_GPT KGs/Entities.csv', sep=';', index=False)

# Round-trip through to_json so the records are plain JSON values before pretty-printing
entity_records = json.loads(metadata_df.to_json(orient="records"))
json_entities = json.dumps(entity_records, indent = 2)

# Write the entities to the JSON file; the `with` block closes it on exit
with open('../03_Output/00_GPT KGs/Entities.json', "w") as output_file:
    output_file.write(json_entities)

# Write the relations and entities to json and csv

In [None]:
def _join_list_cell(value):
    """Render a list-like cell as '; '-separated text; return any other value unchanged."""
    if isinstance(value, (list, tuple, set)):
        return '; '.join(str(item) for item in value)
    return value


# Table of the newly generated entity metadata only
metadata_df = pd.DataFrame(metadata_json)

if metadata_df.empty:
    print("No new entity metadata to append.")
else:
    # The metadata prompt does not request a 'Synonyms' attribute, so the column
    # is normally absent from the model output.
    if 'Synonyms' not in metadata_df.columns:
        metadata_df['Synonyms'] = ''
    for column in ('Tags', 'Synonyms'):
        metadata_df[column] = metadata_df[column].apply(_join_list_cell)

    # Left-join from the *new* entities so each of them gets a row even when the
    # model returned no metadata for it. Joining from the full input list would
    # append blank duplicate rows for entities already in the file.
    final_df = pd.merge(pd.DataFrame({'Entity': new_entities}), metadata_df, on='Entity', how='left')
    final_df = final_df.fillna('')
    # Entities.csv is read and written with ';' elsewhere in this notebook.
    # NOTE(review): rows are appended without a header, so the column order must
    # match the existing file — confirm.
    final_df.to_csv('../03_Output/00_GPT KGs/Entities.csv', mode='a', sep=';', index=False, header = False)

In [None]:
# Get the current json file and append the new entities
with open('../03_Output/00_GPT KGs/Entities.json', "r") as file:
    ent = file.read()
all_entities = json.loads(ent)

all_entities.extend(metadata_json)

# Serialise into its own variable. Rebinding `metadata_json` to the JSON text would
# turn it from a list into a string, and re-running this cell would then extend
# `all_entities` with single characters.
json_entities = json.dumps(all_entities, indent=2)

with open('../03_Output/00_GPT KGs/Entities.json', "w") as output_file:
    output_file.write(json_entities)

In [None]:
# Append only the newly generated relations. Before this cell `relations_df` holds
# every relation loaded from Relations.json, so appending it would duplicate the
# whole file.
# NOTE(review): rows are appended without a header and with ',' as separator —
# confirm this matches the delimiter and column order of the existing Relations.csv.
relations_df = pd.DataFrame(relations_json).fillna('')
relations_df.to_csv('../03_Output/00_GPT KGs/Relations.csv', mode='a', sep=',', index=False, header = False)

In [10]:
# Re-read the stored relations so the new ones can be appended to them
with open('../03_Output/00_GPT KGs/Relations.json', "r") as file:
    rel = file.read()
all_relations = json.loads(rel)

In [9]:
# Add the new relations to the stored ones and write the combined list back
all_relations.extend(relations_json)
json_relations = json.dumps(all_relations, indent=2)

with open('../03_Output/00_GPT KGs/Relations.json', "w") as output_file:
    output_file.write(json_relations)

# 02: (Optional) Identify any files with missing relations and fill them

In [37]:
# Directory whose entries are scanned by the next cell (presumably one JSON file
# per entity — confirm). os.listdir returns bare names, not full paths.
path = '../03_Output/01_Auto KGs/00_Current Versions'
files = os.listdir(path)

In [38]:
# For every KG file whose 'knowledge graph' has an empty entity list, generate the
# relations again, store them in the file and collect them for the combined outputs.
missing_jsons = []
missing_relations = []
for f in files:
    # Skip OS metadata, the combined graph and anything that is not a JSON file
    if f == '.DS_Store' or f == 'knowledge_graph.json' or not f.endswith('.json'):
        continue
    print(f)
    kg_path = os.path.join(path, f)
    with open(kg_path, "r") as file:
        json_data = json.loads(file.read())

    if json_data['knowledge graph']['entities'] != []:
        continue

    metadata = json_data['metadata']
    relations = create_relations(metadata['Entity'], relation_labels)

    # Per-file relation records omit 'Subject': they are stored under the entity's name
    entity_relations = [
        {
            'Relation': rel['Relation'],
            'Object': rel['Object'],
            'Description': rel['Description'],
            'Importance': rel['Importance'],
        }
        for rel in relations
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the written
    # file is stable across runs (iteration order of a set of strings is not).
    object_lst = list(dict.fromkeys(rel['Object'] for rel in relations))

    json_data['knowledge graph']['entities'] = object_lst
    json_data['knowledge graph']['relations'] = {metadata['Entity']: entity_relations}
    with open(kg_path, "w") as file:
        file.write(json.dumps(json_data, indent=2))

    missing_jsons.append(json_data)
    missing_relations.extend(relations)


blue economy.json
internet of energy.json
decentralized energy systems.json
Energy For Development.json
free prior and informed consent.json
technology transfer.json
multi-stakeholder partnerships.json
pumped hydro storage.json
digital twins.json
land use planning.json
reliance of fuel imports.json
energy infrastructure services.json
appropriate and independent oversight.json
hybrid energy storage systems.json
[{'Subject': 'hybrid energy storage systems', 'Relation': 'implements', 'Object': 'energy storage technologies', 'Description': 'Hybrid energy storage systems implement various energy storage technologies to optimize energy usage and improve efficiency.', 'Importance': 4}, {'Subject': 'hybrid energy storage systems', 'Relation': 'funds', 'Object': 'research and development', 'Description': 'Hybrid energy storage systems provide funding for research and development of new energy storage technologies and solutions.', 'Importance': 3}, {'Subject': 'hybrid energy storage systems', 'R

In [42]:
# Append the regenerated relations to the combined CSV (no header row is written)
relations_df = pd.DataFrame(missing_relations).fillna('')
relations_df.to_csv(
    '../03_Output/00_GPT KGs/Relations.csv',
    mode='a',
    sep=',',
    index=False,
    header=False,
)


In [43]:
# Read the combined relations, add the regenerated ones and write the file back
with open('../03_Output/00_GPT KGs/Relations.json', "r") as file:
    rel = file.read()
all_relations = json.loads(rel)

all_relations.extend(missing_relations)
json_relations = json.dumps(all_relations, indent=2)

with open('../03_Output/00_GPT KGs/Relations.json', "w") as output_file:
    output_file.write(json_relations)