In [2]:
import pandas as pd
import os
from openai import AzureOpenAI
import openai
import json
import requests
from dotenv import load_dotenv
from fuzzywuzzy import fuzz
import Levenshtein as lev
import time


# Set up the variables

In [2]:
# Read settings from a .env file (python-dotenv) so that the os.getenv(...)
# lookups in the following cells can find them.
load_dotenv()

True

In [3]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is not set. Without this check the
            assignment into ``os.environ`` below would fail with an opaque
            "str expected, not NoneType" TypeError.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Required environment variable {name!r} is not set")
    return value


# Read every setting exactly once and reuse it below
_azure_api_version = _require_env("api_version")
_azure_endpoint = _require_env("AZURE_OPENAI_ENDPOINT")
_azure_api_key = _require_env("api_key_azure")

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = _azure_api_version
os.environ["AZURE_OPENAI_ENDPOINT"] = _azure_endpoint
os.environ["AZURE_OPENAI_API_KEY"] = _azure_api_key

# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = _azure_api_key
# Legacy (pre-1.0) attribute name, kept so existing readers of it still work
openai.api_base = _azure_endpoint
# Attribute name used by the module-level client of openai>=1.0
openai.azure_endpoint = _azure_endpoint
openai.api_version = _azure_api_version
openai_deployment = "sdgi-gpt-35-turbo-16k"

In [4]:
# Explicit Azure OpenAI client, configured from the same environment settings
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("api_version"),
    api_key=os.getenv("api_key_azure"),
)

# Embedding engine identifier, read from USER_QUERY_EMBEDDING_ENGINE
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE")


# Define Labels for KG

In [5]:
# Entity categories offered to the model in the metadata prompt
categories = [
    "Person", "Location", "Organization", "Event",
    "Product", "Project", "Skill", "Strategy",
    "Technology", "Crisis", "Infrastructure",
]

In [6]:
# Dimensions offered to the model in the metadata prompt
dimensions = [
    "Policy", "Technology", "Social",
    "Economic", "Finance", "Intersectionality",
]

In [7]:
# Relation labels the model may choose from when extracting relations
relation_labels = [
    "implements", "funds", "focuses_on", "in",
    "partners_with", "contributes_to", "monitors", "targets",
    "addresses", "employs", "collaborates_with", "supports",
    "administers", "measures", "aligns_with", "an_instance_of",
]

In [8]:
# Subgroups the model may assign to a relation
subgroups = [
    "Solar Power", "Wind Energy", "Hydroelectric Power",
    "Geothermal Energy", "Biomass Energy", "Smart Grid Technologies",
    "Energy Storage", "Energy Efficiency", "Microgrids",
    "Electric Vehicles", "Green Buildings", "Carbon Capture and Storage",
    "Energy Analytics", "Sustainable Urban Planning", "Renewable Energy Policies",
]


In [9]:
def capitalize_keys(data):
    """Return new dicts whose keys were passed through ``str.capitalize``.

    ``str.capitalize`` upper-cases the first character and lower-cases the
    rest, so ``"OBJECT"`` becomes ``"Object"``. Values are left untouched.

    Args:
        data: Iterable of dicts with string keys.

    Returns:
        A list with one re-keyed dict per input dict, in the same order.
    """
    return [
        {name.capitalize(): val for name, val in record.items()}
        for record in data
    ]

In [10]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line, which may carry a language tag such as "json"
        first_newline = stripped.find("\n")
        stripped = stripped[first_newline + 1:] if first_newline != -1 else ""
        stripped = stripped.rstrip()
        if stripped.endswith("```"):
            stripped = stripped[:-3]
    return stripped.strip()


def get_answer(prompt):
    """Send ``prompt`` to the chat deployment and return the decoded JSON reply.

    Args:
        prompt: Full prompt text; it is sent as a single user message.

    Returns:
        The Python object (list or dict) decoded from the model's reply.

    Raises:
        ValueError: If the reply carries no content or is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    response_entities = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    content = response_entities.choices[0].message.content
    if content is None:
        # Fail with a readable message instead of the TypeError json.loads(None) raises
        raise ValueError("The model returned a message without content; nothing to parse")
    # Chat models frequently wrap JSON in ```json ... ``` fences, which json.loads rejects
    return json.loads(_strip_code_fence(content))


In [None]:

def metadata_pipeline(start, end, raw_entities, batch_size=10):
    """Collect metadata for ``raw_entities`` in consecutive batches.

    Args:
        start: Index of the first entity to process.
        end: Exclusive end index of the first batch.
        raw_entities: Sequence of entity names; each batch is the slice
            ``raw_entities[start:end]``.
        batch_size: How far the window end advances after each batch.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        A flat list with the entity records of every batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the window would
            never advance).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total = len(raw_entities)
    metadata_json = []
    while start < total:
        print(start)
        batch = raw_entities[start:end]
        result = get_entity_metadata(batch, categories, dimensions)
        # The prompt asks for a bare JSON array, but the reply may also be
        # wrapped in an object with an "entities" key; accept both shapes
        records = result["entities"] if isinstance(result, dict) else result
        metadata_json.extend(records)
        # Slide the window forward, clamped to the end of the sequence
        start = end
        end = min(end + batch_size, total)
    return metadata_json

In [None]:
def relations_pipeline(start=0, end=2, entities=None, batch_size=2):
    """Extract relations for the entity list in consecutive batches.

    The previous version assigned ``start`` and ``end`` inside the function
    without initialising them, so Python treated them as locals and the loop
    condition raised ``UnboundLocalError``. It also never returned the result.

    Args:
        start: Index of the first entity to process.
        end: Exclusive end index of the first batch.
        entities: Sequence of entity names. When omitted, the notebook-level
            ``raw_entities`` is used, as in the original zero-argument version.
        batch_size: How far the window end advances after each batch.

    Returns:
        A flat list with the relations of every batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if entities is None:
        entities = raw_entities

    total = len(entities)
    relations_json = []
    start_time = time.time()
    while start < total:
        batch = entities[start:end]
        result = create_relations(batch, relation_labels)
        # Accept both a bare JSON array and an object wrapping it under "relations"
        if isinstance(result, dict) and "relations" in result:
            result = result["relations"]
        relations_json.extend(result)
        # Slide the window forward, clamped to the end of the sequence
        start = end
        end = min(end + batch_size, total)

    print("--- %s seconds ---" % (time.time() - start_time))
    return relations_json


# Set up the prompts

In [11]:
def get_entity_metadata(entities, category_list, dimension_list):
    """Ask the model for knowledge-graph metadata for a batch of entities.

    Args:
        entities: Entity names to describe; interpolated into the prompt.
        category_list: Allowed values for the 'Category' attribute.
        dimension_list: Allowed values for the 'Dimension' attribute.

    Returns:
        Whatever ``get_answer`` decodes from the model's JSON reply.
    """
    # The dimension list is introduced by a DIMENSIONS header so that it
    # matches the marker the task description refers to (it was previously
    # labelled CATEGORIES a second time).
    metadata_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH).
    You will be given an >>>>>EntityList<<<<<, a list of >>>>>CATEGORIES<<<<< and a list of >>>>>DIMENSIONS<<<<<.

   [Task]
   Your task is to create a metadata for each given entity so they can be added to knowledge graph and return a json array.
   You have to add the following attributes to each entity:
    1. 'Entity': The name of given entity.
    2. 'Description': containing summary of the entity.
    3. 'Category': from the given >>>>>CATEGORIES<<<<<.
    4. 'Tags': Multiple tags from a broader taxonomy suitable for entity.
    5. 'Dimension': from the given >>>>>DIMENSIONS<<<<<.
    6. 'Acronym': for the entity if it exists.
    7. 'Importance': Score between 1 to 4 to each entity based on its importance in context of sustainable energy, with 4 being the highest score.

    [Output Format]
    Return only a JSON array for entities with metadata.

    Now create a json array with metadata for the following entities:

    [Input]
    >>>>>EntityList<<<<<
    {entities}

    >>>>>CATEGORIES<<<<<
    {category_list}

    >>>>>DIMENSIONS<<<<<
    {dimension_list}


"""

    metadata = get_answer(metadata_prompt)
    return metadata


In [12]:
def create_relations(entities, relation_labels):
    """Ask the model to extract relations for a batch of entities.

    Args:
        entities: Entity names to build relations for; interpolated into the prompt.
        relation_labels: Allowed values for the 'Relation' attribute.

    Returns:
        Whatever ``get_answer`` decodes from the model's JSON reply. The
        decoded value is also printed.
    """
    # The 'Relation' attribute refers to the same >>>>>RelationLabels<<<<<
    # marker that delimits the label list in [Input] (it previously used a
    # differently spelled marker that appears nowhere else in the prompt).
    relation_extraction_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy.
    You will be given an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   Your task is to extract a set of 8-9 relations for each given entity. These relations will be added to the knowledge graph. Each relation will create following attributed:
    1. 'Subject': subject entity of the relation.
    2. 'Object': object entity of the relation.
    3. 'Relation': label from the given >>>>>RelationLabels<<<<<.
    4. 'Description': description of the relation.
    5. 'Importance': score between 1 to 4 to each relation based on its importance in context of sustainable energy, with 4 being the highest score.

    [Example]
    "Subject":"Energy Efficiency Policy",
    "Relation": "implements",
    "Object": "Building Standards",
    "Description": "Energy Efficiency Policy implements rigorous building standards that promote the use of energy-saving designs and materials.",
    "Importance": 4

    [Output Format]
    Return only a JSON array for relations.

    Now create a json array of relations for the following entities. Do not create more than 9 relations for each entity:

    [Input]
    >>>>>EntityList<<<<<
    {entities}

    >>>>>RelationLabels<<<<<
    {relation_labels}

"""

    relations = get_answer(relation_extraction_prompt)
    print(relations)
    return relations


In [28]:
def add_subgroups(relations, subgroups):
    """Ask the model to attach a 'Subgroup' attribute to each given relation.

    Args:
        relations: Relation records to annotate; interpolated into the prompt.
        subgroups: Subgroup names the model may choose from.

    Returns:
        Whatever ``get_answer`` decodes from the model's JSON reply. The
        decoded value is also printed.
    """
    prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy.
    You will be given a list of >>>>>RelationsList<<<<< and a list of >>>>>Subgroups<<<<<.

   [Task]
   Your task is to select a >>>>>Subgroups<<<<< to add a 'Subgroup' attribute to each relation in >>>>>RelationsList<<<<<.

    [Output Format]
    Return only a JSON array for 'relations'.

    Now assign subgroups to the following relations:

    [Input]
    >>>>>RelationsList<<<<<
    {relations}

    >>>>>Subgroups<<<<<
    {subgroups}

"""

    grouped = get_answer(prompt)
    print(grouped)
    return grouped


# Run the pipeline

In [None]:
# Read the entity list from the input CSV file
file_path = '../02_Input/00_Manual-KGs/Entity List.csv'
raw_data = pd.read_csv(file_path, delimiter=';')
raw_entities = raw_data['Entity'].str.strip()

# Read the entities that are already present in the output CSV file
file_path = '../03_Output/00_GPT KGs/Entities.csv'
data = pd.read_csv(file_path, delimiter=';')
entities = list(data['Entity'].str.strip())

# Keep only the entities that are not in the output file yet
known_entities = set(entities)
new_entities = [item for item in raw_entities if item not in known_entities]

# Call the pipeline function and keep its result: later cells read `metadata_json`,
# which was previously never assigned because the return value was discarded

start = 0
end = 10
metadata_json = metadata_pipeline(start, end, new_entities)


Check if there are new entities

Get Metadata for the entities

Create relations from the entity list

In [None]:

# Initial batch window (two entities wide) for the relation-extraction step
start, end = 0, 2

In [None]:
# NOTE(review): no visible cell assigns a notebook-level `relations_json`
# before this point, so confirm it is populated before running this cell.
print (len(relations_json))

# Fuzzy matching to identify new object entities

In [None]:
# Build a DataFrame from the collected relations
relations_df = pd.DataFrame(relations_json)

In [None]:
# Writing the entities to json file
with open('../03_Output/00_GPT KGs/Entity_Metadata.json', "w") as output_file:
    output_file.write(json_entities)
    output_file.close()

In [None]:
# Maps a known entity name to the relation objects that fuzzy-match it
synonyms = {}

for obj in objects:
    obj_lower = obj.lower()
    if obj_lower in entities:
        # The lower-cased object is already a known entity; nothing to record
        continue

    # Similarity of this object against every lower-cased entity name
    ratio_list = [fuzz.ratio(obj_lower, entity) for entity in entities]
    score = max(ratio_list)
    # First position of the best score; ent_lst holds the original-case names
    index = ratio_list.index(score)
    match = ent_lst[index]

    if score > 70:
        matched_objects = synonyms.setdefault(match, [])
        if obj not in matched_objects:
            matched_objects.append(obj)

print(synonyms)


In [None]:
# Look up the synonyms of every entity; entities without any get an empty string
entity_data['Synonyms'] = entity_data['Entity'].map(synonyms).fillna("")

In [None]:
# Capitalize keys to ensure consistency
json_entities = json.dumps(metadata_json, indent = 2)

In [None]:
# Write the entities to the CSV file
metadata_df = pd.DataFrame(metadata_json)

metadata_df['Tags'] = metadata_df['Tags'].apply(lambda x: '; '.join(x))
metadata_df['Synonyms'] = metadata_df['Synonyms'].apply(lambda x: '; '.join(x))

final_df = pd.merge(raw_entities, metadata_df, on='Entity', how='left')
final_df.fillna('', inplace=True)
final_df.to_csv('../03_Output/00_GPT KGs/Entity_Metadata.csv', sep=',', index=False)

In [None]:
# Capitalize keys to ensure consistency
json_relations = json.dumps(capitalized_relations, indent=2)
# Writing the entities to json file
with open('../03_Output/00_GPT KGs/Relations.json', "w") as output_file:
    output_file.write(json_relations)
    output_file.close()

In [None]:
# Key capitalization is disabled in this cell (line kept commented out)
#capitalized_relations = capitalize_keys(relations_json)

# Replace missing values with empty strings in place, then export the
# relations as a comma-separated CSV without the index column
relations_df.fillna('', inplace=True)
relations_df.to_csv('../03_Output/00_GPT KGs/Relations.csv', sep=',', index=False)


In [None]:
# Reload the exported entity metadata. The export cell writes this file with
# sep=',', so it has to be read back with the same separator (reading it
# with ';' leaves no usable 'Entity' column).
file_path = '../03_Output/00_GPT KGs/Entity_Metadata.csv'
entity_data = pd.read_csv(file_path, sep=',')
ent_lst = list(entity_data['Entity'].str.strip())
# Lower-cased copies of the names, index-aligned with ent_lst
entities = [x.lower() for x in ent_lst]


# Add Subgroups to the relations

In [14]:
# Writing the entities to json file
with open('../03_Output/00_GPT KGs/Relations.json', "r") as file:
    data = file.read()
    relations = json.loads(data)
    file.close()

In [29]:
# Accumulator for the annotated relations, plus the first batch window
# (five relations wide) for the subgroup loop
relations_with_groups = []
start, end = 0, 5

Add Subgroups for the existing relations

In [39]:

# Annotate the relations batch by batch, continuing from the current
# start/end window
start_time = time.time()
total_relations = len(relations)
while start < total_relations:
    ls = relations[start:end]
    result = add_subgroups(ls, subgroups)
    print(result)
    # Slide the window forward by five, clamped to the end of the list
    start = end
    end = min(end + 5, total_relations)
    relations_with_groups.extend(result['relations'])

print("--- %s seconds ---" % (time.time() - start_time))



{'relations': [{'Subject': 'Load forecasting and demand-side management', 'Relation': 'monitors', 'Object': 'Energy Consumption', 'Description': 'Load forecasting and demand-side management monitors energy consumption to identify trends and patterns.', 'Importance': 2, 'Subgroup': 'Energy Efficiency'}, {'Subject': 'Load forecasting and demand-side management', 'Relation': 'targets', 'Object': 'Energy Efficiency', 'Description': 'Load forecasting and demand-side management targets energy efficiency by optimizing energy usage and reducing waste.', 'Importance': 4, 'Subgroup': 'Energy Efficiency'}, {'Subject': 'Load forecasting and demand-side management', 'Relation': 'addresses', 'Object': 'Peak Demand', 'Description': 'Load forecasting and demand-side management addresses peak demand by managing and reducing energy usage during high-demand periods.', 'Importance': 3, 'Subgroup': 'Energy Efficiency'}, {'Subject': 'multilateral development banks', 'Relation': 'implements', 'Object': 'Ener

In [40]:
# Inspect the container type of the accumulated result
type(relations_with_groups)

list

In [9]:
# Serialize the relations that now carry a 'Subgroup' attribute.
# The previous version dumped `json_relations` into itself; on a top-to-bottom
# run that name holds the JSON *string* written to Relations.json, so the file
# ended up double-encoded and the annotated relations were never saved.
json_relations = json.dumps(relations_with_groups, indent = 2)
with open('../03_Output/00_GPT KGs/Relations_.json', "w") as output_file:
    output_file.write(json_relations)

In [5]:
# Read the relations file back from disk and decode it.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('../03_Output/00_GPT KGs/Relations_.json', "r") as file:
    rel = file.read()
json_relations = json.loads(rel)

In [8]:
# Inspect the type of the object decoded from Relations_.json
type(json_relations)

list