In [2]:
import pandas as pd
import os
from openai import AzureOpenAI
import openai
import json
import requests
from dotenv import load_dotenv


# Set up the variables

In [3]:
# Load variables from a .env file into the process environment; the cells below read them via os.getenv.
load_dotenv()

True

In [4]:
# Azure OpenAI settings are read from the environment under these three names.
# They are validated up front: assigning None into os.environ raises an opaque
# "str expected, not NoneType" TypeError that does not say which variable is missing.
_azure_settings = {
    "api_version": os.getenv("api_version"),
    "AZURE_OPENAI_ENDPOINT": os.getenv("AZURE_OPENAI_ENDPOINT"),
    "api_key_azure": os.getenv("api_key_azure"),
}
_missing_settings = [name for name, value in _azure_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Re-export the settings under the names the OpenAI SDK looks for.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = _azure_settings["api_version"]
os.environ["AZURE_OPENAI_ENDPOINT"] = _azure_settings["AZURE_OPENAI_ENDPOINT"]
os.environ["AZURE_OPENAI_API_KEY"] = _azure_settings["api_key_azure"]

# OpenAI API configuration (module-level client used by openai.chat.completions.create)
openai.api_type = "azure"
openai.api_key = _azure_settings["api_key_azure"]
# NOTE(review): `api_base` is the pre-1.0 attribute name; with openai>=1.0 the endpoint is
# presumably picked up from AZURE_OPENAI_ENDPOINT set above — confirm against the installed SDK.
openai.api_base = _azure_settings["AZURE_OPENAI_ENDPOINT"]
openai.api_version = _azure_settings["api_version"]
# Name of the Azure chat deployment passed as `model` in chat completion calls.
openai_deployment = "sdgi-gpt-35-turbo-16k"

In [5]:
# Explicit Azure OpenAI client, configured from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("api_key_azure"),
    api_version=os.getenv("api_version"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Embedding engine name, taken from the USER_QUERY_EMBEDDING_ENGINE variable.
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE")


# Define Labels for KG

In [6]:
# Entity categories offered to the model when it assigns a 'Category' to each entity.
categories = [
    "Person", "Location", "Organization",
    "Event", "Product", "Project",
    "Skill", "Strategy", "Technology",
    "Crisis", "Infrastructure",
]

In [7]:
# Dimensions offered to the model when it assigns a 'Dimension' to each entity.
dimensions = [
    "Policy", "Technology", "Social",
    "Economic", "Finance", "Intersectionality",
]

In [8]:
# Closed vocabulary of edge labels handed to the relation-extraction prompt.
relation_labels = [
    "implements", "funds", "focuses_on", "in",
    "partners_with", "contributes_to", "monitors", "targets",
    "addresses", "employs", "collaborates_with", "supports",
    "administers", "measures", "aligns_with", "an_instance_of",
]

In [9]:
def capitalize_keys(data):
    """Return copies of the given dicts with every key run through ``str.capitalize``.

    Args:
        data: Iterable of dicts with string keys.

    Returns:
        A new list of new dicts; values are carried over unchanged and the
        input dicts are not modified. Note that ``str.capitalize`` upper-cases
        the first character and lower-cases the rest (``"TAGS"`` -> ``"Tags"``).
    """
    return [
        {name.capitalize(): payload for name, payload in record.items()}
        for record in data
    ]

In [10]:
def get_answer(prompt):
    """Send a single-turn prompt to the chat deployment and parse the reply as JSON.

    Args:
        prompt: Full prompt text, sent as one user message.

    Returns:
        The decoded JSON value (list or dict, depending on what the model returned).

    Raises:
        ValueError: If the model returned no text content.
        json.JSONDecodeError: If the reply is not valid JSON (a ValueError subclass).
    """
    response_entities = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,  # keep the output as repeatable as the service allows
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    response = response_entities.choices[0].message.content
    print (response)

    # An absent or blank reply would otherwise surface as an opaque json.loads failure.
    if response is None or not response.strip():
        raise ValueError("Model returned an empty response; nothing to parse as JSON.")

    payload = response.strip()
    # Chat models sometimes wrap JSON in a Markdown fence (``` or ```json); unwrap it before parsing.
    if payload.startswith("```"):
        payload = payload.split("\n", 1)[1] if "\n" in payload else ""
        payload = payload.rstrip()
        if payload.endswith("```"):
            payload = payload[:-3]

    return json.loads(payload)


# Set up the prompts

In [11]:
def get_entity_metadata(entities, category_list, dimension_list):
    """Ask the model for knowledge-graph metadata for a batch of entities.

    The three arguments are interpolated verbatim into the prompt, so whatever
    their string representation is becomes part of the model input.

    Args:
        entities: Entity names to describe (rendered under the EntityList header).
        category_list: Allowed values for 'Category' (rendered under CATEGORIES).
        dimension_list: Allowed values for 'Dimension' (rendered under DIMENSIONS).

    Returns:
        The parsed JSON value produced by ``get_answer`` for this prompt.
    """
    # The dimension list is labelled >>>>>DIMENSIONS<<<<< so it matches the header named in
    # the [Context] and [Task] sections; it was previously emitted under a second
    # >>>>>CATEGORIES<<<<< header, leaving the DIMENSIONS reference dangling.
    metadata_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH).
    You will be given an >>>>>EntityList<<<<<, a list of >>>>>CATEGORIES<<<<< and a list of >>>>>DIMENSIONS<<<<<.

   [Task]
   Your task is to create a metadata for each given entity so they can be added to knowledge graph and return a json array.
   You have to add the following attributes to each entity:
    1. 'Entity': The name of given entity.
    2. 'Description': containing summary of the entity.
    3. 'Category': from the given >>>>>CATEGORIES<<<<<.
    4. 'Tags': Multiple tags from a broader taxonomy suitable for entity.
    5. 'Dimension': from the given >>>>>DIMENSIONS<<<<<.
    6. 'Acronym': for the entity if it exists.
    7. 'Importance': Score between 1 to 4 to each entity based on its importance in context fo sustainable energy, with 4 being the highest score.

    [Output Format]
    Return only a JSON array for entities with metadata.

    Now create a json array with metadata for the following entities:

    [Input]
    >>>>>EntityList<<<<<
    {entities}

    >>>>>CATEGORIES<<<<<
    {category_list}

    >>>>>DIMENSIONS<<<<<
    {dimension_list}


"""

    metadata = get_answer(metadata_prompt)
    return metadata


In [18]:
def create_relations(entities, relation_labels):
    """Ask the model to propose knowledge-graph relations for a batch of entities.

    Both arguments are interpolated verbatim into the prompt.

    Args:
        entities: Entity names to build relations for (rendered under EntityList).
        relation_labels: Allowed relation labels (rendered under RelationLabels).

    Returns:
        The parsed JSON value produced by ``get_answer``; it is also printed.
    """
    # The [Task] section refers to the label list as >>>>>RelationLabels<<<<<, the same marker
    # used in [Context] and [Input]; it previously said >>>>>RELATION LABELS<<<<<, a header
    # that never appears in the prompt.
    relation_extraction_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy
    You will be given an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   Your task is to extract a set of 8-9 relations for each given entity. These relations will be added to the knowledge graph. Each relation will create following attributed:
    1. 'Subject': subject entity of the relation.
    2. 'Object': object entity of the relation.
    3. 'Relation': label from the given >>>>>RelationLabels<<<<<.
    4. 'Description': description of the relation.
    5. 'Importance': score between 1 to 4, with 4 being the highest score.

    [Example]
    "Subject":"Energy Efficiency Policy",
    "Relation": "implements",
    "Object": "Building Standards",
    "Description": "Energy Efficiency Policy implements rigorous building standards that promote the use of energy-saving designs and materials.",
    "Importance": 4

    [Output Format]
    Return only a JSON array for relations.

    Now create a json array of relations for the following entities. Do not create more than 9 relations for each entity:

    [Input]
    >>>>>EntityList<<<<<
    {entities}

    >>>>>RelationLabels<<<<<
    {relation_labels}

"""

    relations = get_answer(relation_extraction_prompt)
    print(relations)
    return relations


# Run the pipeline

In [13]:
# Load the semicolon-separated entity sheet from <cwd>/Data and keep the
# whitespace-trimmed 'Entity' column as the list of names to process.
file_path = f"{os.getcwd()}/Data/Energy KG.csv"
data = pd.read_csv(file_path, sep=';')
raw_entities = data['Entity'].str.strip()


Get Metadata for the entities

In [105]:
# Accumulator for entity metadata plus the [start, end) window of the first batch of 10.
metadata_json = list()
start, end = 0, 10

In [116]:
# Walk the entity list in windows of 10, collecting the model's metadata for each window.
# `start`/`end` persist across runs of this cell, so re-running it resumes where it stopped.
while start < len(raw_entities):
    ls = raw_entities.iloc[start:end]
    result = get_entity_metadata(ls, categories, dimensions)
    # The prompt asks for a bare JSON array, but the model may wrap it as {"entities": [...]};
    # accept either shape.
    batch = result["entities"] if isinstance(result, dict) else result
    metadata_json.extend(batch)
    # Advance the window only after the batch is stored, so a failed or malformed reply
    # leaves the cursor in place and the same batch is retried on the next run.
    start = end
    end = min(end + 10, len(raw_entities))


{
    "entities": [
        {
            "Entity": "passive solar design",
            "Description": "A design approach that utilizes the sun's energy to provide heating, cooling, and lighting for buildings.",
            "Category": "Technology",
            "Tags": ["Renewable Energy", "Building Design", "Energy Efficiency"],
            "Dimension": "Energy",
            "Acronym": null,
            "Importance": 3
        },
        {
            "Entity": "vertical funding",
            "Description": "A funding mechanism that focuses on specific sectors or industries.",
            "Category": "Finance",
            "Tags": ["Investment", "Funding", "Sector-specific"],
            "Dimension": "Economic",
            "Acronym": null,
            "Importance": 2
        },
        {
            "Entity": "energy project cycle",
            "Description": "The stages involved in the development and implementation of an energy project.",
            "Category": "Technology",
     

In [117]:
# Dump everything collected so far for a visual check.
print(metadata_json)



In [118]:
# Number of metadata records collected.
print(len(metadata_json))

203


In [127]:
# Normalise key casing across records (the model is not guaranteed to use identical
# casing in every batch), then serialise with a 2-space indent.
capitalized_metadata = capitalize_keys(metadata_json)
json_entities = json.dumps(capitalized_metadata, indent=2)

In [124]:
# Write the entity metadata to a JSON file.
# The with-block closes the file on exit, so no explicit close() call is needed.
with open('Data/Entity_Metadata.json', "w") as output_file:
    output_file.write(json_entities)

In [128]:
# Write the entities to the CSV file
metadata_df = pd.DataFrame(capitalized_metadata)

# Tags normally arrive as a list of strings and are flattened to "a, b, c".
# Any other value (missing, or an already-joined string) is passed through untouched:
# joining a missing value raises TypeError, and joining a plain string would split it
# into single characters.
metadata_df['Tags'] = metadata_df['Tags'].apply(
    lambda tags: ', '.join(map(str, tags)) if isinstance(tags, (list, tuple)) else tags
)
# Left merge keeps one row per input entity even when the model returned nothing for it.
final_df = pd.merge(raw_entities, metadata_df, on='Entity', how='left')

# Blank out missing values so the CSV has empty cells rather than "NaN".
final_df = final_df.fillna('')
final_df.to_csv('Data/Entity_Metadata.csv', sep=';', index=False)

Create relations from the entity list

In [19]:
# Accumulator for extracted relations plus the [start, end) window of the first batch of 2.
relations_json = list()
start, end = 0, 2


In [36]:
import time

In [45]:
# Display the current batch cursor, i.e. the position from which the relation loop will continue.
start

150

In [46]:
# Walk the entity list two entities at a time, collecting the relations proposed by the model.
# `start`/`end` persist across runs of this cell, so re-running it resumes where it stopped.
start_time = time.time()
while start < len(raw_entities):
    ls = raw_entities.iloc[start:end]
    result = create_relations(ls, relation_labels)
    if isinstance(result, dict):
        # extend() on a dict would silently append its keys. Treat a dict with exactly one
        # list-valued field as a wrapper around the array (e.g. {"relations": [...]}) and
        # anything else as a single relation object.
        list_values = [value for value in result.values() if isinstance(value, list)]
        result = list_values[0] if len(list_values) == 1 else [result]
    relations_json.extend(result)
    # Advance the window only after the batch is stored, so a failed or malformed reply
    # leaves the cursor in place and the same batch is retried on the next run.
    start = end
    end = min(end + 2, len(raw_entities))

print("--- %s seconds ---" % (time.time() - start_time))


[
    {
        "Subject": "cultural impact assessments",
        "Relation": "implements",
        "Object": "Building Standards",
        "Description": "Cultural impact assessments implement building standards that consider the cultural significance of a project.",
        "Importance": 4
    },
    {
        "Subject": "cultural impact assessments",
        "Relation": "funds",
        "Object": "Community Engagement Programs",
        "Description": "Cultural impact assessments provide funding for community engagement programs to ensure the inclusion of diverse cultural perspectives.",
        "Importance": 3
    },
    {
        "Subject": "cultural impact assessments",
        "Relation": "focuses_on",
        "Object": "Indigenous Communities",
        "Description": "Cultural impact assessments focus on understanding and addressing the impacts of projects on indigenous communities.",
        "Importance": 4
    },
    {
        "Subject": "cultural impact assessments",
       

In [47]:
# Number of relation records collected.
print(len(relations_json))

1440


In [50]:
# Capitalize keys to ensure consistency
capitalized_relations = capitalize_keys(relations_json)
json_relations = json.dumps(capitalized_relations, indent=2)
# Write the relations to a JSON file.
# The with-block closes the file on exit, so no explicit close() call is needed.
with open('Data/Relations.json', "w") as output_file:
    output_file.write(json_relations)


In [51]:
# Export the relations as a semicolon-separated CSV, with missing values written as empty cells.
capitalized_relations = capitalize_keys(relations_json)
relations_df = pd.DataFrame(capitalized_relations)

relations_df = relations_df.fillna('')
relations_df.to_csv('Data/Relations.csv', index=False, sep=';')
