In [None]:
import pandas as pd
import os
from openai import AzureOpenAI
import openai
import json
import requests
from dotenv import load_dotenv


In [None]:
# Populate os.environ from a .env file so the os.getenv(...) calls in the
# following cells can read the Azure OpenAI settings.
load_dotenv()

In [None]:
# Settings that must be present in the environment (normally supplied via .env).
_REQUIRED_ENV_VARS = ("api_version", "AZURE_OPENAI_ENDPOINT", "api_key_azure")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    # Name the missing settings up front; assigning None into os.environ would
    # otherwise fail with the opaque "TypeError: str expected, not NoneType".
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Mirror the project-specific variable names onto the names the OpenAI SDK reads.
# AZURE_OPENAI_ENDPOINT already carries the SDK's name, so it needs no copy.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = os.getenv("api_version")
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("api_key_azure")

# OpenAI API configuration (module-level settings used by openai.chat.completions)
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
# `api_base` is the attribute name from the pre-1.0 SDK; it is kept for
# compatibility, while `azure_endpoint` is the name used by the 1.x SDK.
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Name of the Azure chat deployment passed as `model` in get_answer.
openai_deployment = "sdgi-gpt-35-turbo-16k"

In [None]:
# Explicit Azure OpenAI client built from the same environment settings that
# configure the module-level `openai` object.
client = AzureOpenAI(
    api_key = os.getenv("api_key_azure"),
    api_version = os.getenv("api_version"),
    azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Value of USER_QUERY_EMBEDDING_ENGINE from the environment. It is not
# referenced by the cells visible here — presumably an embedding deployment
# name used elsewhere (TODO confirm).
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE")


In [None]:
# Allowed values for an entity's Category; passed to get_entity_metadata and
# interpolated into the metadata prompt.
categories = [
    "Person",
    "Location",
    "Organization",
    "Event",
    "Product",
    "Project",
    "Skill",
    "Strategy",
    "Technology",
    "Crisis",
    "Infrastructure"
]

In [None]:
# Allowed values for an entity's Dimension; passed to get_entity_metadata and
# interpolated into the metadata prompt.
dimensions = [
    "Policy",
    "Technology",
    "Social",
    "Economic",
    "Finance",
    "Intersectionality"
]

In [69]:
# Relation vocabulary passed to create_relations and interpolated into the
# relation-extraction prompt.
relation_labels = [
    "implements",
    "funds",
    "focuses_on",
    "in",
    "partners_with",
    "contributes_to",
    "monitors",
    "targets",
    "addresses",
    "employs",
    "collaborates_with",
    "supports",
    "administers",
    "measures",
    "aligns_with",
    "an_instance_of"
]

In [None]:
# Specify the path to your CSV file
# The path is built from the current working directory, so the notebook has to
# be started from the folder that contains `Data/`.
file_path = os.getcwd() + '/Data/Energy KG.csv'
# Semicolon-delimited file; later cells read its 'Entity' column.
data = pd.read_csv(file_path, delimiter=';')

In [None]:
def capitalize_keys(data):
    """Return a new list of dicts whose keys went through ``str.capitalize``.

    Args:
        data: Iterable of dicts with string keys.

    Returns:
        list[dict]: One new dict per input dict, in the same order. Values are
        carried over untouched and the input dicts are not modified. Note that
        ``str.capitalize`` also lower-cases everything after the first
        character ("TAGS" becomes "Tags").
    """
    return [
        {name.capitalize(): val for name, val in record.items()}
        for record in data
    ]

In [None]:
def get_answer(prompt):
    """Send ``prompt`` as a single user message and return the reply parsed as JSON.

    Args:
        prompt: Full prompt text, sent as the only message with role "user".

    Returns:
        The JSON-decoded reply (a list or dict, depending on what the model
        produced).

    Raises:
        ValueError: If the reply has no text content.
        json.JSONDecodeError: If the reply is not valid JSON even after an
            enclosing Markdown code fence has been removed.
    """
    response_entities = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,  # lowest temperature setting, to keep replies as repeatable as possible
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    response = response_entities.choices[0].message.content
    print (response)
    if response is None:
        raise ValueError("The model returned a reply without text content.")

    # Chat models often wrap JSON in a Markdown fence (```json ... ```), which
    # json.loads rejects. Drop the opening fence line and the closing fence.
    payload = response.strip()
    if payload.startswith("```"):
        first_newline = payload.find("\n")
        payload = payload[first_newline + 1:] if first_newline != -1 else payload[3:]
        payload = payload.rstrip()
        if payload.endswith("```"):
            payload = payload[:-3]
    return json.loads(payload)


In [None]:
def get_entity_metadata(entities, category_list, dimension_list):
    """Ask the model for knowledge-graph metadata for one batch of entities.

    Args:
        entities: Entity names for this batch: any iterable of names (for
            example a pandas Series slice) or a single pre-formatted string.
        category_list: Allowed values for the Category attribute.
        dimension_list: Allowed values for the Dimension attribute.

    Returns:
        dict: The parsed model reply. The prompt asks for a JSON array, so a
        bare array reply is wrapped as ``{"entities": [...]}``; a dict reply is
        returned unchanged.
    """
    # Interpolate a plain list rather than the object's own repr: a pandas
    # Series repr adds index labels and a dtype footer and can truncate long
    # names, which then no longer match the source names.
    entity_list = entities if isinstance(entities, str) else list(entities)
    metadata_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH).
    You will be given an >>>>>EntityList<<<<<, a list of >>>>>CATEGORIES<<<<< and a list of >>>>>DIMENSIONS<<<<<.

   [Task]
   Your task is to create a metadata for each given entity so they can be added to knowledge graph and return a json array.
   You have to add the following attributes to each entity:
    1. Description that contains summary.
    2. Category from the given >>>>>CATEGORIES<<<<<.
    3. Multiple Tags from a broader taxonomy suitable for entity.
    4. Dimension from the given >>>>>DIMENSIONS<<<<<.
    5. Acronym, if it exists.
    6. Assign an Importance score between 1 to 10 to each entity based on its importance in context of sustainable energy, with 10 being the highest score.

    [Output Format]
    Return only a JSON array for entities with metadata.

    Now create a json array with metadata for the following entities:

    [Input]
    >>>>>EntityList<<<<<
    {entity_list}

    >>>>>CATEGORIES<<<<<
    {category_list}

    >>>>>DIMENSIONS<<<<<
    {dimension_list}


"""

    metadata = get_answer(metadata_prompt)
    # Callers read result["entities"], so give a bare array that shape.
    if isinstance(metadata, list):
        metadata = {"entities": metadata}
    return metadata


In [None]:
# Initial batching state: the list that accumulates metadata records, and a
# [start, end) slice window covering the first 5 entities.
metadata_json = []
start = 0
end = 5

In [None]:
# Walk the 'Entity' column in windows of 5, continuing from the current
# start/end cursor. The loop is driven by `start`, so the final (possibly
# shorter) window is processed too, as is a dataset of 5 rows or fewer.
while start < len(data):
    ls = data['Entity'][start:end]
    result = get_entity_metadata(ls, categories, dimensions)
    # The prompt asks for a JSON array, but the model may also wrap it in an
    # {"entities": [...]} object; accept both shapes.
    batch_records = result["entities"] if isinstance(result, dict) else result
    metadata_json.extend(batch_records)
    # Advance only after the batch has been stored, so that re-running this
    # cell after a failed request retries the same window.
    start = end
    end = min(end + 5, len(data))


In [None]:
# Print the accumulated metadata records for a visual check.
print (metadata_json)

In [None]:
# Number of metadata records collected so far.
print (len(metadata_json))

In [None]:
# Normalise key casing (e.g. "entity" -> "Entity") so the records expose the
# 'Entity' and 'Tags' columns that the export cell relies on.
capitalized_data = capitalize_keys(metadata_json)

In [None]:
metadata_df = pd.DataFrame(capitalized_data)

# Tags is not guaranteed to be a list for every record. Join real sequences,
# keep an existing string as it is (str.join would split it into single
# characters), and turn missing values into '' instead of raising on NaN/None.
metadata_df['Tags'] = metadata_df['Tags'].apply(
    lambda tags: ', '.join(map(str, tags)) if isinstance(tags, (list, tuple))
    else tags if isinstance(tags, str)
    else ''
)
# The left merge keeps every entity from the source file, including those the
# model returned no metadata for.
final_df = pd.merge(data['Entity'], metadata_df, on='Entity', how='left')

# Blank out the gaps left by unmatched entities before exporting.
final_df = final_df.fillna('')
final_df.to_csv('Entity Metadata.csv', sep=';', index=False)

In [76]:
def create_relations(entities, relation_labels):
    """Ask the model for knowledge-graph relations for one batch of entities.

    Args:
        entities: Entity names for this batch: any iterable of names (for
            example a pandas Series slice) or a single pre-formatted string.
        relation_labels: Allowed values for the 'Relation' attribute.

    Returns:
        The parsed model reply as returned by get_answer; the prompt asks for
        a JSON array of relation objects.
    """
    # Interpolate a plain list rather than the object's own repr: a pandas
    # Series repr adds index labels and a dtype footer and can truncate long
    # names.
    entity_list = entities if isinstance(entities, str) else list(entities)
    relation_extraction_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy
    You will be given an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   Your task is to extract a set of relations for each given entity. These relations will be added to the knowledge graph. Each relation will create following attributes:
    1. 'Subject': subject entity of the relation.
    2. 'Object': object entity of the relation.
    3. 'Relation': label from the given >>>>>RelationLabels<<<<<.
    4. 'Description': description of the relation.
    5. 'Importance': score between 1 to 10, with 10 being the highest score.

    [Example]
    "Subject":"Energy Efficiency Policy",
    "Relation": "implements",
    "Object": "Building Standards",
    "Description": "Energy Efficiency Policy implements rigorous building standards that promote the use of energy-saving designs and materials.",
    "Importance": 7

    [Output Format]
    Return only a JSON array for relations.

    Now create a json array of relations for the following entities:

    [Input]
    >>>>>EntityList<<<<<
    {entity_list}

    >>>>>RelationLabels<<<<<
    {relation_labels}

"""

    relations = get_answer(relation_extraction_prompt)
    print(relations)
    return relations


In [77]:
# Entities sent per request, and a cap on how many entities are processed in
# total. The cap of 2 keeps this cell a small sample run; set it to None to
# extract relations for every entity in `data`.
RELATION_BATCH_SIZE = 2
RELATION_ENTITY_LIMIT = 2

relations_json = []
entity_count = len(data) if RELATION_ENTITY_LIMIT is None else min(RELATION_ENTITY_LIMIT, len(data))
# range() bounds the loop by the data itself, so it always terminates, even
# when `data` holds only a handful of rows.
for start in range(0, entity_count, RELATION_BATCH_SIZE):
    end = min(start + RELATION_BATCH_SIZE, entity_count)
    ls = data['Entity'][start:end]
    result = create_relations(ls, relation_labels)
    relations_json.extend(result)


[
    {
        "Subject": "digital cities",
        "Relation": "implements",
        "Object": "Building Standards",
        "Description": "Digital cities implement building standards that promote the use of technology for sustainable and efficient urban development.",
        "Importance": 8
    },
    {
        "Subject": "digital cities",
        "Relation": "funds",
        "Object": "Smart Grid Projects",
        "Description": "Digital cities provide funding for smart grid projects to enhance energy efficiency and grid reliability.",
        "Importance": 6
    },
    {
        "Subject": "digital cities",
        "Relation": "focuses_on",
        "Object": "Renewable Energy Integration",
        "Description": "Digital cities focus on integrating renewable energy sources into their energy systems to reduce carbon emissions.",
        "Importance": 9
    },
    {
        "Subject": "digital cities",
        "Relation": "in",
        "Object": "Energy Transition Initiative",
  

In [78]:
# Print the accumulated relation records for a visual check.
print (relations_json)


[{'Subject': 'digital cities', 'Relation': 'implements', 'Object': 'Building Standards', 'Description': 'Digital cities implement building standards that promote the use of technology for sustainable and efficient urban development.', 'Importance': 8}, {'Subject': 'digital cities', 'Relation': 'funds', 'Object': 'Smart Grid Projects', 'Description': 'Digital cities provide funding for smart grid projects to enhance energy efficiency and grid reliability.', 'Importance': 6}, {'Subject': 'digital cities', 'Relation': 'focuses_on', 'Object': 'Renewable Energy Integration', 'Description': 'Digital cities focus on integrating renewable energy sources into their energy systems to reduce carbon emissions.', 'Importance': 9}, {'Subject': 'digital cities', 'Relation': 'in', 'Object': 'Energy Transition Initiative', 'Description': 'Digital cities participate in the Energy Transition Initiative to accelerate the adoption of sustainable energy practices.', 'Importance': 7}, {'Subject': 'digital ci

In [79]:
# Normalise key casing, tabulate the relation records, blank out missing
# values, and export them as a semicolon-separated file.
capitalized_relations = capitalize_keys(relations_json)
relations_df = pd.DataFrame(capitalized_relations).fillna('')
relations_df.to_csv('Relations.csv', sep=';', index=False)
