# This notebook extracts dataframes of entities from the documents

In [190]:
import pandas as pd
import re
import os
from openai import AzureOpenAI
import openai
import json
from dotenv import load_dotenv
import csv
import nltk
import datetime

In [110]:
load_dotenv()
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = os.getenv("api_version")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("api_key_azure")

# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"
client = AzureOpenAI(
    api_key=os.getenv("api_key_azure"),
    api_version=os.getenv("api_version"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

In [178]:
relation_labels = [
    "implements",
    "funds",
    "focuses_on",
    "in",
    "partners_with",
    "contributes_to",
    "monitors",
    "targets",
    "addresses",
    "employs",
    "collaborates_with",
    "supports",
    "administers",
    "measures",
    "aligns_with",
    "an_instance_of"
]

In [111]:
def get_answer(prompt):
    """
    Generate a response based on the given prompt using the OpenAI chat completion model.

    Args:
        prompt (str): The input prompt to generate the response.

    Returns:
        dict: The response from the OpenAI model parsed as a JSON object.
    """
    response_entities = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    response = response_entities.choices[0].message.content
    #print (response)
    response = json.loads(response)
    return response


In [55]:
ent_path = '../03_Output/00_GPT KGs/Entities.csv'
entities_data = pd.read_csv(ent_path, delimiter=';')

In [56]:
entities = list(entities_data['Entity'])

In [7]:
def read_files(api_folder_path):
    documents = []
    for filename in os.listdir(api_folder_path):
        file_path = os.path.join(api_folder_path, filename)
        with open(os.path.join(file_path), 'r') as file:
            data = file.read()
        documents.append((filename, data))
    return documents


In [19]:

def extract_entities_from_documents(documents, entities):
    """
    Extracts entities from documents and creates a DataFrame with the following columns:
    entity, doc_id, character_index, large_extract.

    Parameters:
    documents (list of tuples): List of tuples where each tuple contains (doc_id, document_text)
    entities (list of str): List of entities to search for in the documents

    Returns:
    pd.DataFrame: DataFrame with the extracted entity information.
    """

    # Create a regex pattern from the list of entities
    entity_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, entities)) + r')\b', re.IGNORECASE)

    data = []

    for doc_id, text in documents:
        for match in entity_pattern.finditer(text):
            entity = match.group()
            char_index = match.start()
            extract_start = max(char_index - 50, 0)
            extract_end = min(char_index + 50, len(text))
            large_extract = text[extract_start:extract_end]

            data.append({
                'entity': entity,
                'doc_id': doc_id,
                'character_index': char_index,
                'large_extract': large_extract
            })

    df = pd.DataFrame(data)
    return df


In [129]:

# Download the necessary nltk data
nltk.download('punkt')

def extract_entities_from_documents(documents, entities):
    """
    Extracts entities from documents and creates a DataFrame with the following columns:
    entity, doc_id, character_index, large_extract.

    Parameters:
    documents (list of tuples): List of tuples where each tuple contains (doc_id, document_text)
    entities (list of str): List of entities to search for in the documents

    Returns:
    pd.DataFrame: DataFrame with the extracted entity information.
    """

    # Create a regex pattern from the list of entities
    entity_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, entities)) + r')\b', re.IGNORECASE)

    data = []

    for doc_id, text in documents:
        # Tokenize the text into sentences
        sentences = nltk.sent_tokenize(text)

        for match in entity_pattern.finditer(text):
            entity = match.group()
            char_index = match.start()

            # Find the sentence containing the entity and its context
            large_extract = None
            for i, sentence in enumerate(sentences):
                if entity in sentence:
                    start_index = max(i - 4, 0)
                    end_index = min(i + 5, len(sentences))
                    large_extract = " ".join(sentences[start_index:end_index])
                    break

            if large_extract:
                data.append({
                    'entity': entity.lower(),
                    'doc_id': doc_id,
                    'character_index': char_index,
                    'large_extract': large_extract
                })

    df = pd.DataFrame(data)
    df = df.drop_duplicates(subset=['entity', 'doc_id', 'large_extract'])
    df.reset_index(drop=True, inplace=True)
    return df

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [119]:
def group_entities_with_details(extracts_df, entities_df):
    """
    Groups rows by entities and returns a dictionary with detailed information.

    Parameters:
    extracts_df (pd.DataFrame): DataFrame containing the columns 'entity', 'doc_id', 'character_index', and 'large_extract'.
    details_df (pd.DataFrame): DataFrame containing additional details about entities with at least an 'Entity' column.

    Returns:
    dict: Dictionary with entity names as keys and details including extracts as values.
    """
    # Initialize an empty dictionary to store the entity details
    entity_dict = {}

    # Iterate through each unique entity in the extracts DataFrame
    for entity, group in extracts_df.groupby('entity'):
        # Initialize the dictionary entry for the entity
        entity_dict[entity] = {
            'name': entity,
            'extracts': group['large_extract'].tolist()
        }

        # Extract matching rows from the details DataFrame
        matching_rows = entities_df[entities_df['Entity'].str.lower() == entity.lower()]

        # Add additional details from the matching rows to the dictionary
        for col in ['Category','Tags', 'Dimensions']:
            if col in matching_rows.columns:
                entity_dict[entity][col] = matching_rows[col].tolist()
    return entity_dict

In [130]:
def group_entities_with_details(extracts_df, entities_df):
    """
    Groups rows by entities and returns a dictionary with detailed information.

    Parameters:
    extracts_df (pd.DataFrame): DataFrame containing the columns 'entity', 'doc_id', 'character_index', and 'large_extract'.
    details_df (pd.DataFrame): DataFrame containing additional details about entities with at least an 'Entity' column.

    Returns:
    dict: Dictionary with sequential numbers as keys and details including extracts as values.
    """
    # Initialize an empty dictionary to store the entity details
    entity_dict = {}
    index = 1

    # Iterate through each unique entity in the extracts DataFrame
    for _, row in extracts_df.iterrows():
        entity = row['entity'].lower()
        large_extract = row['large_extract']

        # Initialize the dictionary entry for the entity
        entity_dict[index] = {
            'name': entity,
            'extract': large_extract,
            'doc_id': row['doc_id'],
            'character_index': row['character_index']
        }

        # Extract matching rows from the details DataFrame
        matching_rows = entities_df[entities_df['Entity'].str.lower() == entity.lower()]

        # Add additional details from the matching rows to the dictionary
        for col in ['Category','Tags', 'Dimensions']:
            if col in matching_rows.columns:
                entity_dict[index][col] = matching_rows[col].tolist()

        index += 1

    return entity_dict

In [184]:
def extract_details(entity, text, category):
    detail_extraction_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy.
s
   [Task]
   Your task is to extract detailed and focused information about the given >>>>>Entity<<<<< from the >>>>>Text<<<<<. This information will be used to extract relations to be added in the knowledge graph.
   Return the answer as a JSON object consisting of:
    1. 'Entity': name of the entity.
    2. 'Extract': extracted information about the entity.

    [Output Format]
    Return only the JSON object for relations.

    Now process the following entity and text:

    [Input]
    >>>>>Entity<<<<<
    {entity}

    >>>>>Category<<<<<
    {category}

    >>>>>Text<<<<<
    {text}

    """

    entity_details = get_answer(detail_extraction_prompt)
    print(entity_details)
    return entity_details

In [185]:
def extract_relations(entity, text, relation_labels):
    relation_extraction_prompt = f"""

    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to create a knowledge graph for sustainable energy. You will be given an >>>>>Entity<<<<<, a related >>>>>Text<<<<< and a list of >>>>>RelationLabels<<<<<.
s
   [Task]
   Your task is to extract relations for the the given >>>>>Entity<<<<< from the >>>>>Text<<<<< for the knowledge graph. If no relevant relation can be extracted, return empty object.
   Return the answer as a JSON object consisting of:
    1. 'Subject': given >>>>>Entity<<<<<.
    2. 'Object': object entity of the relation.
    3. 'Relation': label from the given >>>>>RELATION LABELS<<<<<.
    4. 'Description': description of the relation.
    5. 'Importance': score between 1 to 4 to each relation based on its importance in context of sustainable energy, with 4 being the highest score.

    [Output Format]
    [Example]
    "Subject":"energy efficiency policy",
    "Relation": "implements",
    "Object": "building standards",
    "Description": "Energy efficiency policy implements rigorous building standards that promote the use of energy-saving designs and materials.",
    "Importance": 4

    [Output Format]
    Return only a JSON object for relations.

    Now extract relation for the following input:

    [Input]
    >>>>>Entity<<<<<
    {entity}

    >>>>>RelationLabels<<<<<
    {relation_labels}

    >>>>>Text<<<<<
    {text}

    """

    relation = get_answer(relation_extraction_prompt)
    print(relation)
    return relation

In [160]:
documents = read_files('../02_Input/01_SEH_Academy/')

In [163]:
df = extract_entities_from_documents(documents, entities)

In [132]:
df

Unnamed: 0,entity,doc_id,character_index,large_extract
0,energy efficiency,SEA Module 8 - Climate Change Mitigation and A...,4389,"Furthermore, the adaptation strategies integra..."
1,bioenergy,SEA Module 8 - Climate Change Mitigation and A...,5203,A 2021 report by\nthe International Renewable ...
2,carbon neutrality,SEA Module 8 - Climate Change Mitigation and A...,6357,This includes efforts to reduce emissions of g...
3,biomass,SEA Module 8 - Climate Change Mitigation and A...,14933,Energy use in\nagriculture and fishing (1.7%) ...
4,biofuels,SEA Module 8 - Climate Change Mitigation and A...,18032,"Additionally, the incomplete combustion of bio..."
...,...,...,...,...
179,battery storage,SEA Module 2 - Energy Access and Inclusive Ene...,71328,--\nUNDP is implementing similar energy effici...
180,innovative financing,SEA Module 2 - Energy Access and Inclusive Ene...,72639,-- Highlight Eswatini's specific\ncharacteris...
181,energy markets,SEA Module 2 - Energy Access and Inclusive Ene...,80391,Setting realistic and measurable targets can b...
182,multi-stakeholder partnerships,SEA Module 2 - Energy Access and Inclusive Ene...,89217,-- Limit the length to 1 page. -- Emphasi...


In [165]:
entity_dict = group_entities_with_details(df, entities_data)

In [166]:
print (entity_dict)



In [192]:
entity_with_extracts = []

In [198]:
for item in entity_dict.values():
    entity_with_extracts.append(extract_details(item['name'], item['extract'], item['Category']))


{'Entity': 'gender mainstreaming', 'Extract': "Gender mainstreaming refers to the process of integrating a gender perspective into all policies, programs, and activities in order to promote gender equality and address gender disparities. In the context of sustainable energy, gender mainstreaming involves ensuring that women and men have equal access to and benefit from sustainable energy solutions, as well as addressing the specific energy needs and challenges faced by different genders. It includes integrating gender considerations in energy programming, policies, and projects, as well as promoting women's empowerment and participation in the clean energy sector."}
{'Entity': 'marginalized communities', 'Extract': 'Addressing Energy Inequalities Among Marginalized Communities\n\nThe Energy inequalities in marginalized Communities\n\n3.1  The Energy inequalities in marginalized Communities\n\n3.1.1  Remote/Rural Communities\n\n3.1.1.1  Remote Communities across Regions\n\n3.1.2  Indige

In [199]:
len(entity_with_extracts)

293

In [None]:
relations = []

In [205]:

for item in entity_with_extracts:
    relations.append(extract_relations(item['Entity'], item['Extract'], relation_labels))


{'Subject': 'clean cooking', 'Object': 'access', 'Relation': 'increases', 'Description': 'Access to clean cooking has increased.', 'Importance': 3}
{'Subject': 'energy efficiency', 'Object': 'SDG7 target areas', 'Relation': 'addresses', 'Description': 'Improving energy efficiency addresses one of the targeted efforts and actions embodied in SDG7 target areas.', 'Importance': 3}
{'Subject': 'decarbonization', 'Object': 'energy systems', 'Relation': 'requires', 'Description': 'Decarbonization requires a transformation in energy systems away from fossil fuels and toward clean and renewable sources, that promotes long-term resource availability.', 'Importance': 4}
{'Subject': 'biomass', 'Relation': 'can be converted to', 'Object': 'electricity, heat, cooling, and transport fuels, as well as materials and chemicals', 'Description': 'Biomass can be converted through a variety of biological, chemical and thermal processes to produce electricity, heat, cooling, and transport fuels, as well as 

In [206]:
len(relations)

292

In [207]:
timestamp = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
kg_json = json.dumps(relations, indent = 2)


with open('../03_Output/02_Augmented KGs/' + str(timestamp) + '.json', 'w') as file:
    file.write(kg_json)
    file.close()
