<h2> Energy Knowledge Graph Pipeline Entities GPT </h2>

In [2]:
##imports
import pandas as pd

## Extract relevant entities
Extract the entities listed in `entities.csv`; this list of entities guides the knowledge graph construction.

In [1]:
# Relative path to the CSV file that holds the entity list.
# NOTE(review): no visible cell reads this path; the entities used in the prompt
# are hard-coded in the `entities` list instead. Confirm which source is intended.
entities_list_path = '../02_Input/00_Metadata/entities.csv'


In [103]:
# Closed vocabulary of entity names the model is allowed to use; the list is joined
# with ", " and interpolated into the prompt text (as ENTITY TAGS).
# NOTE(review): hard-coded here; no visible cell loads it from entities_list_path.
# Confirm whether this list is meant to mirror entities.csv.
entities = [
    "UNDP",
    "United Nations",
    "Afghanistan",
    "Government",
    "Central Statistics Organization",
    "UNAMA",
    "Ministry of Interior",
    "Ministry of Justice",
    "Attorney General's Office",
    "Ministry of Women's Affairs",
    "National Police Programmes",
    "Ministry of Energy and Water (MEW)",
    "Da Afghanistan Breshna Sherkat (DABS)",
    "Africa",
    "Awoken",
    "Anou-Araren",
    "Statistical Committee of Armenia (ArmStat)",
    "Biomass",
    "Burkina Faso",
    "Energy Storage Systems (ESS)",
    "Foreign Direct Investment (FDI)",
    "Geographic Information System (GIS)",
    "Human Development Index (HDI)",
    "Households",
    "International Atomic Energy Agency (IAEA)",
    "LECO",
    "LPG",
    "Ministry of Transport and Civil Aviation(MTCA)",
    "The West African Power Pool (WAPP)",
    "United Nations Sustainable Development Cooperation Framework (UNSDCF)",
    "World Bank"
]

# Closed vocabulary of relation labels offered to the model; the list is joined with
# ", " and interpolated into the prompt text. Each label is listed exactly once so the
# prompt does not repeat an option.
relationships = [
    "SUPPORTED_BY",
    "PROVIDE_SUPPORT",
    "HAS_ENERGY_RESOURCE",
    "EQUIVALENT_TO",
    "CONTAINS",
    "PROPOSED",
    "PARTICIPATED_IN",
    "HAS_PARTNERSHIP_WITH",
    "SOLVED",
    "HAS_RELATIONSHIP_WITH",
    "RELATED_TO",
    "CORRESPONDS_TO",
    "HAS_PROPERTY",
    "REPRESENTS",
    "IS_USED_IN",
    "DISCOVERED",
    "FOUND",
    "IS_SOLUTION_TO",
    "PROVED",
    "LIVED_IN",
    "LIKED",
    "COLLABORATE_WITH",
    "CONTRIBUTED_TO",
    "IMPLIES",
    "DESCRIBES",
    "DEVELOPED",
    "USED_FOR",
]


# Instruction text that process_gpt4 places in front of every text chunk it sends.
# This is an f-string, so the entity and relationship lists are joined into the text
# once, when this cell runs; re-run the cell after editing either list.
# Doubled braces ({{ and }}) are f-string escapes that render as literal { and } in
# the example output format.
prompt = f"""
Ignore previous commands!!!
You are an International Relations Expert and a scientist helping us extract relevant information from text. 
The task is to extract as many relevant relationships between entities in a given text.
Specifically, the only entity tags you may use are:
ENTITY TAGS = {', '.join(entities)}.
The only relationships you may use are:
RELATIONSHIP = {', '.join(relationships)}

The output should have the following format: "relations": {{
    "Entity": [
        {{
            "Relation": "RELATIONSHIP",
            "Object": "Entity",
            "Description": "Text content "
        }},
    ]
}}

Only find relationship between the ENTITY TAGS in the text. 
Where Entity are strictly from here: {', '.join(entities)}. Only find relationship in the text between these.
Donnot use entities not in the ENTITY TAGS above for the Object.
I am only interested in the relationships in the above format and you can only use what you find in the text provided. Also, you should not provide relationships already found and you should choose less than 100 relationships and the most important ones.
You should only take the most important relationships as the aim is to build a knowledge graph. Rather a few but contextual meaningful than many nonsensical. 
Moreover, you should only tag entities with one of the allowed tags if it truly fits that category and I am only interested in general entities such as "Shape HAS Area" rather than "Shape HAS Area 1".
The input text is the following:
"""



In [132]:
import os
import openai
import time
import json
import os
import re
# Take the OpenAI key from the OPENAI_API_KEY environment variable so the secret never
# has to be pasted into (and saved with) the notebook. When the variable is unset this
# falls back to the empty string, which is what the notebook assigned before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def process_gpt4(text):
    """Send the extraction prompt followed by ``text`` to GPT-4 and return the reply.

    The module-level ``prompt`` is concatenated with ``text`` into a single user
    message; the request uses model "gpt-4" at temperature 0.

    Args:
        text: The text chunk to append to the prompt.

    Returns:
        The message content of the first choice in the API response.
    """
    user_message = {"role": "user", "content": prompt + text}
    completion = openai.chat.completions.create(
        model="gpt-4",
        temperature=0,
        messages=[user_message],
    )
    return completion.choices[0].message.content

 

def clean_json(json_data):
    """Deduplicate entities and drop empty relation lists, in place.

    Args:
        json_data: Dict with a "knowledge graph" entry that holds an "entities"
            list (each item a dict with an "entity" key) and a "relations" dict.

    Returns:
        The same ``json_data`` object, modified in place.
    """
    graph = json_data["knowledge graph"]

    # One record per entity name: a later duplicate replaces the earlier record
    # but keeps the position of the first occurrence.
    latest_by_name = {}
    for record in graph["entities"]:
        latest_by_name[record["entity"]] = record
    graph["entities"] = list(latest_by_name.values())

    # Keep only the relation entries whose value is non-empty.
    non_empty = {}
    for subject, found in graph["relations"].items():
        if found:
            non_empty[subject] = found
    graph["relations"] = non_empty

    return json_data



def split_into_paragraphs(text, group_size=4):
    """Split ``text`` into paragraphs and return them in consecutive groups.

    Paragraphs are separated by a blank line (two newline characters). Paragraphs
    that are empty or whitespace-only are dropped, so they neither take up a slot
    in a group nor produce a group with no content (an empty ``text`` yields
    ``[]``).

    Args:
        text: The full text to split.
        group_size: Maximum number of paragraphs per group; defaults to 4.

    Returns:
        A list of lists of paragraph strings; every group has ``group_size``
        paragraphs except possibly the last one.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    # Modify the delimiter here if the input uses a different paragraph separator.
    paragraphs = [paragraph for paragraph in text.split('\n\n') if paragraph.strip()]
    return [
        paragraphs[start:start + group_size]
        for start in range(0, len(paragraphs), group_size)
    ]



### Process data and extract entities
Read each `.txt` data file, split it into groups of paragraphs, and pass each group to `process_gpt4`
to extract entities and relationships from the text.

In [None]:
import concurrent.futures
import json
import os
import nltk
# Requests the NLTK 'punkt' resource when this cell runs.
# NOTE(review): no visible cell calls any nltk function; confirm this download is still needed.
nltk.download('punkt')

def _parse_relations(raw_response):
    """Parse one raw GPT reply into a ``{entity: [relation, ...]}`` dict, or ``None`` if unparsable."""
    if not raw_response:
        return None

    # The prompt asks for `"relations": {...}` without enclosing braces, so the reply
    # as a whole is not valid JSON; parse the outermost {...} span instead.
    start = raw_response.find('{')
    end = raw_response.rfind('}')
    if start == -1 or end < start:
        return None
    candidate = raw_response[start:end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # The format example in the prompt contains a trailing comma; retry once with
        # trailing commas before a closing bracket/brace removed.
        try:
            parsed = json.loads(re.sub(r',\s*([\]}])', r'\1', candidate))
        except json.JSONDecodeError:
            return None

    # Accept both `{"relations": {...}}` and a bare `{entity: [...]}` mapping.
    if isinstance(parsed, dict) and isinstance(parsed.get("relations"), dict):
        parsed = parsed["relations"]
    return parsed if isinstance(parsed, dict) else None


def process_file(file_path, output_folder_path='../02_Output/00_By-Document/02_National Policy/AFG'):
    """Extract relations from one text file with GPT-4 and save them as JSON.

    The file is split into paragraph groups, each group is sent to ``process_gpt4``,
    and every reply is parsed and merged into one ``{entity: [relation, ...]}`` dict.
    The merged result is passed through ``clean_json`` and written to
    ``output_folder_path`` under the input file's base name with a ``.json``
    extension.

    Args:
        file_path: Path of the text file to process.
        output_folder_path: Folder for the JSON output; created if missing.

    Returns:
        The path of the JSON file that was written.
    """
    # Read everything first so the file is not held open during the API calls.
    with open(file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()

    # Splitting text into groups of paragraphs
    paragraphs_groups = split_into_paragraphs(text_content)

    merged_relations = {}
    for group in paragraphs_groups:
        # Join the paragraphs within the group
        text_group = '\n\n'.join(group)
        if not text_group.strip():
            # Nothing to extract from; skip the API call.
            continue

        # process_gpt4 returns the reply as one string. It has to be parsed before it
        # is merged: extending a list with a string would add its single characters.
        raw_response = process_gpt4(text_group)
        parsed = _parse_relations(raw_response)
        if parsed is None:
            print(f"Warning: skipped an unparsable GPT response for {file_path}")
            continue

        for subject, found in parsed.items():
            if not isinstance(found, list):
                found = [found]
            merged_relations.setdefault(subject, []).extend(found)

    # Constructing JSON data; "relations" is a dict because clean_json iterates it
    # with .items().
    json_data = {
        "metadata": {
            # Add your metadata extraction logic here
        },
        "knowledge graph": {
            "entities": [],
            "relations": merged_relations
        }
    }
    cleaned_json = clean_json(json_data)

    # Save as JSON. Only the final extension is swapped, so a ".txt" elsewhere in the
    # file name is left alone.
    os.makedirs(output_folder_path, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_file_path = os.path.join(output_folder_path, base_name + '.json')
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(cleaned_json, output_file, indent=4)
    return output_file_path

# Folder holding the cleaned .txt documents to process.
folder_path = '../02_Input/01_Cleaned-Text/02_National Policy/AFG'

# Full path of every .txt file directly inside that folder.
file_paths = [
    os.path.join(folder_path, entry)
    for entry in os.listdir(folder_path)
    if entry.endswith(".txt")
]

# Process the files concurrently; leaving the `with` block waits for all workers.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(process_file, file_paths)

# Iterating the results re-raises any exception a worker hit.
for result in results:
    print(f"Processed file saved at: {result}")


## TEST FUNCTION

In [156]:
# # Run a quick test
# relations = process_gpt4(f"""
    
 
#  """)

  
# print(relations)

"relations": {
    "Afghanistan": [
        {
            "Relation": "HAS_RELATIONSHIP_WITH",
            "Object": "UNDP",
            "Description": "The standalone segment has significant potential in Afghanistan with the centralized grid not being a cost-effective option in several pockets of the country dominated by mountainous terrain and spatially dispersed communities."
        },
        {
            "Relation": "HAS_RELATIONSHIP_WITH",
            "Object": "Da Afghanistan Breshna Sherkat (DABS)",
            "Description": "Power purchase is governed by the PPA between each project developer and the utility i.e. DABS. Feed-in tariffs may be used to pre-determine tariffs for SPPs."
        },
        {
            "Relation": "HAS_RELATIONSHIP_WITH",
            "Object": "Ministry of Energy and Water (MEW)",
            "Description": "Ministry of Energy and Water: Policy direction for setting up of the SPPs. These shall be classified under separate notification for a “SPP