In [1]:
import openai
import os
import json
import requests

import numpy as np
import torch, torchvision
from transformers import AutoTokenizer, TFAutoModelForTokenClassification
from transformers import pipeline
from span_marker import SpanMarkerModel
from langdetect import detect
from langdetect import LangDetectException
import re
import pickle
import time
from langchain.text_splitter import SpacyTextSplitter

#nltk.download('punkt')


2024-01-18 21:22:16.695616: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
from dotenv import load_dotenv

load_dotenv()

True

Initialize Entity Categories and Relation Labels

In [2]:
# Entity categories offered to the LLM when it classifies extracted entities.
categories = [
    "Person", "Location", "Organization", "Event",
    "Product", "Project", "Skill", "Strategy",
]


In [3]:
# Closed vocabulary of relation labels the LLM may use between two entities.
relation_labels = [
    "implements", "funds", "focuses_on", "in",
    "partners_with", "contributes_to", "monitors", "targets",
    "addresses", "employs", "collaborates_with", "supports",
    "administers", "measures", "aligns_with", "an_instance_of",
]

# Setting up OpenAI connection

In [11]:
# OpenAI API configuration
# The module-level openai client is pointed at an Azure OpenAI resource.
openai.api_type = "azure"
# Key, endpoint and API version are read from environment variables
# (presumably populated from a .env file by load_dotenv() — confirm).
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# Name of the chat deployment used for completions.
openai_deployment = "sdgi-gpt-35-turbo-16k"

#openai.api_key = os.getenv("OPENAI_KEY")


In [12]:

def get_answer(user_question, timeout_seconds):
    """Send a single-turn prompt to the configured chat deployment.

    Args:
        user_question: Prompt text, sent as the only ``user`` message.
        timeout_seconds: Per-request timeout, forwarded as ``request_timeout``.

    Returns:
        The reply text of the first choice, or an empty list ``[]`` when the
        request times out. The ``[]`` sentinel is kept for backward
        compatibility; callers must check for it before parsing the reply.
    """
    messages = [
        {'role': 'user', 'content': user_question},
    ]
    try:
        response = openai.ChatCompletion.create(
            # Use the deployment configured in the setup cell rather than a
            # second hard-coded copy of its name.
            engine=openai_deployment,
            messages=messages,
            temperature=0.2,
            request_timeout=timeout_seconds
            # max_tokens=2000
        )
        return response.choices[0].message["content"]
    # The legacy openai client wraps transport timeouts in
    # openai.error.Timeout, so catching requests.Timeout alone never fires.
    except (requests.Timeout, openai.error.Timeout):
        print(f"Request timed out")
        return []
   

# Entity Extraction using Transformers 


In [15]:
# Hugging Face Inference API endpoints of the two token-classification models.
WIKI_API = "https://api-inference.huggingface.co/models/Babelscape/wikineural-multilingual-ner"
BERT_API = "https://api-inference.huggingface.co/models/dslim/bert-base-NER"

# The access token is read from the HF_API_TOKEN environment variable instead
# of being committed in the notebook. Any token that was previously hard-coded
# here must be treated as leaked and revoked.
# Without a token no Authorization header is sent.
_hf_api_token = os.getenv("HF_API_TOKEN")
headers = {"Authorization": f"Bearer {_hf_api_token}"} if _hf_api_token else {}

def query_wiki(payload, timeout=60):
    """POST ``payload`` to the ``WIKI_API`` endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "some text"}``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(WIKI_API, headers=headers, json=payload, timeout=timeout)
    # Fail loudly here instead of handing an error payload to the entity
    # post-processing, where it would surface as an unrelated exception.
    response.raise_for_status()
    return response.json()
	

def query_bert(payload, timeout=60):
    """POST ``payload`` to the ``BERT_API`` endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "some text"}``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(BERT_API, headers=headers, json=payload, timeout=timeout)
    # Fail loudly here instead of handing an error payload to the entity
    # post-processing, where it would surface as an unrelated exception.
    response.raise_for_status()
    return response.json()


In [16]:
def query_gpt(text):
    """Ask the chat model for the acronyms and proper nouns found in ``text``.

    Args:
        text: The text section to analyse.

    Returns:
        The parsed JSON reply. The prompt asks for an object with the keys
        ``"acronyms"`` (acronym -> full name) and ``"proper_nouns"`` (list of
        strings), which are the keys the extraction loop reads. When the
        request times out, an object with both keys empty is returned.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # The output keys are spelled out because callers index the result by
    # 'acronyms' and 'proper_nouns'.
    entities_prompt = f"""

    You will be given a >>>>>TEXT<<<<<. You have two tasks:
    
    1. Your first task is to detect acronyms with their names and store them in a dictionary
       that maps each acronym to its full name.
    2. Your second task is to detect Proper Nouns in the text and store them in a list.
    
    Return a single JSON object with exactly two keys: "acronyms" (the dictionary)
    and "proper_nouns" (the list). Return only the JSON, without any additional text.

    >>>>>TEXT<<<<<
    {text}

    
    """

    answer = get_answer(entities_prompt, 10)
    if not answer:
        # get_answer signals a timeout with an empty list, which json.loads
        # cannot parse; report "nothing found" in the expected shape instead.
        return {"acronyms": {}, "proper_nouns": []}

    return json.loads(answer)
    


# Text Pre-Processing 

In [17]:
def split_text_spacy(chunk_size, text):
    """Split ``text`` with a ``SpacyTextSplitter`` built for ``chunk_size``.

    Args:
        chunk_size: Value passed to the splitter as ``chunk_size``.
        text: The text to split.

    Returns:
        Whatever ``SpacyTextSplitter.split_text`` returns for ``text``.
    """
    return SpacyTextSplitter(chunk_size=chunk_size).split_text(text)

In [57]:
def get_text_section(limit, text):
    """Cut ``text`` into consecutive sections of at most ``limit`` characters.

    Each section ends just after the last '.', newline or ';' that fits inside
    the ``limit`` window. If the window contains no such delimiter, the text
    is cut hard at ``limit`` characters so that the scan always advances.
    Sections rejected by ``is_valid_section`` are printed and discarded.

    Args:
        limit: Maximum section length in characters; must be at least 1.
        text: The text to split.

    Returns:
        The list of valid sections, in document order.

    Raises:
        ValueError: If ``limit`` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    delimiters = ('.', '\n', ';')
    sections_list = []
    length = len(text)
    i = 0

    # Run to the true end of the text so the final character is not dropped.
    while i < length:
        j = min(i + limit, length)

        if j < length:
            # Walk back to just after the last delimiter in text[i:j], but
            # never past the section start: scanning further back would reuse
            # the previous section's delimiter and stall the loop.
            k = j
            while k > i and text[k - 1] not in delimiters:
                k -= 1
            if k > i:
                j = k

        section = text[i:j]

        if is_valid_section(section):
            sections_list.append(section)
        else:
            print("INVALID SECTION DETECTED")
            print(section)
        i = j

    return sections_list


def is_valid_section(section):
    """Return True when ``section`` is non-empty and longer than 20 characters."""
    return bool(section) and len(section) > 20



In [18]:
def clean_text(input_text):
    """Blank out text matched by a fixed series of line-anchored patterns.

    The patterns are applied in order with ``re.MULTILINE`` and every match
    is replaced by an empty string.

    Args:
        input_text: The text to clean.

    Returns:
        The text after all substitutions.
    """
    line_patterns = (
        # whitespace-only lines
        r'^\s*$',
        # lines made only of uppercase letters and whitespace (potential headings)
        r'^\s*[A-Z\s]+\s*$',
        # lines of two or more consecutive uppercase words (potential headings)
        r'^\s*(?:[A-Z]+\s*){2,}\s*$',
        # letters, then a run of three or more dots, then a number
        r'^\s*[A-Za-z\s]+\.{3,}\s*\d+\s*$',
    )

    cleaned = input_text
    for line_pattern in line_patterns:
        cleaned = re.sub(line_pattern, '', cleaned, flags=re.MULTILINE)
    return cleaned

def is_english(line):
    """Report whether ``detect`` classifies ``line`` as English.

    Args:
        line: The text to classify.

    Returns:
        True when ``detect(line)`` equals ``'en'``. When ``detect`` raises
        ``LangDetectException`` the problem is printed and False is returned.
    """
    try:
        language = detect(line)
    except LangDetectException as e:
        print(f"An exception occurred: {e} : {line}")
        return False
    return language == 'en'

In [19]:
# Directory that is scanned for input documents.
folder_path = 'Data/'
file_list = os.listdir(folder_path)

# Keep only the entries whose name ends with ".txt".
text_files = list(filter(lambda name: name.endswith(".txt"), file_list))

print(f"Number of files: {len(text_files)}\n")
print(text_files)

Number of files: 8

['ALB-NES-2018-EN.txt', 'ALB-NETS-2019-EN.txt', 'ALB-CPD-2021-EN.txt', 'ALB-NREP-2021-EN.txt', 'ALB-NREAP-2016-EN.txt', 'ALB-NREAP-2015-EN.txt', 'ALB-NEP-2013-EN.txt', 'ALB-NECP-2021-EN.txt']


In [20]:
file_path = os.path.join(folder_path, text_files[0])

In [21]:
# Read the whole document into memory. The ``with`` block closes the file on
# exit, so no explicit close() call is needed.
with open(file_path, 'r') as file:
    raw_text = file.read()

print(f"Original text length: {len(raw_text)}")

Original text length: 266968


In [22]:
 # Open the file in read mode
# Lines that contain a run of three dots are dropped from the body text.
pattern = re.compile(r'.*?\.{3}.*?$', re.MULTILINE)

with open(file_path, 'r') as file:
    # The first 11 lines are kept aside as the document header
    # (they are parsed into `metadata` in a later cell).
    head = [next(file) for _ in range(11)]
    # One more line is consumed and discarded before the body starts.
    next(file)

    # Keep a body line only if it has no dot run and is detected as English.
    # The pattern check comes first so is_english() is not called (and does
    # not print) for lines that are dropped anyway.
    kept_lines = []
    for line in file:
        if pattern.search(line):
            continue
        if is_english(line):
            kept_lines.append(line)

raw_text = ''.join(kept_lines)

print(f"Read text length: {len(raw_text)}")

text = clean_text(raw_text)

print(f"Cleaned text length: {len(text)}")


An exception occurred: No features in text. : 

An exception occurred: No features in text. : (2017-2030)

An exception occurred: No features in text. :  

An exception occurred: No features in text. :  

An exception occurred: No features in text. : 2017);

An exception occurred: No features in text. : 7.09.2011)

An exception occurred: No features in text. : 1.12.2017);

An exception occurred: No features in text. : 2015);

An exception occurred: No features in text. : 16.11.2016);

An exception occurred: No features in text. : 25.3%

An exception occurred: No features in text. : 8.2%

An exception occurred: No features in text. : 19.5%

An exception occurred: No features in text. : 37.8%

An exception occurred: No features in text. : 5.3%

An exception occurred: No features in text. :  

An exception occurred: No features in text. : 5

An exception occurred: No features in text. :  

An exception occurred: No features in text. :  

An exception occurred: No features in text. :  800,

An exception occurred: No features in text. :  

An exception occurred: No features in text. : 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2018-2030

An exception occurred: No features in text. : 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2018-2030

An exception occurred: No features in text. : 68 82 100 35 15 10 7 5 5 5 5 5 5 347.0

An exception occurred: No features in text. : 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2018-2030

An exception occurred: No features in text. : 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2018-

An exception occurred: No features in text. : 2030

An exception occurred: No features in text. : 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2018-

An exception occurred: No features in text. : 2030

An exception occurred: No features in text. : 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2018-

An exception occurred: No features in tex

In [23]:
metadata = {}

# Each header line is expected to look like "Key: value".
for item in head:
    # Split on the FIRST ':' only, so a value that itself contains a colon
    # (a title, a timestamp, a URL) is kept whole instead of raising on unpack.
    key, separator, value = item.partition(':')
    if not separator:
        # Not a key/value line (e.g. blank); skip it rather than crash.
        continue

    # Add the key-value pair to the dictionary
    metadata[key.strip()] = value.strip()

# The 'Exists?' entry is not kept in the metadata.
metadata.pop('Exists?', None)
print(metadata)


{'File Name': 'ALB-NES-2018-EN', 'Year': '2018', 'Country Name': 'Albania', 'Country Code': 'ALB', 'Category': 'NES', 'Document Title': 'USAID ENERGY STRATEGY FOR ALBANIA (2017-2030)', 'Publication Date': 'January 2018', 'Start Year': '2018', 'End Year': '2018', 'Language': 'EN'}


In [26]:
print (text)

Enhancing Capacity for Low Emission Development 
The author’s views expressed in this publication do not necessarily reflect the views of the United States Agency for International 
Development or the United States Government.
January 2018
This publication was produced for review by the United States 
Agency for International Development. 
It was prepared by RTI international. ENERGY STRATEGY FOR ALBANIA
Development of domestic energy sources, leading to a 
regional integrated and diversified energy system based on 
sustainable development of the economy, ensuring 
security and quality of supply, safety, environmental 
protection and climate action, and increased welfare at 
Prepared by: IRG an RTI Company
Stephen Nash
Thomas O’Conner
David Parish
Fred WidicusTABLE OF CONTENTS
1. CONTEXT AND CURRENT CONDITIONS 1
1.4 CURRENT ENERGY SUPPLY AND CONSUMPTION 5
1.5 LEGAL AND INSTITUTIONAL REFORMS IN THE ENERGY SECTOR 6
2.1 STRATEGIC CONTEXT 10
3.9 NEED FOR A STRATEGIC PLAN FOR OIL AND NATURA

In [24]:
text_sections = split_text_spacy(2000, text)

span-marker is already registered. Overwriting pipeline for task span-marker...
Created a chunk of size 2246, which is longer than the specified 2000


In [25]:
print (f"The number of sections from the text: {len(text_sections)}")

The number of sections from the text: 143


# Entities Post-Processing Methods

In [27]:
# merging the broken entities
def create_entities(lst):
    """Merge '##'-prefixed continuation tokens into the preceding entity, in place.

    An item whose ``"word"`` starts with ``'##'`` is joined onto the word of
    the item before it (without the ``'##'`` marker), takes the higher of the
    two scores, and the preceding item is removed.

    Args:
        lst: List of dicts with at least ``"word"`` and ``"score"`` keys.
            The list is modified in place.

    Returns:
        The same list object, so the merged entities can also be used as a
        return value. Callers that ignore it keep working.
    """
    i = 1
    while i < len(lst):
        if lst[i]["word"].startswith('##'):
            lst[i]["word"] = lst[i-1]["word"] + lst[i]["word"][2:]
            lst[i]["score"] = max(lst[i-1]["score"], lst[i]["score"])
            # After the delete the merged item sits at i-1, so `i` already
            # points at the next unseen item and must not be advanced.
            del lst[i-1]
        else:
            i += 1
    return lst
            


def apply_threshold(list_, threshold):
    """Return the words of all items scoring strictly above ``threshold``.

    Args:
        list_: Iterable of dicts with ``'score'`` and ``'word'`` keys.
        threshold: Items with a score equal to or below this are dropped.

    Returns:
        List of ``'word'`` values, in input order.
    """
    return [entry['word'] for entry in list_ if entry['score'] > threshold]


In [28]:
def get_raw(list_):
    """Build one lookup dict per sub-list, keyed by letters-only entity text.

    For every entity string the key is the string with all non-alphabetic
    characters removed and the value is the original string. When two
    entities of a sub-list share a key, the later one wins.

    Args:
        list_: Iterable of iterables of entity strings.

    Returns:
        List of dicts, one per element of ``list_``, in the same order.
    """
    return [
        {''.join(filter(str.isalpha, entity)): entity for entity in entities}
        for entities in list_
    ]

In [29]:
def merge_extracted_entities(wiki, bert, gpt):
    """Return the entities that at least two of the three extractors agree on.

    Note: a later cell defines a function with the same name, which replaces
    this one once that cell has been run.

    Args:
        wiki: Dict mapping a normalised key to the entity text from the wiki model.
        bert: Same mapping for the BERT model.
        gpt: Same mapping for the GPT prompt. It is not modified.

    Returns:
        List of entity texts whose key appears in at least two of the inputs.
        For a key present in both ``gpt`` and ``wiki`` the wiki text is used.
        The order of the list is unspecified.
    """
    # Take the key sets BEFORE combining the dicts. Merging wiki into the gpt
    # dict first would make every wiki key look like a gpt/wiki match.
    wiki_set = set(wiki)
    bert_set = set(bert)
    gpt_set = set(gpt)

    matched = (gpt_set & bert_set) | (bert_set & wiki_set) | (gpt_set & wiki_set)

    # Build the lookup in a fresh dict so the caller's `gpt` is left untouched.
    # Every matched key is in gpt or wiki, so the lookup below cannot miss.
    surface_forms = {**gpt, **wiki}

    return [surface_forms[key] for key in matched]

In [30]:
def merge_extracted_entities(wiki, bert, gpt):
    """Combine all wiki entities with the entities GPT and BERT agree on.

    Every wiki value is accepted. In addition, for each key found in both
    ``gpt`` and ``bert`` but not in ``wiki``, the GPT text is accepted. The
    keys matched that way are printed.

    Args:
        wiki: Dict mapping a normalised key to the entity text from the wiki model.
        bert: Same mapping for the BERT model (only its keys are used).
        gpt: Same mapping for the GPT prompt.

    Returns:
        Set of accepted entity texts.
    """
    output = set(wiki.values())

    # BERT keys the wiki model did not produce, cross-checked against GPT.
    bert_only_keys = set(bert.keys()) - set(wiki.keys())
    matched = list(set(set(gpt.keys()).intersection(bert_only_keys)))
    print("GPT/BERT: " + str(matched))

    output.update(gpt[key] for key in matched)
    return output

In [31]:
def validate_entities(list_):
    """Strip whitespace around ’ and / inside each entity string.

    Args:
        list_: Iterable of entity strings.

    Returns:
        New list with the normalised strings, in input order.
    """
    # Patterns matching the character together with any surrounding whitespace.
    apostrophe_pattern = r'\s*{}\s*'.format(re.escape("’"))
    slash_pattern = r'\s*{}\s*'.format(re.escape("/"))
    output_list = []

    for item in list_:
        item = re.sub(apostrophe_pattern, "’", item)
        item = re.sub(slash_pattern, "/", item)
        # Collect the result; without this the function always returned [].
        output_list.append(item)

    return output_list



In [32]:
def save_checkpoint(index, wiki, bert, gpt, acronym):
    """Pickle the extraction progress to ``checkpoint.pkl`` in the working directory.

    Args:
        index: Stored under the ``'index'`` key.
        wiki: Stored under the ``'wiki'`` key.
        bert: Stored under the ``'bert'`` key.
        gpt: Stored under the ``'gpt'`` key.
        acronym: Stored under the ``'acronym'`` key.
    """
    state = dict(index=index, wiki=wiki, bert=bert, gpt=gpt, acronym=acronym)
    with open('checkpoint.pkl', 'wb') as checkpoint_file:
        pickle.dump(state, checkpoint_file)

# Function to load the state
def load_checkpoint(length):
    """Load the extraction progress from ``checkpoint.pkl``, or start fresh.

    Args:
        length: Number of text sections; used to size the empty placeholder
            lists when no checkpoint file exists.

    Returns:
        A 5-tuple ``(index, wiki, bert, gpt, acronym)``. Without a checkpoint
        file this is ``(0, [''] * length, [''] * length, [''] * length, {})``.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # checkpoint file that this notebook wrote itself.
        with open('checkpoint.pkl', 'rb') as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)
    except FileNotFoundError:
        return 0, [''] * length, [''] * length, [''] * length, {}

    # The parentheses keep all five values in ONE return statement. A bare
    # trailing comma followed by a newline would end the tuple after three.
    return (
        checkpoint['index'],
        checkpoint['wiki'],
        checkpoint['bert'],
        checkpoint['gpt'],
        checkpoint['acronym'],
    )



In [33]:
text_length = len(text_sections)

In [34]:
# Load the last checkpoint
start_index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms = load_checkpoint(text_length)

In [35]:
# Reset the extraction state: index 0, one empty-string placeholder per text
# section for each extractor, and no acronyms.
# NOTE(review): these are the same names the load_checkpoint cell assigns, so
# running this cell afterwards discards whatever was restored — confirm that
# is intended.
start_index = 0
wiki_entity_list = [''] * text_length
bert_entity_list = [''] * text_length
gpt_entity_list = [''] * text_length
acronyms = {}

In [61]:
print (wiki_entity_list[0])

['RTI', 'United States Agency for International Development', 'United States Government', 'Enhancing Capacity for Low Emission Development', 'National Agency for Natural Resources']


In [48]:
# Run the three entity extractors over a window of text sections.
# NOTE: the window is fixed below and does not use `start_index`; widen
# `section_indices` to process more of the document.

start_time = time.time()
section_indices = range(71, 83)
processed_count = 0

for index in section_indices:
    try:
        segment = text_sections[index]

        ## WIKINEURAL MULTILINGUAL MODEL
        wiki_output = query_wiki({
            "inputs": segment,
        })
        create_entities(wiki_output)
        wiki_words = list(set(apply_threshold(wiki_output, 0.7)))
        wiki_entity_list[index] = wiki_words
        print ("WIKI DONE")

        ## BERT BASE MODEL
        bert_output = query_bert({
            "inputs": segment,
        })
        create_entities(bert_output)
        bert_words = list(set(apply_threshold(bert_output, 0.7)))
        bert_entity_list[index] = bert_words
        print ("BERT DONE")

        ## GPT PROMPT
        gpt_output = query_gpt(segment)
        gpt_entity_list[index] = gpt_output['proper_nouns']

        print ("GPT DONE")

        ## Acronyms extraction
        acronyms.update(gpt_output['acronyms'])

        print(f"NUMBER OF PROCESSED SECTIONS: {index}")
        processed_count += 1

        # Checkpointing is currently disabled; to re-enable, save at intervals:
        #if index % 5 == 0:
            #save_checkpoint(index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms)

    except Exception as e:
        print(f"Error processing section {index}: {str(e)}")
        #save_checkpoint(index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms)

        # A failed section is skipped and the loop moves on to the next one.
        # Results already stored for this section by earlier extractors remain.
        continue

end_time = time.time()
elapsed_time = end_time - start_time
# Report the sections actually processed, not the size of the whole document.
print(f"TIME TAKEN TO EXTRACT ENTITIES from {processed_count} of {len(section_indices)} sections: {elapsed_time}")

WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 71
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 72
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 73
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 74
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 75
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 76
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 77
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 78
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 79
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 80
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 81
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 82
TIME TAKEN TO EXTRACT ENTITIES from 143 section: 31.675292015075684


In [49]:
print (acronyms)

{'USAID': 'United States Agency for International Development', 'USG': 'United States Government', 'RTI': 'Research Triangle Institute', 'IRG': 'International Resources Group', 'AKBN': 'Albanian Energy Regulatory Authority', 'ALBGAZ': 'New public company, ALBGAZ, was established with the functions and responsibilities of a combined transmission and distribution system operator of natural gas.', 'Albpetrol': 'Albanian Petroleum Corporation', 'CESEC': 'Central and South-Eastern European Gas Connectivity', 'DCM': 'Decision of the Council of Ministers', 'EBRD': 'European Bank for Reconstruction and Development', 'ECT': 'Energy Community Treaty', 'EE': 'Energy Efficiency', 'EE Agency': 'Energy Efficiency Agency', 'EED': 'Energy Efficiency Directive (2012/27/EU)', 'EE Fund': 'Energy Efficiency Fund', 'ECS': 'Energy Community Secretariat', 'ENTSO-e': 'European Network of Transmission System Operators-electricity', 'ENTSO-g': 'European Network of Transmission System Operators-gas', 'IEA': 'Int

In [39]:
# Inspection only: for the first 10 sections, print the entity lists of the
# wiki, BERT and GPT extractors one after another, separated by a rule.
i = 0
while i < 10:
    #print (text_sections[i])
    print (wiki_entity_list[i])
    print (bert_entity_list[i])
    print (gpt_entity_list[i])
    print ("--------")
    i = i+1

['RTI', 'United States Agency for International Development', 'United States Government', 'Enhancing Capacity for Low Emission Development', 'National Agency for Natural Resources']
['ALBANI', 'RTI', 'OIL', 'United States Agency for International Development', 'an RTI Company', 'United States Government', 'IR', 'Parish', 'Stephen Nash', 'National Agency for Natural Resources']
['Albania', 'RTI', 'Stephen Nash', 'Thomas O’Conner', 'David Parish', 'Fred Widicus', 'IRG', 'USAID', 'USG', 'January']
--------
['EE Fund Energy Efficiency Fund ECS Energy Community Secretariat', 'hours IMF International Monetary Fund INDC Intended Nationally Determined Contribution INSTAT Albanian Institute of Statistics IPP Independent Power Producer IPS Integrated Planning System LEAP Long - range Energy Alternatives Planning System LEDS Low Emission Development Strategies MIE Ministry of Infrastructure and Energy NEEAP National Energy Efficiency Action Plan NREAP National Renewable Energy Action Plan NSDI - 

In [50]:
print (wiki_entity_list[73])

['Power Sector Law', 'EU', 'Albanian', 'Albania']


Processing the Entities 

In [51]:
# Get raw version of entities for comparison:
# per section, get_raw() builds a dict that maps each entity's letters-only
# key to its original text, so the extractors' outputs can be matched by key.
raw_wiki = get_raw(wiki_entity_list)
raw_bert = get_raw(bert_entity_list)
raw_gpt = get_raw(gpt_entity_list)

In [62]:
entity_objects = []

# Merge the three extractors' results section by section. zip/enumerate
# replace the manual index counter; section numbers are reported 1-based.
for section_number, (wiki_raw, bert_raw, gpt_raw) in enumerate(
        zip(raw_wiki, raw_bert, raw_gpt), start=1):
    merged = merge_extracted_entities(wiki_raw, bert_raw, gpt_raw)
    print (f"\nThe number of matching entities in section {section_number}: {len(merged)}\n")
    print (merged)

    print ("\n--------------")

    entity_objects.append(merged)

GPT/BERT: ['StephenNash']

The number of matching entities in section 1: 6

{'RTI', 'United States Agency for International Development', 'United States Government', 'Enhancing Capacity for Low Emission Development', 'Stephen Nash', 'National Agency for Natural Resources'}

--------------
GPT/BERT: ['ECS', 'EEAgency', 'EBRD']

The number of matching entities in section 2: 13

{'EE Fund Energy Efficiency Fund ECS Energy Community Secretariat', 'hours IMF International Monetary Fund INDC Intended Nationally Determined Contribution INSTAT Albanian Institute of Statistics IPP Independent Power Producer IPS Integrated Planning System LEAP Long - range Energy Alternatives Planning System LEDS Low Emission Development Strategies MIE Ministry of Infrastructure and Energy NEEAP National Energy Efficiency Action Plan NREAP National Renewable Energy Action Plan NSDI - II Second National Strategy for Development and Integration OSHEE Distribution System Operator OST Albanian Transmission System Op

#  Categorize entities

Zero Shot Prompt

In [54]:
def categorize_entities(text, entities, categories):
    """Ask the chat model to assign one of ``categories`` to each entity.

    Args:
        text: The text section the entities were extracted from.
        entities: Collection of entity strings to categorise.
        categories: Allowed category names.

    Returns:
        The parsed JSON reply. The prompt asks for a list of objects with the
        keys ``"entity"`` and ``"category"``, which downstream cells read.
        An empty list is returned when the request times out.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # The output keys are spelled out because downstream code reads
    # item["entity"] from every returned object.
    categorization_prompt = f"""

    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and >>>>>Categories<<<<<. 
    Your task is to assign a suitable category to each element of >>>>>EntityList<<<<<.
    
    Return a list of JSON objects of categorized entities. Each object must have exactly
    two keys: "entity" (the element of >>>>>EntityList<<<<<) and "category"
    (one of >>>>>Categories<<<<<).


    >>>>>TEXT<<<<<
    {text}

    >>>>>Categories<<<<<
    {categories}

    >>>>>EntityList<<<<<
    {entities}
    """

    answer = get_answer(categorization_prompt, 30)
    if not answer:
        # get_answer signals a timeout with an empty list, which json.loads
        # cannot parse; treat it as "no entities categorised".
        return []

    return json.loads(answer)


# Relation Extraction

Chain of Thought - Prompt

In [55]:
def extract_relation_details(text, entities, relation_labels):
    """Ask the chat model for relations between ``entities`` found in ``text``.

    Args:
        text: The text section to analyse.
        entities: The entities (as passed by the caller) to relate.
        relation_labels: Allowed values for the relation label.

    Returns:
        The parsed JSON reply. The prompt asks for an array of objects with
        the keys ``"Subject"``, ``"Relation"``, ``"Object"``,
        ``"Description"`` and ``"Relevance"``. An empty list is returned when
        the request times out.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    relation_extraction_prompt = f"""
    
    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to extract a 
    Knowledge Graph from the UNDP dataset.
    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   
   Your task is to perform Relation Extraction on the given >>>>>TEXT<<<<< 
   to find relations between elements of provided >>>>>EntityList<<<<<.
   
   Please make sure to read these instructions and constraints carefully.

    [Instructions]
    1. Carefully read and store the >>>>>RelationLabels<<<<<.
    2. Scan the >>>>>TEXT<<<<< to find Named Entities from >>>>>EntityList<<<<< that are related.
    3. Scan the >>>>>RelationLabels<<<<< to select a suitable label to
    describe the relation between the above selected entities. Mark this label as "Relation".
    4. Assign "Subject" and "Object" to entities depending on the selected "Relation"
    selected in previous step to create a tuple.
    5. If available, select a small "Description" from the >>>>>TEXT<<<<< for the above relation.
    6. Assign a Relevance score between 1 to 10 to the extracted relation, with 10 being the most relevant.
    7. Repeat the process to extract remaining relations from >>>>>TEXT<<<<<.
    
    
    [Constraints]
    1. Values of 'Relation' key should belong to >>>>>RelationLabels<<<<<.
    
    [Output Format]
    Provide the result as a JSON array of objects, each with the keys
    "Subject", "Relation", "Object", "Description" and "Relevance".

    Perform relation extraction on the below:
    
    >>>>>TEXT<<<<<
    {text}

    >>>>>EntityList<<<<<
    {entities}

    >>>>>RelationLabels<<<<<
    {relation_labels}
    
"""

    answer = get_answer(relation_extraction_prompt, 60)
    if not answer:
        # get_answer signals a timeout with an empty list, which json.loads
        # cannot parse; treat it as "no relations found".
        return []

    return json.loads(answer)

In [88]:

## Modified relation for testing

def extract_relation_details_refined(text, entities, relation_labels):
    """Variant of the relation-extraction prompt that includes a worked example.

    Args:
        text: The text section to analyse.
        entities: The entities to relate.
        relation_labels: Allowed values for the relation label.

    Returns:
        The parsed JSON reply; the prompt asks for an array of objects shaped
        like the example (``Subject``, ``Relation``, ``Object``,
        ``Description``, ``Relevance``). An empty list is returned when the
        request times out.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # The text, entity list and labels are interpolated ONCE, in the labelled
    # sections at the end, and referred to by name elsewhere. Pasting the full
    # text into the middle of an instruction sentence made it unreadable and
    # sent the same input several times.
    relation_extraction_prompt = f"""
    
    [Context]
    You are part of the UNDP's Sustainable Energy Hub team, tasked with extracting a Knowledge Graph from a dataset.

    [Task]
    Perform Relation Extraction on the provided text to find relations between entities in the given entity list.

    [Instructions]
    1. Understand the relation labels listed under "Relation Labels" below.
    2. Identify relationships in the "Text" below between entities from the "Entity List" below.
    3. Use the relation labels to describe each identified relationship.
    4. Label entities as "Subject" and "Object" based on their roles in the relation.
    5. Provide a brief description from the text for each relation.
    6. Assign a relevance score (1-10) to each relation, with 10 being the most relevant.

    [Example]
    Given text: "Sustainability Framework was deployed by UNDP to support communities and organizations
    in achieving sustainability."
    Entities: ["Sustainability Framework", "UNDP"]
    Output: [{{"Subject": "UNDP", "Relation": "implements", "Object": "Sustainability Framework", 
    "Description": "Sustainability Framework was deployed by UNDP to support communities and organizations
    in achieving sustainability.", "Relevance": 8}}]

    [Constraints]
    1. Only use entities from the "Entity List".
    2. Relation labels must be from the "Relation Labels".

    [Output Format]
    Provide the result as a JSON array.
    
    Now, perform relation extraction on the following:
    
    Text:
    {text}

    Entity List:
    {entities}

    Relation Labels:
    {relation_labels}
    """
    answer = get_answer(relation_extraction_prompt, 60)
    if not answer:
        # get_answer signals a timeout with an empty list, which json.loads
        # cannot parse; treat it as "no relations found".
        return []

    return json.loads(answer)

In [56]:
# Reverse lookup table (full name -> acronym) so an entity's acronym can be
# found from its name. If two acronyms share a name, the later one wins.
acronyms_dict = dict(zip(acronyms.values(), acronyms.keys()))
print (acronyms_dict)

{'United States Agency for International Development': 'USAID', 'United States Government': 'USG', 'Research Triangle Institute': 'RTI', 'International Resources Group': 'IRG', 'Albanian Energy Regulatory Authority': 'AKBN', 'New public company, ALBGAZ, was established with the functions and responsibilities of a combined transmission and distribution system operator of natural gas.': 'ALBGAZ', 'Albanian Petroleum Corporation': 'Albpetrol', 'Central and South-Eastern European Gas Connectivity': 'CESEC', 'Decision of the Council of Ministers': 'DCM', 'European Bank for Reconstruction and Development': 'EBRD', 'Energy Community Treaty': 'ECT', 'Energy Efficiency': 'EE', 'Energy Efficiency Agency': 'EE Agency', 'Energy Efficiency Directive (2012/27/EU)': 'EED', 'Energy Efficiency Fund': 'EEF', 'Energy Community Secretariat': 'ECS', 'European Network of Transmission System Operators-electricity': 'ENTSO-e', 'European Network of Transmission System Operators-gas': 'ENTSO-g', 'International 

In [59]:
print (len(entity_objects))

11


In [57]:
entities_list = []
relations_list = []

In [58]:
seen_entities = set()

start_time = time.time()

for index, uncategorized_entities in enumerate(entity_objects):
    section_text = text_sections[index]
    entities_subset = categorize_entities(section_text, uncategorized_entities, categories)

    # Collect each entity once across all sections and attach its acronym.
    for item in entities_subset:
        # .get(): an object the model returned without an "entity" key is
        # skipped instead of aborting the whole run with a KeyError.
        entity_name = item.get("entity")
        if entity_name is None or entity_name in seen_entities:
            continue
        seen_entities.add(entity_name)

        # Add 'acronym' key to entity list
        if entity_name in acronyms_dict:
            item["acronym"] = acronyms_dict[entity_name]

        entities_list.append(item)

    print ("CATEGORIZED ENTITIES: \n")
    print (entities_subset)

    relations_subset = extract_relation_details(section_text, entities_subset, relation_labels)

    print ("\n EXTRACTED RELATIONS: \n")
    print (relations_subset)

    relations_list.extend(relations_subset)

    print ("\n-------------------")


end_time = time.time()
elapsed_time = end_time - start_time
# Report the number of sections this loop handled, not the document total.
print(f"TIME TAKEN TO EXTRACT RELATIONS FROM {len(entity_objects)} SECTIONS: {elapsed_time}")

CATEGORIZED ENTITIES: 

[{'entity': 'Council of Ministers', 'category': 'Organization', 'acronym': 'CoM'}, {'entity': 'Albania', 'category': 'Location'}, {'entity': 'WeBSEDFF', 'category': 'Organization'}, {'entity': 'EBRD', 'category': 'Organization'}]

 EXTRACTED RELATIONS: 

[{'Relation': 'focuses_on', 'Subject': 'Albania', 'Object': 'sustainable development of the economy', 'Description': 'Development of domestic energy sources, leading to a regional integrated and diversified energy system based on sustainable development of the economy', 'Relevance': 8}, {'Relation': 'addresses', 'Subject': 'Albania', 'Object': 'security and quality of supply', 'Description': 'Development of domestic energy sources, leading to a regional integrated and diversified energy system based on sustainable development of the economy', 'Relevance': 7}, {'Relation': 'addresses', 'Subject': 'Albania', 'Object': 'safety', 'Description': 'Development of domestic energy sources, leading to a regional integrate

CATEGORIZED ENTITIES: 

[{'entity': 'Government', 'category': 'Organization'}, {'entity': 'Universal Service', 'category': 'Strategy'}, {'entity': 'Albanian', 'category': 'Location'}, {'entity': 'Universal Service Provider', 'category': 'Organization'}]

 EXTRACTED RELATIONS: 

[{'Relation': 'aligns_with', 'Subject': 'Albanian legislation', 'Object': 'EU energy acquis', 'Description': 'based on independent assessments of Albania’s status, e.g. by the Energy Community Secretariat, and through discussions with sector experts in Albania.', 'Relevance': 8}, {'Relation': 'focuses_on', 'Subject': 'Albanian energy institutions and energy public companies', 'Object': 'capacity building and restructuring', 'Description': 'Particular attention was given to the need for capacity building and restructuring of Albanian energy institutions and energy public companies.', 'Relevance': 9}, {'Relation': 'in', 'Subject': 'Albania', 'Object': 'regional/EU energy market', 'Description': 'based on research 

CATEGORIZED ENTITIES: 

[{'entity': 'Albanian Electricity Sector', 'category': 'Organization'}, {'entity': 'Universal Service Supplier', 'category': 'Organization', 'acronym': 'USS'}, {'entity': 'Government', 'category': 'Organization'}, {'entity': 'Electricity Sector', 'category': 'Organization'}, {'entity': 'ERE', 'category': 'Organization'}]

 EXTRACTED RELATIONS: 

[{'Relation': 'addresses', 'Subject': 'Albanian Electricity Sector', 'Object': 'development of the Albanian energy sector', 'Description': 'Many other reports prepared from the Donor Community (USAID, Energy Secretariat, UNDP, WB, IFC, EU, EBRD, KfW, UNIDO, GIZ, SECO, ADA) related to development of the Albanian energy sector have checked for specific data and reviewed for their analysis.', 'Relevance': 8}, {'Relation': 'addresses', 'Subject': 'Albanian Electricity Sector', 'Object': 'integrated national energy and climate plans', 'Description': 'Draft Energy Community Recommendation on preparing for the development of in

In [92]:
print(len(entities_list))
print (len(relations_list))

31
94


In [95]:
## Set of every known name: the categorized entities plus all acronyms and
## their expansions. Used to check relation subjects and objects against.
entity_names = (
    {item['entity'] for item in entities_list}
    | set(acronyms.keys())
    | set(acronyms.values())
)

In [96]:
print (len(acronyms))

13


In [97]:
print (len(entity_names))

46


In [None]:
# Debug listing: print each relation's 'Subject' followed by its position in
# relations_list (j is a manual 0-based counter).
j = 0
for i in relations_list:
    print (i['Subject'])
    print (j)
    j = j+1
    

In [105]:
# Keep only relations whose subject AND object are known entity names.
# .get() is used so a relation the model returned without a 'Subject' or
# 'Object' key is dropped instead of raising KeyError.
final_relations = [
    relation
    for relation in relations_list
    if relation.get('Subject') in entity_names and relation.get('Object') in entity_names
]

In [106]:
print (len(final_relations))

32


In [107]:
print (final_relations)

[{'Relation': 'partners_with', 'Subject': 'UNDP', 'Object': 'United Nations Population Fund', 'Description': 'The UNDP partners with the United Nations Population Fund.', 'Relevance': 8}, {'Relation': 'in', 'Subject': 'UNDP', 'Object': 'Albania', 'Description': 'The UNDP operates in Albania.', 'Relevance': 7}, {'Relation': 'focuses_on', 'Subject': 'UNDP', 'Object': 'Gender Development Index', 'Description': 'The UNDP focuses on the Gender Development Index.', 'Relevance': 9}, {'Relation': 'focuses_on', 'Subject': 'UNDP', 'Object': 'Gender Inequality Index', 'Description': 'The UNDP focuses on the Gender Inequality Index.', 'Relevance': 9}, {'Relation': 'administers', 'Subject': 'UNDP', 'Object': 'United Nations Office for Project Services', 'Description': 'The UNDP administers the United Nations Office for Project Services.', 'Relevance': 8}, {'Relation': 'implements', 'Subject': 'UNDP', 'Object': 'European Union', 'Description': 'The UNDP implements projects for the European Union.', 

# Connecting with DBpedia

In [112]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Define the DBpedia SPARQL endpoint
sparql_endpoint = "http://dbpedia.org/sparql"

# Create a SPARQLWrapper instance
sparql = SPARQLWrapper(sparql_endpoint)

# Function to search for an entity by label and return its DBpedia URI
def search_entity(label):
    """Look up the resource whose English ``rdfs:label`` equals ``label``.

    Args:
        label: Exact label text to search for.

    Returns:
        The URI of the first matching resource, or ``None`` when the query
        returns no binding.
    """
    # The label comes from extracted text, so escape the characters that
    # would terminate or corrupt the double-quoted SPARQL string literal.
    # Backslash must be escaped first.
    escaped_label = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    query = f"""
    SELECT ?entity
    WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
    }}
    LIMIT 1
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()

    if "results" in results and "bindings" in results["results"] and results["results"]["bindings"]:
        return results["results"]["bindings"][0]["entity"]["value"]
    return None

# Function to retrieve and return the abstract or comment of an entity
def retrieve_entity_summary(entity_uri):
    """Return an English description for ``entity_uri``.

    The ``dbo:abstract`` of the entity is tried first; if that query yields no
    rows, ``rdfs:comment`` is tried. The value of the first row returned by
    whichever query succeeds is the result; ``None`` means neither produced a row.
    """
    # (binding name, query text) pairs, in order of preference
    lookups = (
        ("abstract", f"""
    SELECT ?abstract
    WHERE {{
      <{entity_uri}> dbo:abstract ?abstract.
      FILTER (LANGMATCHES(LANG(?abstract), "en"))
    }}
    """),
        ("comment", f"""
    SELECT ?comment
    WHERE {{
      <{entity_uri}> rdfs:comment ?comment.
      FILTER (LANGMATCHES(LANG(?comment), "en"))
    }}
    """),
    )

    for field, query_text in lookups:
        sparql.setQuery(query_text)
        sparql.setReturnFormat(JSON)
        payload = sparql.query().convert()

        if "results" not in payload or "bindings" not in payload["results"]:
            continue

        # Only the first row matters; fall through to the next lookup when empty
        for row in payload["results"]["bindings"]:
            return row[field]["value"]

    return None



In [113]:
import urllib.error

def dbpedia_summary(search_label):
    """Look up ``search_label`` on DBpedia and return its summary text.

    Args:
        search_label: Label passed to ``search_entity``.

    Returns:
        The summary returned by ``retrieve_entity_summary``, or ``None`` when no
        entity matches, no abstract/comment exists, or the endpoint cannot be
        reached. Progress and failures are reported with ``print``.
    """
    # The entity lookup is a network call too, so it is guarded the same way as
    # the summary retrieval: one unreachable request must not abort a whole batch.
    try:
        entity_uri = search_entity(search_label)
    except urllib.error.URLError as e:
        print(f"Error: {e}")
        return None

    if not entity_uri:
        print(f"No entity found with the label: {search_label}")
        return None

    print(f"Entity found with DBpedia URI: {entity_uri}")
    try:
        summary = retrieve_entity_summary(entity_uri)
    except urllib.error.URLError as e:
        print(f"Error: {e}")
        return None

    if summary:
        return summary

    print("No abstract or comment found for this entity.")
    return None


In [114]:
def extract_summaries(entities):
    """Attach a DBpedia summary to every entity record.

    Each dict in ``entities`` is updated in place: its ``'summary'`` key is set
    to whatever ``dbpedia_summary`` returns for the record's ``'entity'`` value.
    The list object that was passed in is returned.
    """
    for record in entities:
        record['summary'] = dbpedia_summary(record['entity'])
    return entities


In [115]:
# NOTE: extract_summaries adds a 'summary' key to each dict of entities_list in place
# and returns that same list, so final_entities and entities_list are one object.
final_entities = extract_summaries(entities_list)

Entity found with DBpedia URI: http://dbpedia.org/property/undp
No abstract or comment found for this entity.
Entity found with DBpedia URI: http://dbpedia.org/resource/United_Nations_Population_Fund
Entity found with DBpedia URI: http://dbpedia.org/resource/Gender_Development_Index
Entity found with DBpedia URI: http://dbpedia.org/resource/New_York
No abstract or comment found for this entity.
Entity found with DBpedia URI: http://dbpedia.org/resource/Gender_Inequality_Index
Entity found with DBpedia URI: http://dbpedia.org/resource/English
No abstract or comment found for this entity.
Entity found with DBpedia URI: http://dbpedia.org/resource/Albania
Entity found with DBpedia URI: http://dbpedia.org/resource/Category:European_Union
No abstract or comment found for this entity.
Entity found with DBpedia URI: http://dbpedia.org/resource/United_Nations_Office_for_Project_Services
Entity found with DBpedia URI: http://dbpedia.org/resource/Category:United_Nations_Development_Programme
No 

In [116]:
print (final_entities)

[{'entity': 'UNDP', 'category': 'Organization', 'summary': None}, {'entity': 'United Nations Population Fund', 'category': 'Organization', 'acronym': 'UNFPA', 'summary': 'The United Nations Population Fund (UNFPA), formerly the United Nations Fund for Population Activities, is a UN agency aimed at improving reproductive and maternal health worldwide. Its work includes developing national healthcare strategies and protocols, increasing access to birth control, and leading campaigns against child marriage, gender-based violence, obstetric fistula, and female genital mutilation. The UNFPA supports programs in more than 144 countries across four geographic regions: Arab States and Europe, Asia and the Pacific, Latin America and the Caribbean, and sub-Saharan Africa. Around three-quarters of the staff work in the field. It is a founding member of the United Nations Development Group, a collection of UN agencies and programmes focused on fulfilling the Sustainable Development Goals.'}, {'ent

# Write the output to files

In [121]:
# Serialise the filtered relations and the entities (with their summaries) as
# indented JSON strings; these strings are what the next cells write to disk.
json_relations = json.dumps(final_relations, indent=2)
json_entities = json.dumps(final_entities, indent=2)

In [122]:
print(len(json_relations))

8640


In [123]:
# Write the entities JSON to Entities/<File Name>.json.
# The directory is created on demand so a fresh checkout does not fail, and the
# `with` block closes the file itself (no explicit close() needed).
os.makedirs('Entities', exist_ok=True)
with open('Entities/' + metadata['File Name'] + '.json', "w", encoding="utf-8") as output_file:
    output_file.write(json_entities)

In [124]:
# Write the relations JSON to Relations/<File Name>.json.
# The directory is created on demand so a fresh checkout does not fail, and the
# `with` block closes the file itself (no explicit close() needed).
os.makedirs('Relations', exist_ok=True)
with open('Relations/' + metadata['File Name'] + '.json', "w", encoding="utf-8") as output_file:
    output_file.write(json_relations)

# Add Relations to Spreadsheet for Review

In [132]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Use the credentials from the service account key JSON file you downloaded
# NOTE(review): the key-file name and the spreadsheet URL are hard-coded here;
# consider moving them to configuration / environment variables.
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name('energy-moonshot-ai-97aa9045e45f.json', scope)
client = gspread.authorize(creds)

# Open the Google Sheet by its title or URL
spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1yZ-XQQs52kaI5k9MjvV_CdbgWQi-GazjHHGqQUF8gko/edit')


# Enter relations in the first sheet
sheet = spreadsheet.get_worksheet(0)

# Rows are inserted starting at sheet row 5
start_row_index = 5

# Check if there's valid data to insert
if final_relations:
    # One inner list per sheet row. The first column is a 1-based running
    # number: enumerate(start=1) replaces the previous `index = 1` /
    # `index = index + 1` pair, which enumerate() silently overrode so that
    # numbering actually began at 0.
    batch_relations = []
    for index, row_data in enumerate(final_relations, start=1):
        row = [index, row_data['Subject'], row_data['Relation'], row_data.get('Object', ''),
               row_data.get('Description', ''), row_data.get('Relevance', '')]
        batch_relations.append(row)

    # Insert the data into the Google Sheet starting from row 5
    sheet.insert_rows(batch_relations, start_row_index)

    print(f"{len(final_relations)} entries added to Google Sheet.")
else:
    print("No data to insert.")
    
    
# Enter entities in the second sheet
sheet = spreadsheet.get_worksheet(1)


# Rows are inserted starting at sheet row 5
start_row_index = 5

if final_entities:
    # One inner list per sheet row. The first column is a 1-based running
    # number: enumerate(start=1) replaces the previous `index = 1` /
    # `index = index + 1` pair, which enumerate() silently overrode so that
    # numbering actually began at 0.
    batch_entities = []
    for index, row_data in enumerate(final_entities, start=1):
        row = [index, row_data['entity'], row_data['category'], row_data.get('acronym', ''), row_data.get('summary', '')]
        batch_entities.append(row)

    sheet.insert_rows(batch_entities, start_row_index)

32 entries added to Google Sheet.


# Creating Graph in Neo4j

In [133]:
from py2neo import Graph, Node, Relationship
# Connect to the Neo4j instance over Bolt on localhost as user 'neo4j'.
# NOTE(review): NEO4JPASS is not defined in this part of the notebook — confirm it is
# set in an earlier cell (ideally read from the environment, like the OpenAI settings).
graph = Graph(uri = 'bolt://localhost:7687',user='neo4j',password=NEO4JPASS)

In [134]:
class Document:
    """Plain container bundling one document's metadata, entities and relations."""

    def __init__(self, metadata, entities, relations):
        # Stored exactly as given; no copying or validation is performed.
        self.metadata, self.entities, self.relations = metadata, entities, relations

In [135]:
# Bundle the document metadata with the summarised entities and the filtered
# relations; this object is what insert_relations_neo4j receives.
document = Document(metadata, final_entities, final_relations)

In [136]:
# Define a function to create or retrieve a node
def get_or_create_node(label, key, value):
    """Return the node with ``label`` whose property ``key`` equals ``value``.

    The node is looked up with ``get_node``; when none exists, a new ``Node`` is
    built with that single property, created in ``graph`` and returned.

    Args:
        label: Node label to match / assign.
        key: Property name used for the lookup.
        value: Property value used for the lookup.

    Returns:
        The existing node, or the newly created one.
    """
    existing_node = get_node(label, key, value)

    # Compare against None explicitly rather than relying on truthiness.
    # NOTE(review): py2neo nodes are mapping-like, so their truth value may depend
    # on their properties rather than on existence — confirm against the py2neo docs.
    if existing_node is not None:
        return existing_node

    new_node = Node(label, **{key: value})
    graph.create(new_node)
    return new_node

In [137]:
def get_node(label, key, value):
    """Return the first node in ``graph`` with ``label`` whose ``key`` property equals ``value``.

    The result is whatever ``graph.nodes.match(...).first()`` yields.
    """
    criteria = {key: value}
    return graph.nodes.match(label, **criteria).first()

In [149]:
# Define a function to insert relations 
def insert_relations_neo4j(document):
    """Write a document, its entities and their relations to the Neo4j ``graph``.

    Steps:
      1. Get or create a ``Document`` node named after
         ``document.metadata['Document Title']`` and copy every metadata entry
         onto it as a property.
      2. For each entity, get or create an ``Entity`` node, set ``category`` (and
         ``acronym`` / ``summary`` when present) and link it to the document node
         with a ``parent_document`` relationship.
      3. For each relation, get or create the subject and object ``Entity`` nodes
         and create a relationship of type ``item["Relation"]`` between them,
         carrying ``Description`` when present.

    Args:
        document: Object exposing ``metadata`` (dict), ``entities`` (list of
            dicts) and ``relations`` (list of dicts).
    """
    document_node = get_or_create_node("Document", "name", document.metadata['Document Title'])

    # Read the metadata from the document that was passed in, not from the
    # notebook-level `metadata` variable, so the function works for any document.
    for key, value in document.metadata.items():
        document_node[key] = value

    graph.push(document_node)

    for item in document.entities:
        node = get_or_create_node("Entity", "name", item["entity"])
        node['category'] = item["category"]
        if "acronym" in item:
            node['acronym'] = item["acronym"]
        if "summary" in item:
            node['summary'] = item["summary"]
        graph.push(node)
        graph.create(Relationship(node, "parent_document", document_node))

    for item in document.relations:
        subject = get_or_create_node("Entity", "name", item["Subject"])
        obj = get_or_create_node("Entity", "name", item["Object"])
        relation = Relationship(subject, item["Relation"], obj)
        if 'Description' in item:
            relation["Description"] = item["Description"]

        # Merge nodes and create relationships
        # NOTE(review): subject/obj were already fetched or created as "Entity"
        # nodes above; confirm whether merging them under the "Subject"/"Object"
        # labels is intended or redundant.
        graph.merge(subject, "Subject", "name")
        graph.merge(obj, "Object", "name")
        graph.create(relation)

        # Link the nodes to the project node
        #graph.create(Relationship(subject, "Belongs To", document_node))
        #graph.create(Relationship(obj, "Belongs To", document_node))
        
    

       

In [150]:
# Define a function to insert summaries 
def insert_summary_neo4j(data):
    """Store a summary on the existing ``Entity`` node of every item in ``data``.

    Args:
        data: Iterable of objects exposing ``name`` and ``summary`` attributes.

    Items whose entity has no node in the graph are reported and skipped rather
    than failing on a ``None`` node. The property is written as ``summary``, the
    same key ``insert_relations_neo4j`` uses, so a node never ends up with both
    ``Summary`` and ``summary``.
    """
    for item in data:
        node = get_node("Entity", "name", item.name)
        if node is None:
            print(f"No Entity node found with the name: {item.name}")
            continue
        node["summary"] = item.summary
        graph.push(node)

In [151]:
# Write the document node, its entities and their relations to Neo4j.
insert_relations_neo4j(document)
# NOTE(review): the call below is disabled. No module-level summary_list is visible in
# this part of the notebook, and insert_relations_neo4j already stores each entity's summary.
#insert_summary_neo4j(summary_list)

# Delete the checkpoint file 

In [152]:
#os.remove("unwanted-file.txt")

FileNotFoundError: [Errno 2] No such file or directory: 'unwanted-file.txt'