In [None]:
import openai
import os
import json
import requests

import numpy as np
import torch, torchvision
from transformers import AutoTokenizer, TFAutoModelForTokenClassification
from transformers import pipeline
from span_marker import SpanMarkerModel
from langdetect import detect
from langdetect import LangDetectException
import re
import pickle
import time
from langchain.text_splitter import SpacyTextSplitter

#nltk.download('punkt')


In [None]:
from dotenv import load_dotenv

load_dotenv()

Initialize Entity Categories and Relation Labels

In [None]:
categories = [
    
    "Person",
    "Location",
    "Organization",
    "Event",
    "Product",
    "Project",
    "Skill",
    "Strategy"
]


In [None]:
relation_labels = [
    "implements",
    "funds",
    "focuses_on",
    "in",
    "partners_with",
    "contributes_to",
    "monitors",
    "targets",
    "addresses",
    "employs",
    "collaborates_with",
    "supports",
    "administers",
    "measures",
    "aligns_with",
    "an_instance_of"
]

# Setting up OpenAI connection

In [None]:
# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai_deployment = "sdgi-gpt-35-turbo-16k"

neo4j_pass = os.getenv("NEO4JPASS")
#openai.api_key = os.getenv("OPENAI_KEY")


In [None]:

def get_answer(user_question, timeout_seconds):
    messages = [
        {'role': 'user', 'content': user_question},
    ]
    try:
        response = openai.ChatCompletion.create(
            engine="sdgi-gpt-35-turbo-16k", 
            messages=messages,
            temperature=0.2,
            request_timeout = timeout_seconds
            # max_tokens=2000
        )
        return response.choices[0].message["content"]
    except requests.Timeout:
        print(f"Request timed out")
        return []
   

# Entity Extraction using Transformers 


In [None]:
WIKI_API = "https://api-inference.huggingface.co/models/Babelscape/wikineural-multilingual-ner"
BERT_API = "https://api-inference.huggingface.co/models/dslim/bert-base-NER"

headers = {"Authorization": "Bearer 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"}

def query_wiki(payload):
	response = requests.post(WIKI_API, headers=headers, json=payload)
	return response.json()
	

def query_bert(payload):
	response = requests.post(BERT_API, headers=headers, json=payload)
	return response.json()


In [None]:
def query_gpt(text):
    
    entities_prompt = f"""

    You will be given a >>>>>TEXT<<<<<. You have two tasks:
    
    1. Your first task is to detect acronyms with their names and store them in python dictionary.
    2. Your second task is to detect Proper Nouns in the text and store them in python list.
    
    Return a JSON array contaning dictionary and the list.

    >>>>>TEXT<<<<<
    {text}

    
    """
    #start_time = time.time()

    result = get_answer(entities_prompt, 10)
    result = json.loads(result)
    
    #end_time = time.time()
    #elapsed_time = end_time - start_time
    #print (f"TIME TAKEN TO EXECUTE PROMPT: {elapsed_time}")
    return result
    


# Text Pre-Processing 

In [None]:
def split_text_spacy(chunk_size, text):
    
    text_splitter = SpacyTextSplitter(chunk_size=chunk_size)
    sections = text_splitter.split_text(text)
    
    return sections

In [None]:
def get_text_section(limit, text):
    sections_list = []
    length = len(text)
    i = 0

    while i < length - 1:
        j = i + limit

        if j >= length:
            j = length - 1
        elif text[j] not in ('.', '\n', ';'):
            while text[j] not in ('.', '\n', ';'):
                j -= 1
            j += 1

        section = text[i:j]

        if is_valid_section(section):
            sections_list.append(section)
        else: 
            print("INVALID SECTION DETECTED")
            print(section)
            #section_list[-1].extend(section)
        i = j
    
    
    return sections_list

def is_valid_section(section):
    return section and len(section) > 20



In [None]:
def clean_text(input_text):
    # Remove lines with only whitespace
    input_text = re.sub(r'^\s*$', '', input_text, flags=re.MULTILINE)

    # Remove lines containing only uppercase text (potential headings)
    input_text = re.sub(r'^\s*[A-Z\s]+\s*$', '', input_text, flags=re.MULTILINE)

    # Remove lines with multiple consecutive uppercase words (potential headings)
    input_text = re.sub(r'^\s*(?:[A-Z]+\s*){2,}\s*$', '', input_text, flags=re.MULTILINE)
    
    input_text = re.sub(r'^\s*[A-Za-z\s]+\.{3,}\s*\d+\s*$', '', input_text, flags=re.MULTILINE)

    return input_text

def is_english(line):
    try:
        return detect(line) == 'en'
    except LangDetectException as e:
        print(f"An exception occurred: {e} : {line}")
        return False

# Entities Post-Processing Methods

In [None]:
# merging the broken entities
def create_entities(lst):
    i = 1
    while i < len(lst):
        if lst[i]["word"].startswith('##'):
            lst[i]["word"] = lst[i-1]["word"] + lst[i]["word"][2:]
            lst[i]["score"] = max(lst[i-1]["score"] , lst[i]["score"])
            del lst[i-1]
        else:
            i += 1
            # todo: return a list of merged entities
            


def apply_threshold(list_, threshold):
    words_list = []
    for item in list_:
        if item['score'] > threshold:  # threshold score to eliminate unimportant entities
            words_list.append(item['word'])
    return words_list


In [None]:
def get_raw(list_):
    output = []
    for sublist in list_:
        new = []
        obj = {}
        for item in sublist:
            #obj = {}
            key = ''.join(filter(str.isalpha, item))
            obj[key]= item
            #obj['raw']= ''.join(filter(str.isalpha, item))
        output.append(obj)
    return output

In [None]:
def merge_extracted_entities_old(wiki, bert, gpt):
    
    output = []
    dict_ = gpt
    
    
    #print(dict_.items())
    wiki_set = set(wiki.keys())
    #bert_set = set(bert.keys())
    gpt_set = set(gpt.keys())
    
    
    #A = gpt_set.intersection(bert_set)
    #B = bert_set.intersection(wiki_set)
    C = wiki_set.intersection(gpt_set)
 
    #matched = list(A.union(B).union(C))
    
    
    for i in C:
        output.append(dict_[i])
        
    return output

In [None]:
def merge_extracted_entities(wiki, bert, gpt):
    
    output = set(wiki.values())
    dict_ = gpt
    
    bert_set = set(bert.keys()) - set(wiki.keys())
    gpt_set = set(gpt.keys())
    
    A = gpt_set.intersection(bert_set)

    matched = list(set(A))
    print ("GPT/BERT: " + str(matched))

    for i in matched:
        output.add(dict_[i])
        
    return output

In [None]:
def validate_entities(list_):
    
    # Define a regular expression pattern to match invalid characters.
    pattern = r'\s*{}\s*'.format(re.escape("’"))
    pattern1 = r'\s*{}\s*'.format(re.escape("/"))
    output_list = []

    for item in list_:
        item = re.sub(pattern, "’", item)
        tem = re.sub(pattern1, "/", item)
            
    return output_list



In [None]:
def save_checkpoint(index, wiki, bert, gpt, acronym):
    checkpoint = {'index': index, 'wiki': wiki, 'bert': bert, 'gpt': gpt, 'acronym': acronym}
    with open('checkpoint.pkl', 'wb') as checkpoint_file:
        pickle.dump(checkpoint, checkpoint_file)

# Function to load the state
def load_checkpoint(length):
    try:
        with open('checkpoint.pkl', 'rb') as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)
            return checkpoint['index'], checkpoint['wiki'], checkpoint['bert'], 
        checkpoint['gpt'], checkpoint['acronym']
    except FileNotFoundError:
        return 0, [''] * length, [''] * length, [''] * length, {}



#  Categorize entities

In [None]:
def categorize_entities(text, entities, categories):
    
    
    categorization_prompt = f"""

    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and >>>>>Categories<<<<<. 
    Your task is to assign a sutiable category to each element of >>>>>EntityList<<<<<.
    Do not add any new entities.
    Return a list of JSON objects of categorized entities. 


    >>>>>TEXT<<<<<
    {text}

    >>>>>Categories<<<<<
    {categories}

    >>>>>EntityList<<<<<
    {entities}
    """

    categorized_entities = get_answer(categorization_prompt, 30)
    categorized_entities = json.loads(categorized_entities)
    
    return (categorized_entities)


# Relation Extraction

In [None]:
def extract_relation_details(text, entities, relation_labels):
    relation_extraction_prompt = f"""
    
    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to extract a 
    Knowledge Graph from the UNDP dataset.
    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   
   Your task is to perform Relation Extraction on the given >>>>>TEXT<<<<< 
   to find relations between elements of provided >>>>>EntityList<<<<<.
   
   Please make sure to read these instructions and constraints carefully.

    [Instructions]
    1. Carefully read and store the >>>>>RelationLabels<<<<<.
    2. Scan the >>>>>TEXT<<<<< to find Named Entites from >>>>>EntityList<<<<< that are related.
    3. Scan the >>>>>RelationLabels<<<<< to select a suitable label to
    describe the relation between the above selected entities. Mark this label as "Relation".
    4. Assign "Subject" and "Object" to entities depending on the selected "Relation"
    selected in previous step to create a tuple.
    5. If available, select a small "Description" from the >>>>>TEXT<<<<< for the above relation.
    6. Assign a Relevance score between 1 to 10 to the extracted relation, with 10 being the most relevant.
    7. Repeat the process to extract remaining relations from >>>>>TEXT<<<<<.
    
    
    [Constraints]
    1. Values of 'Relation' key should belong to >>>>>RelationLabels<<<<<.
    
    [Output Format]
    Provide the result as a JSON array.

    Perform relation extraction on the below:
    
    >>>>>TEXT<<<<<
    {text}

    >>>>>EntityList<<<<<
    {entities}

    >>>>>RelationLabels<<<<<
    {relation_labels}
    
"""

    relations = get_answer(relation_extraction_prompt,60)
    relations = json.loads(relations)

    return relations

In [None]:
def extract_ontology_relations(text, entities, ontology):
    relation_extraction_prompt = f"""
    
    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to extract a 
    Knowledge Graph from the UNDP dataset.
    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and an >>>>>ONTOLOGY<<<<<.

   [Task]
   
   Your task is to perform Relation Extraction on the given >>>>>TEXT<<<<< 
   to find relations between elements of provided >>>>>EntityList<<<<<. Use the given >>>>>ONTOLOGY<<<<<
   for this purpose.
   
   Please make sure to read these instructions and constraints carefully.

    [Instructions]
    1. Carefully read and understand the >>>>>ONTOLOGY<<<<<.
    2. Scan the >>>>>TEXT<<<<< to find Named Entites in >>>>>EntityList<<<<< that are related.
    3. Read the >>>>>ONTOLOGY<<<<< to select a relationship type for the related entities. Mark this label as "Relation".
    4. Assign "Subject" and "Object" to entities depending on the "Relation"
    selected in previous step to create a tuple.
    5. If available, select a small "Description" from the >>>>>TEXT<<<<< for the above relation.
    6. Assign a Relevance score between 1 to 10 to the extracted relation, with 10 being the most relevant.
    7. Repeat the process to extract remaining relations from >>>>>TEXT<<<<<.
    
    
    [Constraints]
    1. Values of 'Relation' key should be a label from properties in >>>>>ONTOLOGY<<<<<.
    
    [Output Format]
    Provide the result as a JSON array.

    Perform relation extraction on the below:
    
    >>>>>TEXT<<<<<
    {text}

    >>>>>EntityList<<<<<
    {entities}

    >>>>>ONTOLOGY<<<<<
    {ontology}
    
"""

    relations = get_answer(relation_extraction_prompt,60)
    relations = json.loads(relations)

    return relations

In [None]:
with open("ontology.ttl") as f:
    ontology = f.read()
    
    f.close()

# Connecting with DBpedia

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Define the DBpedia SPARQL endpoint
sparql_endpoint = "http://dbpedia.org/sparql"

# Create a SPARQLWrapper instance
sparql = SPARQLWrapper(sparql_endpoint)

# Function to search for an entity by label and return its DBpedia URI
def search_entity(label):
    query = f"""
    SELECT ?entity
    WHERE {{
      ?entity rdfs:label "{label}"@en.
    }}
    LIMIT 1
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()

    if "results" in results and "bindings" in results["results"] and results["results"]["bindings"]:
        entity_uri = results["results"]["bindings"][0]["entity"]["value"]
        return entity_uri
    else:
        return None

# Function to retrieve and return the abstract or comment of an entity
def retrieve_entity_summary(entity_uri):
    # Try to retrieve the abstract
    abstract_query = f"""
    SELECT ?abstract
    WHERE {{
      <{entity_uri}> dbo:abstract ?abstract.
      FILTER (LANGMATCHES(LANG(?abstract), "en"))
    }}
    """

    sparql.setQuery(abstract_query)
    sparql.setReturnFormat(JSON)

    abstract_results = sparql.query().convert()

    if "results" in abstract_results and "bindings" in abstract_results["results"]:
        for result in abstract_results["results"]["bindings"]:
            abstract = result["abstract"]["value"]
            return abstract

    # If abstract is not found, try to retrieve the comment
    comment_query = f"""
    SELECT ?comment
    WHERE {{
      <{entity_uri}> rdfs:comment ?comment.
      FILTER (LANGMATCHES(LANG(?comment), "en"))
    }}
    """

    sparql.setQuery(comment_query)
    sparql.setReturnFormat(JSON)

    comment_results = sparql.query().convert()

    if "results" in comment_results and "bindings" in comment_results["results"]:
        for result in comment_results["results"]["bindings"]:
            comment = result["comment"]["value"]
            return comment

    # If neither abstract nor comment is found, return None
    return None



In [None]:
import urllib.error

def dbpedia_summary(search_label):
    entity_uri = search_entity(search_label)

    if entity_uri:
        print(f"Entity found with DBpedia URI: {entity_uri}")
        try:
            summary = retrieve_entity_summary(entity_uri)
            if summary:
                return summary
            else:
                print("No abstract or comment found for this entity.")
        except urllib.error.URLError as e:
            print(f"Error: {e}")
    else:
        print(f"No entity found with the label: {search_label}")


In [None]:
def extract_summaries(entities):
    
    updated_entities = []

    for item in entities:
        try:
            summary = dbpedia_summary(item['entity'])
            if summary:
                # Only add the summary if it's not None or empty
                item['summary'] = summary
            updated_entities.append(item)  # Add the item regardless of summary presence
        except Exception as e:
            print(f"Error: {e}")
            continue
    
    return updated_entities

# Creating Graph in Neo4j

In [None]:
from py2neo import Graph, Node, Relationship
graph = Graph(uri = 'bolt://localhost:7687',user='neo4j',password=neo4j_pass)

class Document:
    def __init__(self, metadata, entities, relations):
        self.metadata = metadata
        self.entities = entities
        self.relations = relations

# Define a function to create or retrieve a node
def get_or_create_node(label, key, value):
    # Attempt to find an existing node with the given label and key
    existing_node = get_node(label, key, value)
    if not existing_node:
        existing_node = get_node(label, 'acronym', value)
    
    if existing_node:
        return existing_node
    else:
        new_node = Node(label, **{key: value})
        graph.create(new_node)
        return new_node
    
def get_node(label, key, value):
    node = graph.nodes.match(label, **{key:value}).first()
    return node
def get_node_without_label(key, value):
    node = graph.nodes.match(**{key:value}).first()
    return node

# Define a function to insert relations 
def insert_relations_neo4j(entities, relations):
    #document_node = get_or_create_node("Document", "name", document.metadata['Document Title'] )
            
    #for key,value in metadata.items():
     #   document_node[key] = value
        
    #graph.push(document_node)
    
    for item in entities:
        
        node = get_or_create_node('Entity', "name", item["entity"])
        node['category'] = item["category"]
        if "acronym" in item:
            node['acronym'] = item["acronym"]
        if "summary" in item:
            node['summary'] = item["summary"]
        
        graph.push(node)
        #entity_rel = Relationship(node, 'parent_document', document_node)
        
        #if "information" in item:
         #   relation["description"] = item['information']
            
        #graph.create(relation)
        
    for item in relations:
        subject = get_or_create_node( "Entity", "name", item["Subject"])
        obj = get_or_create_node("Entity", "name", item["Object"])
        relation = Relationship(subject, item["Relation"], obj)
        if 'Description' in item:
            relation["description"] = item["Description"]
        
        # Merge nodes and create relationships
        #graph.merge(subject, "name")
        #graph.merge(obj, "name")
        graph.create(relation)
        
        # Link the nodes to the project node
        #graph.create(Relationship(subject, "Belongs To", document_node))
        #graph.create(Relationship(obj, "Belongs To", document_node))
        
    
# Define a function to insert summaries 
def insert_summary_neo4j(data):
    for item in data:
        node = get_node("Entity", "name", item.name)
        node["Summary"] = item.summary
        graph.push(node)
       

# Extraction Pipeline

In [835]:
folder_path = ('Data/NGA')
file_list = os.listdir(folder_path)

# Filter the list to include only text files (e.g., .txt files)
text_files = [file for file in file_list if file.endswith(".txt")]

print (f"Number of files: {len(text_files)}\n")  
print (text_files)

Number of files: 3

['NGA-NGP-2017-EN.txt', 'NGA-CPD-2023-EN.txt', 'NGA-NEPro-2022-EN.txt']


In [836]:
file_path = os.path.join(folder_path, text_files[2])

In [837]:
with open (file_path, 'r') as file:
    head = [next(file) for _ in range(11)]
    next(file)
    raw_text = file.read()
    
    file.close()

print (f"Original text length: {len(raw_text)}")  

Original text length: 8067


In [838]:
 # Open the file in read mode
with open(file_path, 'r') as file:
    
    pattern = re.compile(r'.*?\.{3}.*?$', re.MULTILINE)
    # Initialize an empty string to store the lines
    raw_text = ''
    
    head = [next(file) for _ in range(11)]
    next(file)
    
    # Iterate over each line in the file
    for line in file:
        # Append the current line to the string
        
        if not pattern.search(line) and is_english(line):
            raw_text += line
            
print(f"Read text length: {len(raw_text)}") 

text = clean_text(raw_text)
#text = raw_text

print(f"Cleaned text length: {len(text)}")


An exception occurred: No features in text. : 

An exception occurred: No features in text. : 15%

An exception occurred: No features in text. : 10%

An exception occurred: No features in text. : 0%

An exception occurred: No features in text. : 75%

An exception occurred: No features in text. : 0% 1%

An exception occurred: No features in text. : 99%

An exception occurred: No features in text. : 55%

An exception occurred: No features in text. : 15%

An exception occurred: No features in text. : 81%

An exception occurred: No features in text. : 0%

An exception occurred: No features in text. : 20%

An exception occurred: No features in text. : 40%

An exception occurred: No features in text. : 60%

An exception occurred: No features in text. : 80%

An exception occurred: No features in text. : 100%

An exception occurred: No features in text. : 2014 2015 2016 2017 2018 2019 2020

An exception occurred: No features in text. : 53.5

An exception occurred: No features in text. : 57.4 5

An exception occurred: No features in text. : 0%

An exception occurred: No features in text. : 100%

An exception occurred: No features in text. : 0%

An exception occurred: No features in text. : 0.0

An exception occurred: No features in text. :  5

An exception occurred: No features in text. :  10

An exception occurred: No features in text. :  15

An exception occurred: No features in text. :  20

An exception occurred: No features in text. :  25

An exception occurred: No features in text. : 2015 2016 2017 2018 2019 2020

An exception occurred: No features in text. : -48%

An exception occurred: No features in text. : 419 439

An exception occurred: No features in text. : 0.0

An exception occurred: No features in text. :  100

An exception occurred: No features in text. :  200

An exception occurred: No features in text. :  300

An exception occurred: No features in text. :  400

An exception occurred: No features in text. :  500

An exception occurred: No features in text. :  6

In [839]:
metadata = {}

# Iterate through the data list
for item in head:
    # Split each element by ':' and strip the resulting strings
    key, value = item.split(':')
    key = key.strip()
    value = value.strip()
    
    # Add the key-value pair to the dictionary
    metadata[key] = value


if 'Exists?' in metadata:
    metadata.pop('Exists?')
print(metadata)


{'File Name': 'NGA-NEPro-2022-EN', 'Year': '2022', 'Country Name': 'Nigeria', 'Country Code': 'NGA', 'Category': 'NEPro', 'Document Title': 'nan', 'Publication Date': '24th August, 2022', 'Start Year': '2014', 'End Year': '2022', 'Language': 'EN'}


In [840]:
text_sections = split_text_spacy(2000, text)

In [841]:
print (f"The number of sections from the text: {len(text_sections)}")

The number of sections from the text: 4


In [842]:
text_length = len(text_sections)

In [843]:
# Load the last checkpoint
#start_index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms = load_checkpoint(text_length)

In [858]:
start_index = 0
wiki_entity_list = [''] * text_length
bert_entity_list = [''] * text_length
gpt_entity_list = [''] * text_length
acronyms = {}

In [859]:
# Continue from the last checkpoint

start_time = time.time()
for index in range(text_length):
    try:
        segment = text_sections[index]
        
        ## WIKINEURAL BILINGUAL MODEL
        wiki_output = query_wiki({
            "inputs": segment,
        })
        create_entities(wiki_output)
        wiki_words = list(set(apply_threshold(wiki_output, 0.7)))
        wiki_entity_list[index] = wiki_words
        print ("WIKI DONE")
        ## BERT BASE MODEL
        bert_output = query_bert({
            "inputs": segment,
        })
        create_entities(bert_output)
        bert_words = list(set(apply_threshold(bert_output, 0.7)))
        bert_entity_list[index] = bert_words
        print ("BERT DONE")


        ## GPT PROMPT
        gpt_output = query_gpt(segment)
        gpt_entity_list[index] = gpt_output['proper_nouns']

        print ("GPT DONE")

        ## Acronyms extraction
        acronyms.update(gpt_output['acronyms'])
    
        
        print(f"NUMBER OF PROCESSED SECTIONS: {index}")

        # Save checkpoint at intervals
        #if index % 5 == 0:
            #save_checkpoint(index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms)

    except Exception as e:
        print(f"Error processing section {index}: {str(e)}")
        #save_checkpoint(index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms)

        continue  # Exit the loop in case of an error

end_time = time.time()
elapsed_time = end_time - start_time
print(f"TIME TAKEN TO EXTRACT ENTITIES from {text_length} section: {elapsed_time}")

WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 0
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 1
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 2
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 3
TIME TAKEN TO EXTRACT ENTITIES from 4 section: 16.769766807556152


In [860]:
print (acronyms)

{'TES': 'Total Energy Supply', 'TJ': 'Terajoules', "USD'000s": 'United States Dollars', 'PPP': 'Purchasing Power Parity', 'GDP': 'Gross Domestic Product', 'WHO': 'World Health Organization', 'GW': 'Gigawatts', 'MW': 'Megawatts', 'TFEC': 'Total Final Energy Consumption', 'RE': 'Renewable Energy', 'CO2': 'Carbon Dioxide', 'kWh': 'Kilowatt-hours', 'NDC': 'Nationally Determined Contribution', 'FDNIS': 'Framework for the implementation of intervention facility for the national gas expansion programme', 'ECOSTAND': 'Minimum Energy Performance Standards', 'NGA': 'Nigeria', 'GWh': 'Gigawatt-hours', 'PV': 'Photovoltaic', 'MWh': 'Megawatt-hours', 'kWp': 'Kilowatt-peak', 'W/m2': 'Watts per square meter', 'NREL': 'National Renewable Energy Laboratory', 'NPP': 'Net primary production', 'IRENA': 'International Renewable Energy Agency', 'UN': 'United Nations', 'SDG': 'Sustainable Development Goal', 'IEA': 'International Energy Agency', 'UNSD': 'United Nations Statistics Division', 'COMTRADE': 'United

In [861]:
i = 0
while i < text_length:
    #print (text_sections[i])
    print (wiki_entity_list[i])
    print (bert_entity_list[i])
    print (gpt_entity_list[i])
    print ("--------")
    i = i+1

['Total Energy Supply']
['T']
['Total Energy Supply', 'Non-renewable', 'Renewable', 'Growth', 'Imports', 'Exports', 'Energy self-sufficiency', 'Coal', 'Renewables', 'Geothermal', 'Access to electricity', 'Access to clean cooking', 'Urban', 'Rural', 'GDP per capita', 'Real GDP growth rate', 'Public flows to renewables', 'Energy intensity', 'Per capita renewable capacity', 'Consumption by sector', 'Industry', 'Households', 'Other', 'Capacity change', 'Capacity utilisation', 'Renewable energy consumption', 'Net capacity change', 'Hydro and marine', 'Renewable capacity', 'Installed capacity trend', 'Electricity', 'Commercial heat', 'Bioenergy', 'Solar direct', 'Fossil fuels', 'Nuclear', 'Other', 'Hydro/marine', 'Wind', 'Solar', 'Avoided emissions', 'Calculated', 'Electricity generation', 'Energy-related CO2 emissions', 'Per capita electricity generation']
--------
['Nationally Determined Contribution', 'Paris Agreement : Nigeria Nigerian Economic Sustainability Plan Framework', 'Minimum En

Processing the Entities 

In [862]:
# Get raw version of entities for comparison
raw_wiki = get_raw(wiki_entity_list)
raw_bert = get_raw(bert_entity_list)
raw_gpt = get_raw(gpt_entity_list)

In [863]:
entity_objects = []
entity_filter = set()
merged = []
i = 0

while i < len(wiki_entity_list):
    merged = merge_extracted_entities_old(raw_wiki[i], raw_bert[i], raw_gpt[i])
    print (f"\nThe number of matching entities in section {i}: {len(merged)}\n")
    print (merged)
    
    print ("\n--------------")

    entity_objects.append(merged)
    entity_filter.update(merged)
    
    i = i+1


The number of matching entities in section 0: 1

['Total Energy Supply']

--------------

The number of matching entities in section 1: 1

['Nigeria']

--------------

The number of matching entities in section 2: 11

['IEA', 'IRENA', 'IRENA Global Atlas', 'WHO', 'United Arab Emirates', 'Global Wind Atlas', 'World Bank Global Solar Atlas', 'Harmonised System', 'World Bank', 'UN World Population Prospects', 'Masdar City']

--------------

The number of matching entities in section 3: 1

['IRENA']

--------------


In [864]:
len(entity_filter)

13

In [865]:
entity_objects

[['Total Energy Supply'],
 ['Nigeria'],
 ['IEA',
  'IRENA',
  'IRENA Global Atlas',
  'WHO',
  'United Arab Emirates',
  'Global Wind Atlas',
  'World Bank Global Solar Atlas',
  'Harmonised System',
  'World Bank',
  'UN World Population Prospects',
  'Masdar City'],
 ['IRENA']]

Chain of Thought - Prompt

In [866]:
# invert acronyms dict to ease look up
acronyms_dict = {v: k for k, v in acronyms.items()}
print (acronyms_dict)

{'Total Energy Supply': 'TES', 'Terajoules': 'TJ', 'United States Dollars': "USD'000s", 'Purchasing Power Parity': 'PPP', 'Gross Domestic Product': 'GDP', 'World Health Organization': 'WHO', 'Gigawatts': 'GW', 'Megawatts': 'MW', 'Total Final Energy Consumption': 'TFEC', 'Renewable Energy': 'RE', 'Carbon Dioxide': 'CO2', 'Kilowatt-hours': 'kWh', 'Nationally Determined Contribution': 'NDC', 'Framework for the implementation of intervention facility for the national gas expansion programme': 'FDNIS', 'Minimum Energy Performance Standards': 'ECOSTAND', 'Nigeria': 'NGA', 'Gigawatt-hours': 'GWh', 'Photovoltaic': 'PV', 'Megawatt-hours': 'MWh', 'Kilowatt-peak': 'kWp', 'Watts per square meter': 'W/m2', 'National Renewable Energy Laboratory': 'NREL', 'Net primary production': 'NPP', 'International Renewable Energy Agency': 'IRENA', 'United Nations': 'UN', 'Sustainable Development Goal': 'SDG', 'International Energy Agency': 'IEA', 'United Nations Statistics Division': 'UNSD', 'United Nations Com

In [867]:
entities_list = []
relations_list = []
entities_with_sections = []
relations_with_section = []
seen_entities = set()
seen_acronyms = set()

In [868]:

start_time = time.time()

for index, uncategorized_entities in enumerate(entity_objects):
    try:
        if len(uncategorized_entities) == 0:
            print ("-- Empty Entities--")
            print ("\n-------------------")

            continue
        entities_subset = categorize_entities(text_sections[index], uncategorized_entities, categories)
        #print(seen_acronyms)

        # Add 'acronym' key to entity list
        for item in entities_subset:
            if item["entity"] not in seen_entities and item["entity"] not in seen_acronyms:
                seen_entities.add(item["entity"])
                
                
                if item["entity"] in acronyms_dict.keys():
                    item["acronym"] = acronyms_dict[item["entity"]]
                    seen_acronyms.add(item['acronym'])
                    #print("SEEN ACRONYM" + str(item))
                    
                elif item['entity'] in acronyms.keys():
                    item["acronym"] = item['entity']
                    item["entity"] = acronyms[item["entity"]]
                    seen_acronyms.add(item['acronym'])
                    #print("SEEN ACRONYM" + str(item))
                    
                entities_list.append(item)

        print ("CATEGORIZED ENTITIES of Section: " + str(index) + "\n")
        print (entities_subset)
        #store the categorized entities in order of lists for later processing
        #entities_with_sections.append(entities_subset)

        relations_subset = extract_ontology_relations(text_sections[index], entity_objects[index], ontology)

        print ("\n EXTRACTED RELATIONS: \n")
        print (relations_subset)

        relations_list.extend(relations_subset)
        relations_with_section.append(relations_subset)

        print ("\n-------------------")

    except Exception as e:
            print(f"Error processing section {index}: {str(e)}")
            #save_checkpoint(index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms)

    continue  # Exit the loop in case of an error


end_time = time.time()
elapsed_time = end_time - start_time
print(f"TIME TAKEN TO EXTRACT RELATIONS FROM {text_length} SECTIONS: {elapsed_time}")

CATEGORIZED ENTITIES of Section: 0

[{'entity': 'Total Energy Supply', 'category': 'Product', 'acronym': 'TES'}]

 EXTRACTED RELATIONS: 

[{'Subject': 'Total Energy Supply', 'Relation': 'an instance of', 'Object': 'Total Energy Supply'}]

-------------------
CATEGORIZED ENTITIES of Section: 1

[{'entity': 'Nigeria', 'category': 'Location', 'acronym': 'NGA'}]

 EXTRACTED RELATIONS: 

[{'Subject': 'Nigeria', 'Relation': 'contributes to', 'Object': 'Paris Agreement', 'Description': "Nigeria's Nationally Determined Contribution (NDC) to the Paris Agreement"}, {'Subject': 'Nigeria', 'Relation': 'contributes to', 'Object': 'Economic Sustainability Plan', 'Description': "Nigeria's Nigerian Economic Sustainability Plan"}, {'Subject': 'Nigeria', 'Relation': 'contributes to', 'Object': 'intervention facility for the national gas expansion programme', 'Description': "Nigeria's Framework for the implementation of intervention facility for the national gas expansion programme"}, {'Subject': 'Nigeri

In [869]:
filtered_entities_list = []
for i in entities_list:
    if i['entity'] in entity_filter:
        filtered_entities_list.append(i)
        

In [870]:
print(len(entities_list))
print(len(filtered_entities_list))
print (len(relations_list))


13
10
42


In [871]:
entity_summaries = extract_summaries(filtered_entities_list)

No entity found with the label: Total Energy Supply
Entity found with DBpedia URI: http://dbpedia.org/resource/Category:Nigeria
No abstract or comment found for this entity.
No entity found with the label: IRENA Global Atlas
Entity found with DBpedia URI: http://dbpedia.org/resource/United_Arab_Emirates
Entity found with DBpedia URI: http://dbpedia.org/resource/Global_Wind_Atlas
No entity found with the label: World Bank Global Solar Atlas
Entity found with DBpedia URI: http://dbpedia.org/resource/Harmonised_System
No abstract or comment found for this entity.
Entity found with DBpedia URI: http://dbpedia.org/resource/Category:World_Bank
No abstract or comment found for this entity.
No entity found with the label: UN World Population Prospects
Entity found with DBpedia URI: http://dbpedia.org/resource/Masdar_City


In [872]:
len(entity_summaries)

10

In [873]:
## create a list of names, to check for valid relations

entity_names = set([item['entity'] for item in filtered_entities_list])
entity_names.update([item['acronym'] for item in filtered_entities_list if 'acronym' in item] )
#entity_names.update(acronyms.values())

In [874]:
print (len(entity_names))

13


In [875]:
entity_dic = {}

for i in entity_summaries:
    entity_dic[i['entity']] = i

In [876]:
final_entities = []
for i in entity_dic.values():
    final_entities.append(i)
    
json_entities = json.dumps(final_entities, indent=2)

In [877]:
with open('Entities/' + metadata['File Name']+ '.json', "w") as output_file:
    output_file.write(json_entities)
    output_file.close()

In [878]:
print(len(final_entities))


10


In [879]:
with open('Entities/' + metadata['File Name']+ '.json', "r") as f:
    data = f.read()
    ent = json.loads(data)
    f.close()

# Write the relations to files

In [880]:
final_relations = []

for i in relations_list:
    i['Relation'] = i['Relation'].replace(" ", "_")
    if 'Subject' in i and i['Subject'] in entity_names and i['Object'] in entity_names and i['Relation'] in relation_labels:
        final_relations.append(i)
    #elif 'Description' in i and 'Subject' in i and i['Subject'] in entity_dic.keys():
        #entity_dic[i['Subject']].update({'information':i['Description']})
    #elif 'Description' in i and 'Object' in i and i['Object'] in entity_dic.keys():
        #entity_dic[i['Object']].update({'information':i['Description']})
print (len(final_relations))

4


In [881]:
json_relations = json.dumps(final_relations, indent=2)

In [882]:
with open('Relations/' + metadata['File Name']+ '.json', "w") as output_file:
    output_file.write(json_relations)
    output_file.close()

In [883]:
insert_relations_neo4j(final_entities, final_relations)

# Add Relations to Spreadsheet for Review

In [None]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Use the credentials from the service account key JSON file you downloaded
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name('energy-moonshot-ai-97aa9045e45f.json', scope)
client = gspread.authorize(creds)

# Open the Google Sheet by its title or URL
spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1yZ-XQQs52kaI5k9MjvV_CdbgWQi-GazjHHGqQUF8gko/edit')


# Enter relations in the first sheet
sheet = spreadsheet.get_worksheet(0)

# Start row index from 5
start_row_index = 5
index = 1

# Check if there's valid data to insert
if final_relations:
    # Create a list of lists where each inner list represents the values of a row
    batch_relations = []
    for index, row_data in enumerate(final_relations):
        row = [index, row_data['Subject'], row_data['Relation'], row_data.get('Object', ''), 
               row_data.get('Description', ''), row_data.get('Relevance', '')]
        
        batch_relations.append(row)
        index = index + 1

    # Insert the data into the Google Sheet starting from row 5
    sheet.insert_rows(batch_relations, start_row_index)

    print(f"{len(final_relations)} entries added to Google Sheet.")
else:
    print("No data to insert.")
    
    
# Enter entities in the second sheet
sheet = spreadsheet.get_worksheet(1)


# Start row index from 5
start_row_index = 5
index = 1

if final_entities:
    batch_entities = []
    for index, row_data in enumerate(final_entities):
        row = [index, row_data['entity'], row_data['category'], row_data.get('acronym', ''), row_data.get('summary', '')]
        batch_entities.append(row)
        
        index = index + 1
    sheet.insert_rows(batch_entities, start_row_index)