<h2> Energy Knowledge Graph Pipeline</h2>

In [1]:
##imports
import pandas as pd

In [None]:
##set globals and filepaths

In [None]:
# Install spaCy with the %pip magic so the package lands in the environment of the
# kernel running this notebook. A bare `pip ...` line only works through automagic,
# which IPython applies to single-line cells only.
%pip install spacy

In [None]:
# Download every pipeline the extraction cell loads with spacy.load():
# en_core_web_sm (bound to `nlp`) and en_core_web_md (bound to `non_nc`).
# Without the second download, spacy.load('en_core_web_md') fails on a fresh environment.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [None]:
# NOTE(review): this upgrade runs after the model download in the previous cell.
# spaCy pipelines are built against a specific spaCy version, so if loading fails
# afterwards, run `python -m spacy validate` or re-download the models.
%pip install --upgrade spacy pydantic


<h4>extract entities and relations for all docs </h4>

In [None]:
import spacy
import os
import json
import re
from collections import defaultdict

# Medium English pipeline.
# NOTE(review): `non_nc` is not referenced anywhere else in the visible notebook;
# confirm it is still needed before paying for the extra model load.
non_nc = spacy.load("en_core_web_md")
# Small English pipeline: this is the `nlp` object the extraction functions call.
nlp = spacy.load("en_core_web_sm")


# Label groups kept as module-level constants.
# NOTE(review): none of these four lists is read by the code visible in this notebook.

# Dependency labels grouped as subjects.
SUBJECTS = [
    "nsubj",
    "nsubjpass",
    "csubj",
    "csubjpass",
    "agent",
    "expl",
]
# Dependency labels grouped as verbs.
VERBS = ["ROOT", "advcl"]
# Dependency labels grouped as objects.
OBJECTS = [
    "dobj",
    "dative",
    "attr",
    "oprd",
    "pobj",
]
# Named-entity labels of interest.
ENTITY_LABELS = [
    "PERSON",
    "NORP",
    "GPE",
    "ORG",
    "FAC",
    "LOC",
    "PRODUCT",
    "EVENT",
    "WORK_OF_ART",
]

def remove_unicode(text):
    """Return ``text`` with every non-ASCII character dropped.

    Characters that cannot be encoded as ASCII are silently discarded (not
    replaced), so the result can be shorter than the input.
    """
    ascii_bytes = text.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")

def extract_relationship(text, entity1, entity2):
    """Find the first verb located between two entities in ``text``.

    Both entities are located case-insensitively via their first occurrence in
    ``text``. The span between them (whichever comes first) is run through the
    module-level ``nlp`` pipeline and the text of the first token whose
    part-of-speech is ``VERB`` is returned.

    Args:
        text: Sentence (or any string) to search.
        entity1: First entity string.
        entity2: Second entity string.

    Returns:
        The verb's text, or the *string* ``"None"`` when either entity is
        missing, the entities are adjacent/overlapping, or no verb is found.
    """
    lowered = text.lower()
    first_start = lowered.find(entity1.lower())
    second_start = lowered.find(entity2.lower())

    # Without both entities there is nothing to look between.
    if first_start == -1 or second_start == -1:
        return "None"

    first_end = first_start + len(entity1)
    second_end = second_start + len(entity2)

    if second_start > first_end:
        # entity1 ... entity2
        gap = text[first_end:second_start].strip()
    elif first_start > second_start:
        # entity2 ... entity1
        gap = text[second_end:first_start].strip()
    else:
        # Adjacent or identical positions: no words in between.
        gap = ""

    first_verb = next((tok.text for tok in nlp(gap) if tok.pos_ == "VERB"), None)
    return first_verb or "None"


def extract_entities_relations(text):
    """Extract organization/location entities from ``text`` and the verbs linking them.

    The text is parsed once with the module-level ``nlp`` pipeline. Named
    entities labelled ``ORG`` or ``LOC`` are kept (case-insensitively unique,
    first spelling wins); entities containing ``'.'`` or a newline are skipped.
    For every sentence mentioning two kept entities, ``extract_relationship``
    is asked for a connecting verb; when one is found a relation is recorded
    under the first entity of the ordered pair.

    Args:
        text: Raw document text.

    Returns:
        A tuple ``(entities_with_relations, entity_relations)`` where

        * ``entities_with_relations`` is a list of
          ``{"entity": str, "category": "Organization" | "Location"}`` dicts,
          restricted to entities that have at least one relation, and
        * ``entity_relations`` maps every kept entity's text to a list of
          ``{"Relation": verb, "Object": other_entity_lowercased,
          "Description": sentence_text}`` dicts (possibly empty).
    """
    # Parse once; the same Doc provides both the entities and the sentences.
    doc = nlp(text)

    # NOTE(review): spaCy labels countries/cities as GPE, which is not mapped
    # here, so they are dropped; confirm that is intended.
    category_by_label = {'ORG': 'Organization', 'LOC': 'Location'}

    entities = []
    seen_lower = set()  # lower-cased entity text already collected
    for ent in doc.ents:
        category = category_by_label.get(ent.label_)
        # Skip other labels, abbreviations ('.') and entities spanning lines.
        if category is None or '.' in ent.text or '\n' in ent.text:
            continue
        key = ent.text.lower()
        if key not in seen_lower:
            seen_lower.add(key)
            entities.append({"entity": ent.text, "category": category})

    entity_relations = {entity['entity']: [] for entity in entities}

    # Ordered pairs (a, b) and (b, a) are both generated, so each direction of
    # a relation is recorded under its own subject entity.
    names = [entity['entity'] for entity in entities]
    entity_pairs = [
        (first, first.lower(), second.lower())
        for first in names
        for second in names
        if first != second
    ]

    for sent in doc.sents:
        # Per-sentence work hoisted out of the pair loop.
        sent_lower = sent.text.lower()
        sent_ascii = remove_unicode(sent_lower)

        for subject, subject_lower, object_lower in entity_pairs:
            if subject_lower not in sent_lower or object_lower not in sent_lower:
                continue

            relatn = extract_relationship(sent_ascii, subject_lower, object_lower)
            if relatn == 'None':  # string sentinel returned when no verb is found
                continue

            relation = {
                "Relation": relatn,
                "Object": object_lower,
                "Description": sent.text,
            }
            # Store the relation under the subject only. Storing it under the
            # object as well would create a relation whose "Object" is the
            # entity itself; the reverse direction is covered by the (b, a) pair.
            if relation not in entity_relations[subject]:
                entity_relations[subject].append(relation)

    # Remove entities with no relations
    entities_with_relations = [entity for entity in entities if entity_relations[entity['entity']]]

    return entities_with_relations, entity_relations



In [126]:
def clean_json(json_data):
    """Deduplicate entities and drop empty relation lists, in place.

    Entities sharing the same ``"entity"`` value collapse to one entry: the
    position of the first occurrence is kept, the content of the last one wins.
    Relation keys whose value is empty are removed.

    Args:
        json_data: Dict with a ``"knowledge graph"`` entry holding
            ``"entities"`` (list of dicts) and ``"relations"`` (dict).

    Returns:
        The same ``json_data`` object, mutated.
    """
    # One entry per entity name; later duplicates overwrite earlier ones.
    by_name = {}
    for record in json_data["knowledge graph"]["entities"]:
        by_name[record["entity"]] = record
    json_data["knowledge graph"]["entities"] = list(by_name.values())

    # Keep only entities that actually have relations.
    non_empty = {}
    for name, rel_list in json_data["knowledge graph"]["relations"].items():
        if rel_list:
            non_empty[name] = rel_list
    json_data["knowledge graph"]["relations"] = non_empty

    return json_data


In [None]:
##export results to '02_Output/00_By-Document/' in 00_Entities and 01_Relations

# Set `folder_path` (below) to the folder containing the cleaned .txt files


import concurrent.futures

def process_file(file_path, output_folder_path='../02_Output/00_By-Document'):
    """Extract a knowledge graph from one text file and save it as JSON.

    Args:
        file_path: Path of the text document to process.
        output_folder_path: Folder that receives the JSON file. Created if it
            does not exist. Defaults to the pipeline's by-document folder.

    Returns:
        Path of the JSON file written: the input's base name with its final
        extension replaced by ``.json``, inside ``output_folder_path``.
    """
    # Read first and close the handle before the (slow) NLP step runs.
    with open(file_path, 'r') as file:
        text_content = file.read()

    # Extracting entities and relations
    entities, relations = extract_entities_relations(text_content)

    # Constructing JSON data
    json_data = {
        "metadata": {
            # Add your metadata extraction logic here
        },
        "knowledge graph": {
            "entities": entities,
            "relations": relations
        }
    }
    cleaned_json = clean_json(json_data)

    # splitext swaps only the final extension; str.replace('.txt', '.json')
    # would also rewrite any '.txt' occurring earlier in the file name.
    stem, _ = os.path.splitext(os.path.basename(file_path))
    output_file_path = os.path.join(output_folder_path, stem + '.json')

    # exist_ok keeps this safe when several worker threads get here together.
    if output_folder_path:
        os.makedirs(output_folder_path, exist_ok=True)

    # Save as JSON
    with open(output_file_path, 'w') as output_file:
        json.dump(cleaned_json, output_file, indent=4)
    return output_file_path

# Collect every cleaned .txt document from the input folder.
folder_path = '../01_Input/01_Cleaned-Text'
txt_names = filter(lambda name: name.endswith(".txt"), os.listdir(folder_path))
file_paths = [os.path.join(folder_path, name) for name in txt_names]

# Fan the documents out over a thread pool; map() yields results in input order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(process_file, file_paths)

# Leaving the `with` block waits for all workers, so every result is ready here.
# An exception raised inside a worker surfaces when its result is consumed.
for saved_path in results:
    print(f"Processed file saved at: {saved_path}")




<h4> merge entities and relations across all docs </h4>

In [142]:
#create dictionary of all entities and dictionary of relations, with values for the documents they appear in
#then iterate through each and create merged versions

##export to '02_Output/01_Merged/' as energy-entities.json and energy-entities-datecode.json (and energy-relations)

# Folder path containing JSON files
# folder_path = '../02_Output/00_By-Document'

# # Initialize an empty dictionary to hold merged data
# merged_data = {"metadata": {}, "knowledge graph": {"entities": [], "relations": []}}

# # Loop through JSON files in the folder
# for file_name in os.listdir(folder_path):
#     if file_name.endswith(".json"):
#         with open(os.path.join(folder_path, file_name), 'r') as file:
#             data = json.load(file)
            
#             # Merge metadata
#             merged_data['metadata'].update(data['metadata'])
            
#             # Merge entities and relations
#             merged_data['knowledge graph']['entities'].extend(data['knowledge graph']['entities'])
#             merged_data['knowledge graph']['relations'].extend(data['knowledge graph']['relations'])

# # Save merged data into a single JSON file
# output_file = '../02_Output/01_Merged/merged-knowledge-graph.json'
# with open(output_file, 'w') as outfile:
#     json.dump(merged_data, outfile, indent=4)

# print(f"Merged data saved to {output_file}")

def merge_json_files(folder_path):
    """Merge the per-document knowledge-graph JSON files found in ``folder_path``.

    Files are read in sorted name order so the result does not depend on the
    filesystem's listing order. Entities are de-duplicated by their
    ``"entity"`` value using the same rule as ``clean_json`` (position of the
    first occurrence, content of the last). Relation lists for the same entity
    key are concatenated across files.

    Args:
        folder_path: Folder containing ``*.json`` files, each with a
            ``"knowledge graph"`` entry holding ``"entities"`` and ``"relations"``.

    Returns:
        Dict ``{"metadata": {}, "knowledge graph": {"entities": [...],
        "relations": {...}}}`` with the merged content.
    """
    entities_by_name = {}
    merged_relations = {}

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue

        with open(os.path.join(folder_path, file_name), 'r') as file:
            data = json.load(file)
        graph = data["knowledge graph"]

        # Merge entities: one entry per entity name across all documents.
        # Duplicates here would make downstream per-entity processing append
        # the same relations once per duplicate.
        for entity_info in graph["entities"]:
            entities_by_name[entity_info["entity"]] = entity_info

        # Merge relations: always extend a list owned by the merged dict, so
        # no list object is shared with the loaded document.
        for entity, rel_list in graph["relations"].items():
            merged_relations.setdefault(entity, []).extend(rel_list)

    # Construct merged JSON data
    return {
        "metadata": {},  # Add metadata as needed
        "knowledge graph": {
            "entities": list(entities_by_name.values()),
            "relations": merged_relations
        }
    }

folder_path = '../02_Output/00_By-Document'
merged_json = merge_json_files(folder_path)

# Save the merged JSON data to a file. The target folder is created first so
# the cell also works on a fresh checkout where '01_Merged' does not exist yet.
output_file_path = '../02_Output/01_Merged/merged-knowledge-graph.json'
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
with open(output_file_path, 'w') as output_file:
    json.dump(merged_json, output_file, indent=4)



<h4> extract jsons for each entity </h4>

In [None]:
##iterate through keys in entities, and identify all relations for that entity
##generate json including all info about that entity, its relations, all related entities, and document ids
 

# Folder path containing the merged JSON file(s)
folder_path = '../02_Output/01_Merged'

# Dictionary to hold entity-wise relations
entity_relations = defaultdict(list)

# Loop through JSON files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".json"):
        with open(os.path.join(folder_path, file_name), 'r') as file:
            data = json.load(file)

        entities = data['knowledge graph']['entities']
        relations = data['knowledge graph']['relations']

        # Guard against an entity being listed more than once in a file:
        # without it, that entity's relations would be appended once per listing.
        seen_in_file = set()
        for entity_info in entities:
            entity = entity_info['entity']
            if entity in seen_in_file:
                continue
            seen_in_file.add(entity)

            if entity in relations:
                entity_relations[entity].extend(relations[entity])

# Export to '02_Output/02_By-Entity/'
output_folder_path = '../02_Output/02_By-Entity'
os.makedirs(output_folder_path, exist_ok=True)

# Create separate JSON files for each entity's relations
for entity, relations in entity_relations.items():
    # Entity text comes straight from the documents and may contain characters
    # that are path separators or invalid in file names (e.g. '/'); replace
    # them so the file always lands inside output_folder_path.
    # NOTE(review): two entities that differ only in replaced characters would
    # map to the same file; confirm whether that can occur in this corpus.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', entity).strip() or '_'
    entity_file = f'{safe_name}.json'
    entity_data = {"metadata": {}, "knowledge graph": {"entities": [], "relations": {entity: relations}}}

    with open(os.path.join(output_folder_path, entity_file), 'w') as outfile:
        json.dump(entity_data, outfile, indent=4)

print("Entity files created.")




<h4> create json exports for countries and documents </h4>


In [1]:
##open country and document metadata csv

In [2]:
##use full merged knowledge graph to extract entities and relations