<h2> Energy Knowledge Graph Pipeline</h2>

In [1]:
##imports
import pandas as pd

In [None]:
##set globals and filepaths

In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does
# not depend on IPython automagic being enabled.
%pip install spacy

In [None]:
# Download every spaCy model that the extraction cell loads with spacy.load():
# 'en_core_web_sm' and 'en_core_web_md'. Without the second download,
# spacy.load('en_core_web_md') raises OSError on a fresh environment.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [None]:
# Explicit %pip magic so the upgrade targets the running kernel's environment.
# NOTE(review): upgrading spaCy after the models were downloaded may require
# re-downloading them for the new version — confirm after running.
%pip install --upgrade spacy pydantic


<h4>extract entities and relations for all docs </h4>

In [None]:
import spacy
import os
import json
import re
from collections import defaultdict

# Medium English pipeline; requires the 'en_core_web_md' package to be installed.
# NOTE(review): `non_nc` is not referenced by any cell visible in this notebook —
# confirm it is still needed.
non_nc = spacy.load("en_core_web_md")

# Small English pipeline; the extraction function calls `nlp(text)` and reads
# the resulting entities, sentences and dependency labels.
nlp = spacy.load("en_core_web_sm")


# Dependency labels whose tokens count as the subject of a verb.
SUBJECTS = [
    "nsubj",
    "nsubjpass",
    "csubj",
    "csubjpass",
    "agent",
    "expl",
]

# Dependency labels a token must carry to be considered a relation verb.
VERBS = [
    "ROOT",
    "advcl",
]

# Dependency labels whose tokens count as the object of a verb.
OBJECTS = [
    "dobj",
    "dative",
    "attr",
    "oprd",
    "pobj",
]

# Named-entity labels of interest.
# NOTE(review): not referenced by the visible cells; the extraction function
# currently keeps only ORG and LOC.
ENTITY_LABELS = [
    "PERSON",
    "NORP",
    "GPE",
    "ORG",
    "FAC",
    "LOC",
    "PRODUCT",
    "EVENT",
    "WORK_OF_ART",
]


# Extraction relies on the spaCy pipeline `nlp` and the dependency-label lists defined above.

# Extract entities and relations from the text of a single document.
def extract_entities_relations(text):
    """Extract named entities and verb-centred relations from raw text.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Entities:
        Only spans labelled ``ORG`` (category "Organization") or ``LOC``
        (category "Location") are kept. Spans whose text contains a period or
        a newline are dropped, and each distinct text is reported once, in
        order of first appearance.

    Relations:
        For every token whose dependency label is in ``VERBS`` and whose part
        of speech is ``VERB``, the first child labelled as a subject (see
        ``SUBJECTS``; the pronouns it/they/he/she are ignored) and the first
        child labelled as an object (see ``OBJECTS``) form one relation.
        Verbs missing either side produce nothing.

    Args:
        text: Document text to analyse.

    Returns:
        A tuple ``(entities, relations)``. ``entities`` is a list of dicts
        with keys ``"entity"`` and ``"category"``; ``relations`` is a list of
        dicts with keys ``"Relation"`` (verb lemma), ``"Subject"``,
        ``"Object"`` (token texts) and ``"Description"`` (the sentence text).
    """
    parsed = nlp(text)

    # spaCy entity label -> category name written to the output.
    label_to_category = {'ORG': 'Organization', 'LOC': 'Location'}
    ignored_subjects = ['it', 'they', 'he', 'she']

    # --- entities -----------------------------------------------------------
    seen_names = set()
    entities = []
    for span in parsed.ents:
        category = label_to_category.get(span.label_, '')
        if not category:
            continue
        name = span.text
        # Skip abbreviations ('.'), multi-line spans and names already recorded.
        if '.' in name or '\n' in name or name in seen_names:
            continue
        seen_names.add(name)
        entities.append({"entity": name, "category": category})

    # --- relations ----------------------------------------------------------
    relations = []
    for sentence in parsed.sents:
        for verb in sentence:
            if verb.dep_ not in VERBS or verb.pos_ != 'VERB':
                continue

            subject_tokens = [
                child for child in verb.children
                if child.dep_ in SUBJECTS and child.text.lower() not in ignored_subjects
            ]
            object_tokens = [child for child in verb.children if child.dep_ in OBJECTS]
            if not subject_tokens or not object_tokens:
                continue

            # Only the first subject and first object of the verb are used.
            relations.append({
                "Relation": verb.lemma_,
                "Subject": subject_tokens[0].text,
                "Object": object_tokens[0].text,
                "Description": sentence.text,
            })

    return entities, relations
    

 

In [7]:
##export results to '02_Output/00_By-Document/' in 00_Entities and 01_Relations

# Folder holding the cleaned .txt documents to process.
folder_path = '../01_Input/01_Cleaned-Text'

# Destination for the per-document JSON files. Assigned before the loop so the
# name exists for later cells even when no .txt file is found, and created up
# front so the first write cannot fail on a missing directory.
output_folder_path = '../02_Output/00_By-Document'
os.makedirs(output_folder_path, exist_ok=True)

# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):
        continue

    # NOTE(review): assumes the cleaned text files are UTF-8 encoded — confirm.
    # An explicit encoding avoids depending on the platform default.
    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
        text_content = file.read()

    # Extracting entities and relations (the input file is already closed here).
    entities, relations = extract_entities_relations(text_content)

    # Constructing JSON data
    json_data = {
        "metadata": {
            # Add your metadata extraction logic here
        },
        "knowledge graph": {
            "entities": entities,
            "relations": relations
        }
    }

    # Swap only the trailing extension; str.replace('.txt', '.json') would also
    # rewrite any earlier '.txt' that happens to occur inside the file name.
    output_file_name = os.path.splitext(file_name)[0] + '.json'

    # Save as JSON
    with open(os.path.join(output_folder_path, output_file_name), 'w', encoding='utf-8') as output_file:
        json.dump(json_data, output_file, indent=4)


<h4> merge entities and relations across all docs </h4>

In [8]:
#create dictionary of all entities and dictionary of relations, with values for the documents they appear in
#then iterate through each and create merged versions

##export to '02_Output/01_Merged/' as energy-entities.json and energy-entities-datecode.json (and energy-relations)

# Folder containing the per-document JSON files. The path is spelled out here
# instead of being read from `output_folder_path`, because a later cell rebinds
# that name to the By-Entity folder; re-running this cell afterwards would
# otherwise merge the wrong files.
folder_path = '../02_Output/00_By-Document'

# Accumulator for the merged knowledge graph.
merged_data = {"metadata": {}, "knowledge graph": {"entities": [], "relations": []}}

# Serialized form of every entity record already merged, so an entity that
# appears in several documents is stored once (first occurrence wins).
seen_entity_records = set()

# sorted() makes the merge order (and which metadata value wins) reproducible.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".json"):
        continue

    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Merge metadata (later files overwrite earlier keys)
    merged_data['metadata'].update(data['metadata'])

    # Merge entities, skipping exact duplicates of records already collected
    for entity_info in data['knowledge graph']['entities']:
        record_key = json.dumps(entity_info, sort_keys=True)
        if record_key in seen_entity_records:
            continue
        seen_entity_records.add(record_key)
        merged_data['knowledge graph']['entities'].append(entity_info)

    # Relations are kept in full: each one carries its own source sentence.
    merged_data['knowledge graph']['relations'].extend(data['knowledge graph']['relations'])

# Save merged data into a single JSON file, creating the folder if needed
output_file = '../02_Output/01_Merged/merged-knowledge-graph.json'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as outfile:
    json.dump(merged_data, outfile, indent=4)

print(f"Merged data saved to {output_file}")


Merged data saved to ../02_Output/01_Merged/merged-knowledge-graph.json


<h4> extract jsons for each entity </h4>

In [None]:
##iterate through keys in entities, and identify all relations for that entity
##generate json including all info about that entity, its relations, all related entities, and document ids
 

# Folder containing the per-document JSON files. The path is spelled out here
# instead of being read from `output_folder_path`: the export cell that follows
# rebinds that name to the By-Entity folder, and re-running this cell afterwards
# would then read files that have no 'entities' key.
folder_path = '../02_Output/00_By-Document'

# Maps entity text -> list of relation dicts that mention it.
# Rebuilt from scratch on every run, so the cell is safe to re-execute.
entity_relations = defaultdict(list)

# Fields of a relation record that are searched for the entity text.
relation_fields = ('Subject', 'Object', 'Description')

# sorted() gives a reproducible order of relations within each entity's list.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".json"):
        continue

    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
        data = json.load(file)

    entities = data['knowledge graph']['entities']
    relations = data['knowledge graph']['relations']

    for entity_info in entities:
        entity = entity_info['entity']

        for relation in relations:
            # NOTE(review): this is plain substring matching, so a short entity
            # name also matches inside longer words — confirm that is intended.
            if any(entity in relation[field] for field in relation_fields):
                entity_relations[entity].append(relation)




In [None]:
##export to '02_Output/02_By-Entity/'
output_folder_path = '../02_Output/02_By-Entity'
# Create the destination up front so the first write cannot fail on a missing folder.
os.makedirs(output_folder_path, exist_ok=True)

# Create separate JSON files for each entity's relations
for entity, relations in entity_relations.items():
    # Entity text comes straight from the documents and may contain path
    # separators or characters that are invalid in file names; replace them so
    # every file lands inside the output folder.
    # NOTE(review): entities that differ only in replaced characters would share
    # one file (last write wins) — confirm this cannot occur in the corpus.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', entity)
    entity_file = f'{safe_name}.json'

    # The JSON payload keeps the original, unmodified entity text.
    entity_data = {"metadata": {}, "knowledge graph": {"entity": entity, "relations": relations}}

    with open(os.path.join(output_folder_path, entity_file), 'w', encoding='utf-8') as outfile:
        json.dump(entity_data, outfile, indent=4)

print("Entity files created.")


<h4> create json exports for countries and documents </h4>


In [1]:
##open country and document metadata csv

In [2]:
##use full merged knowledge graph to extract entities and relations