In [1]:
import networkx as nx
import warnings
from itertools import islice
# import tqdm
# import matplotlib.pyplot as plt
# import pygraphviz as pgv
# from networkx.drawing.nx_agraph import graphviz_layout
from elasticsearch import Elasticsearch
from elasticsearch_dsl import connections

# Silence every Python warning for the rest of the session: no category,
# module or message filter is given, so nothing is exempt.
warnings.filterwarnings('ignore')


---

### `generate_networkx_graph()`

`DiGraph`: holds directed edges. Self loops are allowed but multiple (parallel) edges are not.  
`MultiDiGraph`: a directed graph class that can store multiedges.
Multiedges are multiple edges between two nodes. Each edge can hold optional data or attributes.
A MultiDiGraph holds directed edges. Self loops are allowed.

Methods -

In [2]:
def get_all_via_scan(index=None, size=10000, query_data=None, scroll='1m'):
    """Lazily yield every hit produced by an Elasticsearch scan.

    Parameters
    ----------
    index : str, optional
        Index to scan. Any falsy value selects
        "smartapi_metakg_docs_consolidated".
    size : int
        Passed through to ``elasticsearch.helpers.scan`` as ``size``.
    query_data : dict, optional
        Passed through to ``scan`` as ``query``.
    scroll : str
        Passed through to ``scan`` as ``scroll``.

    Yields
    ------
    Each item returned by the scan helper, unchanged.
    """
    from elasticsearch.helpers import scan
    from elasticsearch_dsl import connections

    # Reuse the connection registered with elasticsearch_dsl rather than
    # building a new client here.
    client = connections.get_connection()
    target_index = index or "smartapi_metakg_docs_consolidated"

    # Being a generator, nothing above runs until the first item is requested.
    yield from scan(client, query=query_data, index=target_index, size=size, scroll=scroll)


def generate_networkx_graph(index, graph_type="simple_digraph", edges=False):
    """Build a directed networkx graph from the documents of an index.

    Every hit returned by ``get_all_via_scan(index)`` contributes one edge
    from ``hit['_source']['subject']`` to ``hit['_source']['object']``.

    Parameters
    ----------
    index : str
        Index name handed to ``get_all_via_scan``.
    graph_type : {"simple_digraph", "multi_digraph"}
        "simple_digraph" builds an ``nx.DiGraph`` (repeated subject/object
        pairs collapse into one edge); "multi_digraph" builds an
        ``nx.MultiDiGraph`` (each hit becomes its own parallel edge).
    edges : bool
        When exactly ``True``, each edge stores the hit's predicate under the
        ``predicate`` attribute; otherwise edges carry no attributes.

    Returns
    -------
    nx.DiGraph or nx.MultiDiGraph

    Raises
    ------
    ValueError
        If ``graph_type`` is not one of the supported names.
    KeyError
        If a hit lacks ``subject``, ``object`` or ``predicate``.
    """
    if graph_type == "simple_digraph":
        G = nx.DiGraph()
    elif graph_type == "multi_digraph":
        G = nx.MultiDiGraph()
    else:
        # Fail before scanning the index; previously an unknown name left G
        # unbound and surfaced as an UnboundLocalError on the first edge.
        raise ValueError(
            f"Unsupported graph_type {graph_type!r}; "
            "expected 'simple_digraph' or 'multi_digraph'"
        )

    for hit in get_all_via_scan(index):
        record = hit['_source']
        _subject = record['subject']
        _object = record['object']
        _predicate = record['predicate']

        if edges is True:
            G.add_edge(_subject, _object, predicate=_predicate)
        else:
            G.add_edge(_subject, _object)

    return G

def graph_inspection(G, start_index=55, end_index=60):
    """Print a short summary of a graph plus a sample of its nodes and edges.

    Parameters
    ----------
    G : graph
        Object exposing ``number_of_nodes()``, ``number_of_edges()``,
        ``nodes()`` and ``edges(data=True)`` (the latter yielding
        ``(source, target, attribute_dict)`` triples).
    start_index : int
        Position of the first node/edge to print (inclusive). Defaults to 55,
        the value that used to be hard-coded.
    end_index : int
        Position one past the last node/edge to print. Defaults to 60.

    Returns
    -------
    None. Everything is written to standard output.
    """
    print(f"Graph type: {type(G)}")
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}\n")

    print("Nodes:")
    for node in islice(G.nodes(), start_index, end_index):
        print(node)

    print("\nEdges:")
    # Slice the edge iterable directly: copying it into a list first would
    # duplicate every edge just to display a handful of them.
    for source, target, edge_data in islice(G.edges(data=True), start_index, end_index):
        predicate = edge_data.get("predicate", "No predicate")
        print(f"Edge: {source} -> {target}, Predicate={predicate}")

Initialize Index -

In [3]:
# Stand-alone client created with default settings. None of the helper
# functions visible in this notebook reference `es`; get_all_via_scan() takes
# its client from elasticsearch_dsl's connection registry instead.
es = Elasticsearch()
# Same name as the fallback index hard-coded inside get_all_via_scan().
index = "smartapi_metakg_docs_consolidated"
# Presumably registers the default elasticsearch_dsl connection that
# get_all_via_scan() later retrieves with connections.get_connection().
# As the last expression of the cell, its return value is echoed as output.
connections.create_connection(hosts=['localhost'])


<Elasticsearch([{'host': 'localhost'}])>

Basic `digraph`, edges without the `predicate` attribute

In [75]:
# Plain directed graph: the defaults are spelled out to make clear that edges
# are still added, only without a `predicate` attribute.
G = generate_networkx_graph(index, graph_type="simple_digraph", edges=False)
graph_inspection(G)

Graph type: <class 'networkx.classes.digraph.DiGraph'>
Number of nodes: 113
Number of edges: 6317

Nodes:
PathologicalProcess
ActivityAndBehavior
InformationContentEntity
DrugExposure
ChemicalExposure

Edges:
Edge: ChemicalEntity -> Procedure, Predicate=No predicate
Edge: ChemicalEntity -> PhysiologicalProcess, Predicate=No predicate
Edge: ChemicalEntity -> InformationContentEntity, Predicate=No predicate
Edge: ChemicalEntity -> Phenomenon, Predicate=No predicate
Edge: ChemicalEntity -> PopulationOfIndividualOrganisms, Predicate=No predicate


Basic `digraph`, edges carrying the `predicate` attribute

In [76]:
# Same topology as the plain digraph, but each edge stores one predicate.
# A DiGraph keeps a single edge per (subject, object) pair, so when several
# hits share a pair only the predicate written last is retained.
G_edges_ = generate_networkx_graph(index, graph_type="simple_digraph", edges=True)
graph_inspection(G_edges_)

Graph type: <class 'networkx.classes.digraph.DiGraph'>
Number of nodes: 113
Number of edges: 6317

Nodes:
PathologicalProcess
ActivityAndBehavior
InformationContentEntity
DrugExposure
ChemicalExposure

Edges:
Edge: ChemicalEntity -> Procedure, Predicate=same_as
Edge: ChemicalEntity -> PhysiologicalProcess, Predicate=produces
Edge: ChemicalEntity -> InformationContentEntity, Predicate=close_match
Edge: ChemicalEntity -> Phenomenon, Predicate=produces
Edge: ChemicalEntity -> PopulationOfIndividualOrganisms, Predicate=associated_with


`multi-digraph`, one parallel edge per predicate

In [77]:
# Multi-digraph: every hit becomes its own parallel edge, so all predicates
# between a pair of nodes are preserved.
G_edges = generate_networkx_graph(index, graph_type="multi_digraph", edges=True)
graph_inspection(G_edges)

Graph type: <class 'networkx.classes.multidigraph.MultiDiGraph'>
Number of nodes: 113
Number of edges: 167827

Nodes:
PathologicalProcess
ActivityAndBehavior
InformationContentEntity
DrugExposure
ChemicalExposure

Edges:
Edge: ChemicalEntity -> ThingWithTaxon, Predicate=actively_involved_in
Edge: ChemicalEntity -> ThingWithTaxon, Predicate=actively_involves
Edge: ChemicalEntity -> ThingWithTaxon, Predicate=acts_upstream_of_negative_effect
Edge: ChemicalEntity -> ThingWithTaxon, Predicate=acts_upstream_of_or_within_negative_effect
Edge: ChemicalEntity -> ThingWithTaxon, Predicate=acts_upstream_of_or_within_positive_effect


---

### `all_simple_paths(G, source, target, cutoff=None)`  
Generate all simple paths in the graph G from source to target. A simple path is a path with no repeated nodes.  
  
    
#### `has_path(G, source, target)`
Returns True if G has a path from source to target.

In [73]:
def return_all_simple_paths(G, _source, _target, _cutoff=2):
    """Return all simple paths from ``_source`` to ``_target`` as a list.

    Parameters
    ----------
    G : networkx graph
        Graph to search.
    _source, _target : node
        Start and end nodes of the paths.
    _cutoff : int, optional
        Passed to ``nx.all_simple_paths`` as ``cutoff``. Defaults to 2.

    Returns
    -------
    list
        Every path produced by ``nx.all_simple_paths``, or ``[]`` when
        ``nx.has_path`` reports that the target is unreachable.
    """
    # Existence check first (recommended for large graphs): skip the path
    # enumeration entirely when the target cannot be reached at all.
    if not nx.has_path(G, _source, _target):
        return []

    path_iter = nx.all_simple_paths(G, source=_source, target=_target, cutoff=_cutoff)
    return list(path_iter)

def display_pathway_results(path_list, start_index, end_index, edges=False,
                            graph=None, source=None, target=None, cutoff=None):
    """Print a header and a slice of the given paths, optionally with predicates.

    Parameters
    ----------
    path_list : list
        Paths to display; each path is a list of string node names. Must
        support ``len()`` because the total count is reported.
    start_index, end_index : int
        Bounds (start inclusive, end exclusive) of the paths to print.
    edges : bool
        When truthy, also print the predicates found on every hop of each
        displayed path, followed by a separator line.
    graph : graph, optional
        Graph consulted for edge data; must provide ``get_edge_data(u, v)``
        and ``is_multigraph()``. When omitted, the notebook-level ``G_edges``
        is used, as before.
    source, target, cutoff : optional
        Values reported in the header. When omitted, the notebook-level
        ``_source``, ``_target`` and ``_cutoff`` are used, as before. Pass
        them explicitly so the header reflects the arguments that actually
        produced ``path_list``.

    Returns
    -------
    None. Everything is written to standard output.
    """
    # Backward-compatible fallbacks to the notebook globals this function
    # used to read unconditionally.
    if source is None:
        source = _source
    if target is None:
        target = _target
    if cutoff is None:
        cutoff = _cutoff
    if edges and graph is None:
        graph = G_edges

    print(f"[INFO] Pathways for source:{source} & target:{target} with cutoff:{cutoff}")
    print(f"[INFO] Total pathways extracted - {len(path_list)}\n")

    for path in islice(path_list, start_index, end_index):
        print("Path:", " -> ".join(path), "\n")

        if edges:
            # Walk consecutive node pairs (hops) of the path
            for source_node, target_node in zip(path, path[1:]):
                edge_data = graph.get_edge_data(source_node, target_node)
                if edge_data is None:
                    # Possible when the paths come from a different graph
                    # than the one used for the edge lookup.
                    print(f"No edge between {source_node} and {target_node}")
                    continue

                # A multigraph returns {edge_key: attributes}; a simple graph
                # returns the attribute dict of the single edge directly.
                if graph.is_multigraph():
                    attr_dicts = list(edge_data.values())
                else:
                    attr_dicts = [edge_data]

                predicate_values = [
                    attrs.get("predicate", "No predicate") for attrs in attr_dicts
                ]
                print(f'* {source_node} -> {target_node} * \nEdges: {", ".join(predicate_values)}')

            print()
            print("-"* 80)

Set reusable testing variables

In [58]:
 # End index (exclusive)
# Endpoints and depth limit shared by the path-finding cells below
_source = 'ChemicalEntity'
_target = 'PlanetaryEntity'
_cutoff = 3

# Slice bounds for the items to display
start_index = 0   # first item shown (inclusive)
end_index = 10    # one past the last item shown; alternative: int(G_edges.number_of_edges()/1000)

In [59]:
# Simple digraph, explicit cutoff: the cutoff printed in the header matches
# the one used to build the paths.
all_simple_paths_G = return_all_simple_paths(G, _source, _target, _cutoff=_cutoff)
display_pathway_results(all_simple_paths_G, start_index=0, end_index=5)

[INFO] Pathways for source:ChemicalEntity & target:PlanetaryEntity with cutoff:3
[INFO] Total pathways extracted - 4504

Path: ChemicalEntity -> PlanetaryEntity
Path: ChemicalEntity -> ThingWithTaxon -> Protein -> PlanetaryEntity
Path: ChemicalEntity -> ThingWithTaxon -> Cell -> PlanetaryEntity
Path: ChemicalEntity -> ThingWithTaxon -> AnatomicalEntity -> PlanetaryEntity
Path: ChemicalEntity -> ThingWithTaxon -> NamedThing -> PlanetaryEntity


In [60]:
# NOTE: no cutoff is passed here, so return_all_simple_paths() applies its
# default of 2, while the header printed by display_pathway_results() reports
# the notebook-level _cutoff.
all_simple_paths_G_edges = return_all_simple_paths(G_edges, _source=_source, _target=_target)
display_pathway_results(all_simple_paths_G_edges, start_index=0, end_index=5)

[INFO] Pathways for source:ChemicalEntity & target:PlanetaryEntity with cutoff:3
[INFO] Total pathways extracted - 132659

Path: ChemicalEntity -> PlanetaryEntity
Path: ChemicalEntity -> PlanetaryEntity
Path: ChemicalEntity -> PlanetaryEntity
Path: ChemicalEntity -> PlanetaryEntity
Path: ChemicalEntity -> PlanetaryEntity


In [74]:
# NOTE: the default cutoff of 2 applies here too (none is passed), although
# the printed header reports the notebook-level _cutoff. The paths come from
# the simple digraph G_edges_, but display_pathway_results() reads the
# predicates of each hop from the multi-digraph G_edges.
all_simple_paths_G_edges_ = return_all_simple_paths(G_edges_, _source=_source, _target=_target)
display_pathway_results(all_simple_paths_G_edges_, start_index=0, end_index=5, edges=True)

[INFO] Pathways for source:ChemicalEntity & target:PlanetaryEntity with cutoff:3
[INFO] Total pathways extracted - 61

Path: ChemicalEntity -> PlanetaryEntity 

* ChemicalEntity -> PlanetaryEntity * 
Edges: negatively_regulates, participates_in, positively_regulates, actively_involved_in, actively_involves, acts_upstream_of_negative_effect, acts_upstream_of_or_within_negative_effect, acts_upstream_of_or_within, acts_upstream_of_or_within_positive_effect, acts_upstream_of, acts_upstream_of_positive_effect, affects, capable_of, enables, regulates, related_to, process_negatively_regulates_process, has_active_ingredient, primarily_composed_of, temporally_related_to, associated_with, process_regulates_process, similar_to, catalyzes, overlaps, affects_response_to, process_positively_regulates_process, composed_primarily_of, has_upstream_actor, exacerbates, increases_response_to, disrupts, is_output_of, affected_by, treats, is_substrate_of, part_of, has_capability, preceded_by, has_participan

In [39]:
# Raw edge data between the two test nodes in the multi-digraph: a mapping of
# edge key -> attribute dict, one entry per parallel edge.
edge_data = G_edges.get_edge_data(_source, _target)
print(edge_data)

# Collect the predicate of every parallel edge
predicate_values = [attrs['predicate'] for attrs in edge_data.values()]

# Blank line between the raw mapping and the flattened predicate list
print()
print(predicate_values)

{0: {'predicate': 'negatively_regulates'}, 1: {'predicate': 'participates_in'}, 2: {'predicate': 'positively_regulates'}, 3: {'predicate': 'actively_involved_in'}, 4: {'predicate': 'actively_involves'}, 5: {'predicate': 'acts_upstream_of_negative_effect'}, 6: {'predicate': 'acts_upstream_of_or_within_negative_effect'}, 7: {'predicate': 'acts_upstream_of_or_within'}, 8: {'predicate': 'acts_upstream_of_or_within_positive_effect'}, 9: {'predicate': 'acts_upstream_of'}, 10: {'predicate': 'acts_upstream_of_positive_effect'}, 11: {'predicate': 'affects'}, 12: {'predicate': 'capable_of'}, 13: {'predicate': 'enables'}, 14: {'predicate': 'regulates'}, 15: {'predicate': 'related_to'}, 16: {'predicate': 'process_negatively_regulates_process'}, 17: {'predicate': 'has_active_ingredient'}, 18: {'predicate': 'primarily_composed_of'}, 19: {'predicate': 'temporally_related_to'}, 20: {'predicate': 'associated_with'}, 21: {'predicate': 'process_regulates_process'}, 22: {'predicate': 'similar_to'}, 23: {'

---