# Hands-on disease module mining and drug prioritization

## Packages

In [1]:
import networkx as nx
import graph_tool as gt
import graph_tool.util as gtu
import graph_tool.topology as gtt
import graph_tool.centrality as gtc
import subprocess
import pandas as pd

## NetworkX vs graph-tool

### NetworkX
- https://networkx.org/
- **Pros:** widely used, tons of functions
- **Con:** mostly Python all the way down and thus slow

### graph-tool
- https://graph-tool.skewed.de/
- **Pro:** C++ backend (partly with OpenMP parallelization) and thus fast
- **Cons:** less widely used, fewer functions

**In this tutorial, we will show you how to use both of them**

## Task 1: Run disease module mining algorithms
-  Run the ROBUST algorithm with default parameters on the HD seeds and the PPI network provided in `data/NeDRex_api/`.
-  Use the `subprocess.run()` functions to run the command line interface.
-  ROBUST requires a header in the PPI network file, please use the PPI network  `filtered_ppi_only_reviewed_proteins_solution_header.csv`
-  Save the resulting disease module to `results/hd_module_robust.csv`.
-  Do the same for DIAMOnD and set the `n` parameter such that the size of the resulting disease module is identical to the disease module computed by ROBUST.
-  For DIAMOnD please use `filtered_ppi_only_reviewed_proteins_solution.csv` as the PPI network file.
-  Save the resulting disease module to `results/hd_module_robust.csv`.

## Solutions to Task 1 

### Run ROBUST

In [3]:
result = subprocess.run(
    [
        'python3', 'robust.py', # Call to command line interface
        '../data/NeDRex_api/seed_genes_huntingtons_disease.csv', # Path to file with newline-separated seeds
        '../Session2_network_medicine_algos/results/hd_module_robust.csv', # Path to output file
        '--network', '../data/NeDRex_api/filtered_ppi_only_reviewed_proteins_solution_header.csv', # Path to file with PPI network edge list
        '--namespace', 'UNIPROT' # Tell the tool that node IDs are UNIPROT IDs
    ],
    cwd='../robust_bias_aware/'
)
print(f'stderr: {result.stderr}')
module_robust = pd.read_csv('results/hd_module_robust.csv')
print(f'Computed module with {len(module_robust[module_robust.terminal])} seeds and {len(module_robust[~module_robust.terminal])} non-seeds')

Setting the graph_diameter to the precomputed value of 8. Directly specify meta to overwrite this.
stderr: None
Computed module with 15 seeds and 59 non-seeds


### Run DIAMOnD (set-up to return module of equal size)

In [4]:
result = subprocess.run(
    [
        'python3', 'DIAMOnD.py', # Call to command line interface
        '../data/NeDRex_api/filtered_ppi_only_reviewed_proteins_solution.csv', # Path to file with PPI network edge list
        '../data/NeDRex_api/seed_genes_huntingtons_disease.csv', # Path to file with newline-separated seeds
        '59', # Number of non-seed nodes in disease module needs to be set explicitly
        '../Session2_network_medicine_algos/results/hd_module_diamond.tsv' # Path to output file
    ],
    cwd='../DIAMOnD/'
)
print(f'stderr: {result.stderr}')
module_diamond = pd.read_csv('results/hd_module_diamond.tsv', sep='\t')

  p-val = \sum_{n=kb}^{k} HypergemetricPDF(n,k,N,s)
  p = float(DIAMOnD_node_info[3])


DIAMOnD(): ignoring 2 of 17 seed genes that are not in the network

 results have been saved to '../Session2_network_medicine_algos/results/hd_module_diamond.tsv' 

stderr: None


## Task 2: Inspect disease modules with NetworkX and graph-tool
- Compute the following summary statistics for the disease modules computed by ROBUST and DIAMOnD:
     - Number of nodes.
     - Number of edges.
     - Density.
     - Number of connected components.
     - Average node degree.
     - Average node betweenness centrality.
- Provide both NetworkX and graph-tool implementations.
- Explain the observed differences between ROBUST and DIAMOnD disease modules.
- From now on please use `filtered_ppi_only_reviewed_proteins_solution.csv` as the PPI network file.
- When using `nx.read_edgelist` please indicate `nodetype=str.strip`.

## Solutions to Task 2

### Load PPI network with NetworkX

In [9]:
ppin_nx = nx.read_edgelist('../data/NeDRex_api/filtered_ppi_only_reviewed_proteins_solution.csv', delimiter=',', encoding="utf-8-sig", nodetype=str.strip)
print(f'Loaded PPI network with {nx.number_of_nodes(ppin_nx)} nodes and {nx.number_of_edges(ppin_nx)} edges')

Loaded PPI network with 12797 nodes and 95916 edges


In [10]:
# Nodes are indexed with strings corresponding to IDs in input file
list(ppin_nx.nodes)[0]

'P15927'

### Load PPI network with graph-tool

In [12]:
ppin_gt = gt.load_graph_from_csv('../data/NeDRex_api/filtered_ppi_only_reviewed_proteins_solution.csv')
print(f'Loaded PPI network with {ppin_gt.num_vertices()} nodes and {ppin_gt.num_edges()} edges')

Loaded PPI network with 12797 nodes and 95916 edges


In [13]:
# Nodes are indexed with integers from 0 to num_nodes - 1
# String node IDs from input file are stored in 'name' vertex property map
ppin_gt.vp['name'][ppin_gt.vertex(0)]

'P15927'

### Project DIAMOnD module on NetworkX and graph-tool graphs

#### Load seeds (not contained in output of DIAMOnD) and kick out seeds not in PPI network

In [15]:
with open('../data/NeDRex_api/seed_genes_huntingtons_disease.csv') as fp:
    seeds = fp.read().splitlines()
seeds = [seed for seed in seeds if ppin_nx.has_node(seed)]

#### Project DIAMOnD and ROBUST modules on NetworkX graph

In [16]:
# DIAMOnD (seeds not contained in module file)
nx.set_node_attributes(ppin_nx, False, name='in_diamond_module')
for node in module_diamond.DIAMOnD_node:
    ppin_nx.nodes[node]['in_diamond_module'] = True
for seed in seeds:
    ppin_nx.nodes[seed]['in_diamond_module'] = True
module_diamond_nx = ppin_nx.subgraph([n for n, d in ppin_nx.nodes(data=True) if d['in_diamond_module']]).copy()

# ROBUST (seeds contained in module file)
nx.set_node_attributes(ppin_nx, False, name='in_robust_module')
for node in module_robust.vertex:
    ppin_nx.nodes[node]['in_robust_module'] = True
module_robust_nx = ppin_nx.subgraph([n for n, d in ppin_nx.nodes(data=True) if d['in_robust_module']]).copy()

#### Project DIAMOnD and ROBUST modules on graph-tool graph

In [17]:
# Lookup table to get node IDs from protein names
node_name_to_id = {ppin_gt.vp['name'][node]: node for node in ppin_gt.vertices()}

# DIAMOnD (seeds not contained in module file)
in_diamond_module = ppin_gt.new_vp('boolean', val=False)
for node in module_diamond.DIAMOnD_node:
    in_diamond_module[node_name_to_id[node]] = True
for seed in seeds:
    in_diamond_module[node_name_to_id[seed]] = True
ppin_gt.vp['in_diamond_module'] = in_diamond_module
module_diamond_gt = gt.GraphView(ppin_gt, vfilt=ppin_gt.vp['in_diamond_module'])

# ROBUST (seeds contained in module file)
in_robust_module = ppin_gt.new_vp('boolean', val=False)
for node in module_robust.vertex:
    in_robust_module[node_name_to_id[node]] = True
ppin_gt.vp['in_robust_module'] = in_robust_module
module_robust_gt = gt.GraphView(ppin_gt, vfilt=ppin_gt.vp['in_robust_module'])

### Functions to compute module statistics with NetworkX and graph-tool

In [18]:
def nx_summary_stats(module: nx.Graph):
    """
    Compute summary and average node-level statistics for a NetworkX subgraph (module).
    """
    n_nodes = module.number_of_nodes()
    n_edges = module.number_of_edges()
    density = nx.density(module)
    n_components = nx.number_connected_components(module)

    # Node-level averages
    avg_degree = sum(dict(module.degree()).values()) / n_nodes if n_nodes > 0 else 0
    betweenness = nx.betweenness_centrality(module) if n_nodes > 0 else {}
    avg_betweenness = sum(betweenness.values()) / n_nodes if n_nodes > 0 else 0

    return {
        "n_nodes": n_nodes,
        "n_edges": n_edges,
        "density": density,
        "n_components": n_components,
        "avg_degree": avg_degree,
        "avg_betweenness": avg_betweenness
    }

In [19]:
def gt_summary_stats(module):
    """
    Compute summary and average node-level statistics for a graph-tool subgraph (module).
    """
    n_nodes = module.num_vertices()
    n_edges = module.num_edges()
    density = (2 * n_edges) / (n_nodes * (n_nodes - 1)) if n_nodes > 1 else 0

    # Connected components
    comp, hist = gtt.label_components(module)
    n_components = len(hist)

    # Node-level averages
    degrees = [v.out_degree() for v in module.vertices()]
    avg_degree = sum(degrees) / n_nodes if n_nodes > 0 else 0

    if n_nodes > 0:
        vb, _ = gtc.betweenness(module)
        avg_betweenness = sum(vb[v] for v in module.vertices()) / n_nodes
    else:
        avg_betweenness = 0

    return {
        "n_nodes": n_nodes,
        "n_edges": n_edges,
        "density": density,
        "n_components": n_components,
        "avg_degree": avg_degree,
        "avg_betweenness": avg_betweenness,
    }

### Compute summary statistics for ROBUST and DIAMOnD modules with graph-tool and NetworkX

In [20]:
nx_summary_stats(module_robust_nx)

{'n_nodes': 74,
 'n_edges': 136,
 'density': 0.050351721584598295,
 'n_components': 1,
 'avg_degree': 3.675675675675676,
 'avg_betweenness': 0.03765066436299313}

In [21]:
nx_summary_stats(module_diamond_nx)

{'n_nodes': 74,
 'n_edges': 355,
 'density': 0.1314328026656794,
 'n_components': 11,
 'avg_degree': 9.594594594594595,
 'avg_betweenness': 0.010109424492986135}

In [22]:
gt_summary_stats(module_robust_gt)

{'n_nodes': 74,
 'n_edges': 136,
 'density': 0.050351721584598295,
 'n_components': 1,
 'avg_degree': 3.675675675675676,
 'avg_betweenness': 0.03765066436299313}

In [23]:
gt_summary_stats(module_diamond_gt)

{'n_nodes': 74,
 'n_edges': 355,
 'density': 0.1314328026656794,
 'n_components': 11,
 'avg_degree': 9.594594594594595,
 'avg_betweenness': 0.010109424492986135}

## Task 3: Prioritize drugs based on DIAMOnD and ROBUST modules

- Load the drug-protein interaction data from `data/NeDrex_api/pdi_solutions.csv` and map it to the PPI network.
- Ensure to add only those drug-protein interactions where the involved protein is contained in the PPI network.
- Run TrustRank (a.k.a. personalized PageRank) on the integrated protein-protein-drug interaction network to prioritize drugs targeting the ROBUST and DIAMOnD disease modules.
- Display the top 10 ranked drugs for both the ROBUST and the DIAMOnD module.
- Provide both NetworkX and graph-tool implementations. Measure runtimes of both implementations.

## Solution to Task 3

### Preparation: load PDI data into pandas data frame

In [25]:
pdi = pd.read_csv('../data/NeDRex_api/pdi_solution.csv', header=None)
new_edges = [(pdi.loc[i,0],pdi.loc[i,1]) for i in range(pdi.shape[0]) if ppin_nx.has_node(pdi.loc[i,1])]
new_nodes = set(pdi.loc[i,0] for i in range(pdi.shape[0]) if ppin_nx.has_node(pdi.loc[i,1]))

### NetworkX implementation

#### Map drug-protein interactions on PPI network

In [26]:
ppi_pdi_nx = ppin_nx.copy()
nx.set_node_attributes(ppi_pdi_nx, 'protein', name='node_type')
ppi_pdi_nx.add_nodes_from(new_nodes, node_type='drug', in_robust_module=False, in_diamond_module=False)
ppi_pdi_nx.add_edges_from(new_edges)

#### Run personalized PageRank for ROBUST and DIAMOnD modules

In [27]:
%%timeit
personalization_robust = {n: 1.0 if d['in_robust_module'] else 0 for n, d in ppi_pdi_nx.nodes(data=True)}
pageranks_robust = nx.pagerank(ppi_pdi_nx, personalization=personalization_robust)
ranked_drugs_robust = sorted([(n, pageranks_robust[n]) for n, d in ppi_pdi_nx.nodes(data=True) if d['node_type'] == 'drug'], 
                             key = lambda t: t[1], reverse=True)

236 ms ± 8.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%%timeit
personalization_diamond = {n: 1.0 if d['in_diamond_module'] else 0 for n, d in ppi_pdi_nx.nodes(data=True)}
pageranks_diamond = nx.pagerank(ppi_pdi_nx, personalization=personalization_diamond)
ranked_drugs_diamond = sorted([(n, pageranks_diamond[n]) for n, d in ppi_pdi_nx.nodes(data=True) if d['node_type'] == 'drug'], 
                             key = lambda t: t[1], reverse=True)

258 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Display top 10 drugs

In [30]:
personalization_robust = {n: 1.0 if d['in_robust_module'] else 0 for n, d in ppi_pdi_nx.nodes(data=True)}
pageranks_robust = nx.pagerank(ppi_pdi_nx, personalization=personalization_robust)
ranked_drugs_robust = sorted([(n, pageranks_robust[n]) for n, d in ppi_pdi_nx.nodes(data=True) if d['node_type'] == 'drug'], 
                             key = lambda t: t[1], reverse=True)
ranked_drugs_robust[:10]

[('DB04216', 0.0007367139324149454),
 ('DB00470', 0.0005343282473511509),
 ('DB12010', 0.0004966453583176836),
 ('DB07159', 0.0004966453583176836),
 ('DB01268', 0.0004656184358213586),
 ('DB12500', 0.0004652126555326274),
 ('DB06595', 0.00044228393563049866),
 ('DB00675', 0.00042260207508462423),
 ('DB13245', 0.0004055912971352364),
 ('DB04224', 0.0003897511494595751)]

In [31]:
personalization_diamond = {n: 1.0 if d['in_diamond_module'] else 0 for n, d in ppi_pdi_nx.nodes(data=True)}
pageranks_diamond = nx.pagerank(ppi_pdi_nx, personalization=personalization_diamond)
ranked_drugs_diamond = sorted([(n, pageranks_diamond[n]) for n, d in ppi_pdi_nx.nodes(data=True) if d['node_type'] == 'drug'], 
                             key = lambda t: t[1], reverse=True)
ranked_drugs_diamond[:10]

[('DB00331', 0.0011233926601213102),
 ('DB04216', 0.0006979613324682927),
 ('DB12141', 0.0005517161758927912),
 ('DB00470', 0.0005220970274248953),
 ('DB12010', 0.0004724130687235943),
 ('DB07159', 0.0004724130687235943),
 ('DB12500', 0.0004330558071265671),
 ('DB01268', 0.00043204161126603527),
 ('DB13245', 0.0004003914779791134),
 ('DB04224', 0.0003887322803207069)]

### graph-tool implementation

#### Map drug-protein interactions on PPI network

In [32]:
ppi_pdi_gt = ppin_gt.copy()
ppi_pdi_gt.vp['node_type'] = ppi_pdi_gt.new_vp('string', val='protein')

In [33]:
for drug in new_nodes:
    node = ppi_pdi_gt.add_vertex()
    ppi_pdi_gt.vp['name'][node] = drug
    ppi_pdi_gt.vp['node_type'][node] = 'drug'
    node_name_to_id[drug] = node
ppi_pdi_gt.add_edge_list([(node_name_to_id[p], node_name_to_id[d]) for p, d in new_edges])

#### Run personalized PageRank for ROBUST and DIAMOnD modules

In [34]:
%%timeit 
pagerank_robust_gt = gtc.pagerank(ppi_pdi_gt, pers=ppi_pdi_gt.vp['in_robust_module'])
ranked_drugs_robust_gt = sorted([(ppi_pdi_gt.vp['name'][i], pagerank_robust_gt[i]) for i in ppi_pdi_gt.vertices() if ppi_pdi_gt.vp['node_type'][i] == 'drug'], 
                                key = lambda t: t[1], reverse=True)

97.4 ms ± 3.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
%%timeit
pagerank_diamond_gt = gtc.pagerank(ppi_pdi_gt, pers=ppi_pdi_gt.vp['in_diamond_module'])
ranked_drugs_diamond_gt = sorted([(ppi_pdi_gt.vp['name'][i], pagerank_diamond_gt[i]) for i in ppi_pdi_gt.vertices() if ppi_pdi_gt.vp['node_type'][i] == 'drug'], 
                                key = lambda t: t[1], reverse=True)

97.4 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Display top 10 drugs

In [36]:
pagerank_robust_gt = gtc.pagerank(ppi_pdi_gt, pers=ppi_pdi_gt.vp['in_robust_module'])
ranked_drugs_robust_gt = sorted([(ppi_pdi_gt.vp['name'][i], pagerank_robust_gt[i]) for i in ppi_pdi_gt.vertices() if ppi_pdi_gt.vp['node_type'][i] == 'drug'], 
                                key = lambda t: t[1], reverse=True)
ranked_drugs_robust_gt[:10]

[('DB04216', 0.05331176498058417),
 ('DB00470', 0.03933283094432876),
 ('DB12010', 0.03532142252000088),
 ('DB07159', 0.03532142252000088),
 ('DB12500', 0.03296728451957242),
 ('DB01268', 0.03296352208226907),
 ('DB06595', 0.03163095333090963),
 ('DB00675', 0.030237125140718142),
 ('DB13245', 0.029875626046104657),
 ('DB04224', 0.028681346707987266)]

In [37]:
pagerank_diamond_gt = gtc.pagerank(ppi_pdi_gt, pers=ppi_pdi_gt.vp['in_diamond_module'])
ranked_drugs_diamond_gt = sorted([(ppi_pdi_gt.vp['name'][i], pagerank_diamond_gt[i]) for i in ppi_pdi_gt.vertices() if ppi_pdi_gt.vp['node_type'][i] == 'drug'], 
                                key = lambda t: t[1], reverse=True)
ranked_drugs_diamond_gt[:10]

[('DB00331', 0.08394478643897961),
 ('DB04216', 0.050357820748415966),
 ('DB12141', 0.040462962252173426),
 ('DB00470', 0.03838638455210808),
 ('DB12010', 0.03367805736361676),
 ('DB07159', 0.03367805736361676),
 ('DB12500', 0.030737825370089036),
 ('DB01268', 0.030608772968995293),
 ('DB13245', 0.029504331703495642),
 ('DB04224', 0.028590544679873566)]