In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import cdt
import os

No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.


Before comparing graphs, a common algorithm for learning graphs needs to be selected. There can be two ways of doing this - 

1. Contstructing a gene-gene interaction network using KEGG, TRRUST and CTD databases and compare that with the graphs constructed for healthy controls using different algorithms. The algorithm used to learn the graph that in a sense is closest to the CTD graph should be selected

2. Pick the algorithm that makes the most sense or has is the most meaningful in terms of the kind of data for which graphs are being learnt for.

In this script, the focus is on the graphs generated using the PC algorithm, selected using point 2

In [2]:
path = os.getcwd() + '/causal graphs/'
xls = pd.ExcelFile(path + 'PC_graphs.xlsx')

pc_graphs = {}
for sheet in xls.sheet_names:
    adjacency_df = pd.read_excel(path+'PC_graphs.xlsx', sheet_name = sheet)
    adjacency_df.rename(columns = {'Unnamed: 0':'genes'}, inplace = True)
    adjacency_df.index = adjacency_df.iloc[:,0]
    adjacency_df.drop(columns = ['genes'], inplace = True)
    pc_graphs[sheet] = nx.from_pandas_adjacency(adjacency_df, create_using=nx.DiGraph())

## Structural Hamming distance

In this section, structural hamming distance is calculated for all possible pairs of diseases with the emphasis on distance from "healthy" control network

In [3]:
pc_graphs.keys()

dict_keys(['Adenovirus_Simplex_virus', 'Dengue', 'Influenza', 'Paraflu_RespSyncytial', 'Pneumonia', 'Rhinovirus', 'healthy_ctrl', 'Critical', 'Non-critical'])

In [13]:
from cdt.metrics import SHD
from cdt.metrics import SID

hamming_df = pd.DataFrame(columns = list(pc_graphs.keys()), index = list(pc_graphs.keys()))
intervention_df = pd.DataFrame(columns = list(pc_graphs.keys()), index = list(pc_graphs.keys()))

for col in hamming_df.columns.to_list():
    for idx in hamming_df.index.to_list():
        hamming_df.loc[idx, col] = SHD(pc_graphs[idx], pc_graphs[col], \
                                       double_for_anticausal = True)
        intervention_df.loc[idx, col] = SID(pc_graphs[idx], pc_graphs[col])

In [16]:
hamming_df.to_csv(os.getcwd()+'/results/structural_hamming_distance_rowsource.csv')
intervention_df.to_csv(os.getcwd()+'/results/structural_intervention_distance_rowsource.csv')

## Maximum common subnetwork

This section is focused on calculating the largest common subnetwork across all the graphs learnt especially with the graph for healthy controls

In [73]:
def find_common_subnetwork(graph1, graph2):
    if not isinstance(graph1, nx.DiGraph) or not isinstance(graph2, nx.DiGraph):
        print('Function works only for directed graphs')
        return
    
    common_graph = nx.DiGraph()
    for u,v in graph1.edges:
        if graph2.has_edge(u,v):
            common_graph.add_edge(u,v)
    # removing nodes with degree 0
    remove_nodes = [node for node, degree in dict(common_graph.degree).items() if degree == 0]
    common_graph.remove_nodes_from(remove_nodes)
    
    return common_graph

In [89]:
# finding common subnetworks between all possible pairs of conditions

subnetworks = {}
for c1 in pc_graphs.keys():
    subnetworks[c1] = {}
    writer = pd.ExcelWriter(os.getcwd() +'/results/subnetworks/common_with_'+str(c1)+'.xlsx', \
                            engine = 'xlsxwriter')
    for c2 in pc_graphs.keys():
        if c1 == c2:
            continue
        else:
            common_graph = find_common_subnetwork(pc_graphs[c1], pc_graphs[c2])
            subnetworks[c1][c2] = common_graph
            nx.to_pandas_adjacency(common_graph).to_excel(writer, sheet_name = c2)
    writer.save()

In [93]:
# finding common subnetworks for combination of 3 conditions

# Pneumonia + Influenza + Critical
common_PneuInfCrit = find_common_subnetwork(subnetworks['Pneumonia']['Influenza'], pc_graphs['Critical'])

# Pneumonia + Influenza + Non-critical
common_PneuInfNonCrit = find_common_subnetwork(subnetworks['Pneumonia']['Influenza'], pc_graphs['Non-critical'])

# Rhinovirus + Influenza + Critical
common_RhinoInfCrit = find_common_subnetwork(subnetworks['Rhinovirus']['Influenza'], pc_graphs['Critical'])

# Rhinovirus + Influenza + Non-critical
common_RhinoInfNonCrit = find_common_subnetwork(subnetworks['Rhinovirus']['Influenza'], pc_graphs['Non-critical'])

In [94]:
# comparing/quantifying common subnetworks

## Strongly connected components

In this section, largest strongly connected components for each graph corresponding a condition is calculated and evaluated. The evaluation is done in the last section

In [99]:
largest = max(nx.strongly_connected_components(pc_graphs['healthy_ctrl']), key = len)

In [108]:
writer = pd.ExcelWriter(os.getcwd()+'/results/subnetworks/strongly_connected_components.xlsx', \
                        engine = 'xlsxwriter')
conn_comp = {}
for c in pc_graphs.keys():
    l = list(max(nx.strongly_connected_components(pc_graphs[c]), key = len))
    pd.Series(l).to_excel(writer, sheet_name = c, index = False, header = ['genes'])
    conn_comp[c] = l
writer.save()

## Markov blanket of COVID19 drug targets + driver genes

In [128]:
for c in pc_graphs.keys():
    flag = 1
    if len([x for x in driver_genes if x in pc_graphs[c].nodes])!=0:
        print('Condition {} graph contains driver genes'.format(c))
        flag = 0
if flag:
    print('None of the graphs learnt contain driver genes')

None of the graphs learnt contain driver genes


In [153]:
def markov_blanket(graph_oi, node_oi):
    parents = [x for x in graph_oi.predecessors(node_oi)]
    children = [x for x in graph_oi.successors(node_oi)]
    child_parents = []
    if len(children) != 0:
        for child in children:
            child_parents.extend([x for x in graph_oi.predecessors(child)])
    mb = np.unique(parents + children + child_parents)
    return mb

In [165]:
drugs_oi = ['high-dose Vitamin C', 'remdesivir', 'favipiravir', 'adalimumab', 'dihydro-artemisinin piperaquine', \
            'leflunomide', 'dipyridamole', 'chloroquine', 'hydroxychloroquine', 'suramin sodium', 'lopinavir',\
            'ritonavir', 'arbidol', 'umifenovir', 'IFN-alpha 2b','dexamethasone']

## GO/Phenotype of markov blanket & common subnetwork

In [166]:
# loading all datasets
path_ctd = '/Users/shrutikaushal/Desktop/Semester 1/Graphical Models/Final Project/CTD/'
chem2gene = pd.read_csv(path_ctd + 'CTD_chem_gene_ixns.tsv', sep = '\t')
chem2go = pd.read_csv(path_ctd + 'CTD_diseases_pathways.csv')
gene2go = pd.read_csv(path_ctd + 'CTD_genes_pathways.csv')
chem2gene2phene = pd.read_csv(path_ctd + 'CTD_Phenotype-Disease_biological_process_associations.csv')

In [None]:
def generate_wordcloud()

### GO terms for strongly connected components

In [187]:
go_conn_comp = {}
for c in conn_comp.keys():
    go_conn_comp[c] = list(gene2go[gene2go['GeneSymbol'].isin(conn_comp['Pneumonia'])]['PathwayName'])


In [190]:
len(np.unique(go_conn_comp['healthy_ctrl']))

206