In [8]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle 
import causallearn as cl
import cdt
import os
import matplotlib.pyplot as plt

This notebook learns a graph for each condition using different structure-learning algorithms. Two types of algorithms are used: score-based greedy algorithms and algorithms based on conditional-independence tests.

# Data Formatting

Select the subsets of the data that include only the genes of interest.

In [26]:
# Directory with the normalized input tables, resolved against the current working directory.
path = "{}/normalized data/".format(os.getcwd())

In [71]:
# Table listing the genes of interest (its 'genes' column is used to filter the data tables).
genes_oi = pd.read_csv(path + 'genes_of_interest.csv')

# Respiratory cohort: data table first, then the matching sample metadata.
resp_df, resp_meta = (
    pd.read_csv(path + csv_name)
    for csv_name in ('respiratory_data.csv', 'respiratory_metadata.csv')
)

# COVID cohort: data table first, then the matching sample metadata.
covid_df, covid_meta = (
    pd.read_csv(path + csv_name)
    for csv_name in ('covid_data.csv', 'covid_metadata.csv')
)

In [80]:
# Keep only the text before the first '-' in every COVID sample identifier.
covid_meta['sample'] = [
    sample_id.partition('-')[0] for sample_id in covid_meta['sample']
]

In [81]:
# Selecting subsets of data for structure learning.
def _expression_by_condition(data_df, meta_df, gene_list):
    """Split a gene-by-sample table into one sample-by-gene table per condition.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table with a 'genes' column plus one column per sample.
    meta_df : pandas.DataFrame
        Sample metadata with 'condition' and 'sample' columns; every value in
        'sample' must be a column of ``data_df``.
    gene_list : list
        Gene identifiers to keep (matched against ``data_df['genes']``).

    Returns
    -------
    dict
        Maps each condition (in ``np.unique`` order) to a DataFrame whose rows
        are that condition's samples and whose columns are the selected genes.

    Raises
    ------
    KeyError
        If a sample listed in ``meta_df`` is not a column of ``data_df``.
    """
    # Filter the genes once, and move them into the index *before* transposing:
    # transposing with the string 'genes' column still present would turn every
    # value into object dtype.
    panel = data_df[data_df['genes'].isin(gene_list)].set_index('genes')

    by_condition = {}
    for cond in np.unique(meta_df['condition']):
        samples = meta_df.loc[meta_df['condition'] == cond, 'sample'].to_list()
        by_condition[cond] = panel[samples].T
    return by_condition


# The gene list is identical for both cohorts, so build it a single time.
genes_oi_list = genes_oi['genes'].to_list()

resp_oi_dict = _expression_by_condition(resp_df, resp_meta, genes_oi_list)
covid_oi_dict = _expression_by_condition(covid_df, covid_meta, genes_oi_list)

# Structure learning - Causal graphs

Two types of algorithms are used to learn causal graphs: constraint-based (relying on conditional-independence tests) and score-based. The constraint-based algorithms are PC, run with default settings and alpha equal to 0.05, and the Grow-Shrink algorithm from the bnlearn package in R, also run with default settings.

The score-based algorithm is GES, run with default settings.

### PC algorithm

In [123]:
from cdt.causality.graph import PC

# One PC graph per condition, learnt with alpha = 0.05.
pc_graphs_dict = {}
obj = PC(alpha = 0.05)

# Respiratory conditions are visited first, then the COVID ones; a name found in
# both dictionaries is resolved to the respiratory table.
for cond in [*resp_oi_dict, *covid_oi_dict]:
    cond_table = resp_oi_dict[cond] if cond in resp_oi_dict else covid_oi_dict[cond]
    learnt_graph = obj.create_graph_from_data(cond_table)
    pc_graphs_dict[cond] = learnt_graph
    print('Graph learnt for {} has {} edges'.format(cond, len(learnt_graph.edges)))

Graph learnt for Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus has 72 edges
Graph learnt for Dengue has 76 edges
Graph learnt for Influenza has 148 edges
Graph learnt for Parainfluenza_RespiratorySyncytial has 78 edges
Graph learnt for Pneumonia has 72 edges
Graph learnt for Rhinovirus has 86 edges
Graph learnt for healthy_ctrl has 70 edges
Graph learnt for Critical has 132 edges
Graph learnt for Non-critical has 102 edges


### Grow-Shrink algorithm

In [127]:
from cdt.causality.graph.bnlearn import GS

# One Grow-Shrink graph per condition, learnt with the estimator's default settings.
gs_graphs_dict = {}
obj = GS()

# Respiratory conditions are visited first, then the COVID ones; a name found in
# both dictionaries is resolved to the respiratory table.
for cond in [*resp_oi_dict, *covid_oi_dict]:
    cond_table = resp_oi_dict[cond] if cond in resp_oi_dict else covid_oi_dict[cond]
    learnt_graph = obj.create_graph_from_data(cond_table)
    gs_graphs_dict[cond] = learnt_graph
    print('Graph learnt for {} has {} edges'.format(cond, len(learnt_graph.edges)))

Graph learnt for Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus has 51 edges
Graph learnt for Dengue has 61 edges
Graph learnt for Influenza has 108 edges
Graph learnt for Parainfluenza_RespiratorySyncytial has 48 edges
Graph learnt for Pneumonia has 68 edges
Graph learnt for Rhinovirus has 79 edges
Graph learnt for healthy_ctrl has 53 edges
Graph learnt for Critical has 110 edges
Graph learnt for Non-critical has 57 edges


### GES algorithm

In [124]:
from cdt.causality.graph import GES

# One GES graph per condition, learnt with score = 'obs'.
ges_graphs_dict = {}
obj = GES(score = 'obs')

# Respiratory conditions are visited first, then the COVID ones; a name found in
# both dictionaries is resolved to the respiratory table.
for cond in [*resp_oi_dict, *covid_oi_dict]:
    cond_table = resp_oi_dict[cond] if cond in resp_oi_dict else covid_oi_dict[cond]
    learnt_graph = obj.create_graph_from_data(cond_table)
    ges_graphs_dict[cond] = learnt_graph
    print('Graph learnt for {} has {} edges'.format(cond, len(learnt_graph.edges)))

Graph learnt for Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus has 1085 edges
Graph learnt for Dengue has 1351 edges
Graph learnt for Influenza has 668 edges
Graph learnt for Parainfluenza_RespiratorySyncytial has 1216 edges
Graph learnt for Pneumonia has 1166 edges
Graph learnt for Rhinovirus has 1460 edges
Graph learnt for healthy_ctrl has 1284 edges
Graph learnt for Critical has 1052 edges
Graph learnt for Non-critical has 1223 edges


In [None]:
# Export the GES graphs as one long edge list: a row per directed edge, tagged
# with the condition it was learnt for. A single sheet is used so that long
# condition names never have to serve as Excel sheet names.
# NOTE(review): the export content is inferred from the output file name
# ('GES_graphs.xlsx') -- confirm this is the intended layout.
ges_edge_records = [
    {'condition': cond, 'source': src, 'target': dst}
    for cond, graph in ges_graphs_dict.items()
    for src, dst in graph.edges
]
ges_edges_df = pd.DataFrame(ges_edge_records, columns=['condition', 'source', 'target'])

# Create the output folder if needed so the writer does not fail on a fresh checkout.
os.makedirs(os.getcwd()+'/causal graphs', exist_ok=True)

# The context manager saves and closes the workbook on exit, even if writing fails.
with pd.ExcelWriter(os.getcwd()+'/causal graphs/GES_graphs.xlsx', engine='xlsxwriter') as writer:
    ges_edges_df.to_excel(writer, sheet_name='Sheet1', index=False)