Phenotype Synergy Analysis

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
# Put the sibling ../python directory on the import path (presumably where the
# local mf, mf_random and hpoutil modules imported below live). The membership
# check keeps repeated runs of this cell from adding duplicate entries.
mf_module_path = os.path.abspath('../python')
if sys.path.count(mf_module_path) == 0:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
import pickle

In [None]:
# Build the ontology helper from a local hp.obo file.
# NOTE(review): absolute path into one user's home directory; the notebook will
# not run on another machine without editing it.
hpo = hpoutil.HPO('/Users/zhangx/git/human-phenotype-ontology/hp.obo')
# One boolean per row of `data`: the result of hpo.has_dependency(P1, P2).
# NOTE(review): `data` is first assigned in a later cell, so on a fresh
# top-to-bottom run this statement raises NameError; it only works when the
# cell is executed after `data` exists.
mask = np.array([
    hpo.has_dependency(data.P1[row], data.P2[row])
    for row in np.arange(data.shape[0])
])

In [3]:
# Unpickle the precomputed synergy objects; later cells index the result by
# string keys such as '428'.
# NOTE(review): pickle.load can execute arbitrary code, so only load files from
# a trusted source.
with open('synergies.obj', mode='rb') as synergies_file:
    deserialized = pickle.load(synergies_file)

In [4]:
# Number of entries in the unpickled container.
len(deserialized)

31

Plot the synergy scores. They are all pretty small.

In [None]:
# Histogram (10 bins) of every pairwise synergy score for entry '428'.
heart_failure = deserialized['428']
heart_failure_scores = heart_failure.pairwise_synergy().flat
plt.hist(heart_failure_scores, bins=10)
plt.show()

In [None]:
# Load the p-values estimated from empirical distributions for entry '428'.
# NOTE(review): absolute path into one user's home directory; the notebook will
# not run on another machine without editing it.
path = '/Users/zhangx/git/MIMIC_HPO/src/main/resources/p_value_map_428.obj'
with open(path, mode='rb') as f:
    p_value_map = pickle.load(f)
p_heart_failure = p_value_map['428']

In [None]:
# Pairwise synergy table for heart failure, built together with the p-values
# loaded above.
data = heart_failure.pairwise_synergy_labeled_with_p_values(p_heart_failure)
# Preview the first five rows with a fresh 0..n-1 index; `data` itself is not modified.
data.reset_index(drop=True).head(5)

Remove duplicate records: the pair (HP1, HP2) is the same as (HP2, HP1), so only one ordering of each pair is kept.

In [None]:
# Rank the heart-failure phenotype pairs by synergy and keep the top 5 %.
#
# The mask is rebuilt here from the current `data` so that it always matches
# the frame it filters.
# Rows where hpo.has_dependency(P1, P2) is true are dropped (presumably pairs
# of related ontology terms -- confirm against hpoutil).
mask = np.array([hpo.has_dependency(data.P1[i], data.P2[i]) for i in np.arange(data.shape[0])])
S_heart_failure = data.loc[np.logical_not(mask), :].sort_values(by='synergy', ascending=False)
# (P1, P2) and (P2, P1) describe the same pair; keep one ordering only.
S_heart_failure = S_heart_failure.loc[S_heart_failure.P1 < S_heart_failure.P2, :]

# Number of rows in the top 5 % (truncated to an integer when slicing).
top5percent = S_heart_failure.shape[0] * 0.05
# Take an explicit copy so the label columns are added to an independent frame.
top5percent_synergy_pair_heart_failure = (
    S_heart_failure.reset_index(drop=True).iloc[0:int(top5percent), :].copy()
)

# Look the id -> name mapping up once instead of once per row.
# Assumes term_id2name_map() returns the same mapping on every call.
term_names = hpo.term_id2name_map()
top5percent_synergy_pair_heart_failure['P1_label'] = [
    term_names.get(termid) for termid in top5percent_synergy_pair_heart_failure.P1
]
top5percent_synergy_pair_heart_failure['P2_label'] = [
    term_names.get(termid) for termid in top5percent_synergy_pair_heart_failure.P2
]
top5percent_synergy_pair_heart_failure.head(20)

In [None]:
# Write the heart-failure top-5 % table to the working directory, without the index column.
top5percent_synergy_pair_heart_failure.to_csv(
    'top5percent_synergy_pair_heart_failure.csv',
    index=False,
)

In [None]:
# Histogram (10 bins) of every pairwise synergy score for entry '584'.
acute_renal_failure = deserialized['584']
acute_renal_failure_scores = acute_renal_failure.pairwise_synergy().flat
plt.hist(acute_renal_failure_scores, bins=10)
plt.show()

In [None]:
# Load the p-values estimated from empirical distributions for entry '584'.
# NOTE(review): absolute path into one user's home directory; the notebook will
# not run on another machine without editing it.
path = '/Users/zhangx/git/MIMIC_HPO/src/main/resources/p_value_map_584.obj'
with open(path, mode='rb') as f:
    p_value_map = pickle.load(f)
p_renal_failure = p_value_map['584']

# Pairwise synergy table for acute renal failure, built together with those
# p-values. This rebinds `data`, replacing the previous disease's table.
data = acute_renal_failure.pairwise_synergy_labeled_with_p_values(p_renal_failure)
# Preview the first five rows with a fresh 0..n-1 index; `data` itself is not modified.
data.reset_index(drop=True).head(5)

In [None]:
# Rank the acute-renal-failure phenotype pairs by synergy and keep the top 5 %.
#
# The table displayed at the end of this cell was previously never built, so it
# is constructed here from the acute-renal-failure `data` using the same steps
# as the sepsis analysis.
# Rows where hpo.has_dependency(P1, P2) is true are dropped (presumably pairs
# of related ontology terms -- confirm against hpoutil).
mask = np.array([hpo.has_dependency(data.P1[i], data.P2[i]) for i in np.arange(data.shape[0])])
S_acute_renal_failure = data.loc[np.logical_not(mask), :].sort_values(by='synergy', ascending=False)
# (P1, P2) and (P2, P1) describe the same pair; keep one ordering only.
S_acute_renal_failure = S_acute_renal_failure.loc[
    S_acute_renal_failure.P1 < S_acute_renal_failure.P2, :
]

# Number of rows in the top 5 % (truncated to an integer when slicing).
top5percent = S_acute_renal_failure.shape[0] * 0.05
# Take an explicit copy so the label columns are added to an independent frame.
top5percent_synergy_pair_acute_renal_failure = (
    S_acute_renal_failure.reset_index(drop=True).iloc[0:int(top5percent), :].copy()
)

# Look the id -> name mapping up once instead of once per row.
# Assumes term_id2name_map() returns the same mapping on every call.
term_names = hpo.term_id2name_map()
top5percent_synergy_pair_acute_renal_failure['P1_label'] = [
    term_names.get(termid) for termid in top5percent_synergy_pair_acute_renal_failure.P1
]
top5percent_synergy_pair_acute_renal_failure['P2_label'] = [
    term_names.get(termid) for termid in top5percent_synergy_pair_acute_renal_failure.P2
]
top5percent_synergy_pair_acute_renal_failure.head(20)

In [None]:
# Write the acute-renal-failure top-5 % table to the working directory, without the index column.
top5percent_synergy_pair_acute_renal_failure.to_csv(
    'top5percent_synergy_pair_acute_renal_failure.csv',
    index=False,
)

In [None]:
# Histogram (10 bins) of every pairwise synergy score for entry '038'.
sepsis = deserialized['038']
sepsis_scores = sepsis.pairwise_synergy().flat
plt.hist(sepsis_scores, bins=10)
plt.show()

In [None]:
# Load the p-values estimated from empirical distributions for entry '038'.
# NOTE(review): absolute path into one user's home directory; the notebook will
# not run on another machine without editing it.
path = '/Users/zhangx/git/MIMIC_HPO/src/main/resources/p_value_map_038.obj'
with open(path, mode='rb') as f:
    p_value_map = pickle.load(f)
p_sepsis = p_value_map['038']

# Pairwise synergy table for sepsis, built together with those p-values.
# This rebinds `data`, replacing the previous disease's table.
data = sepsis.pairwise_synergy_labeled_with_p_values(p_sepsis)
# Preview the first five rows with a fresh 0..n-1 index; `data` itself is not modified.
data.reset_index(drop=True).head(5)

In [None]:
# Rank the sepsis phenotype pairs by synergy and keep the top 5 %.
#
# Rows where hpo.has_dependency(P1, P2) is true are dropped (presumably pairs
# of related ontology terms -- confirm against hpoutil).
mask = np.array([hpo.has_dependency(data.P1[i], data.P2[i]) for i in np.arange(data.shape[0])])
S_sepsis = data.loc[np.logical_not(mask), :].sort_values(by='synergy', ascending=False)
# (P1, P2) and (P2, P1) describe the same pair; keep one ordering only.
S_sepsis = S_sepsis.loc[S_sepsis.P1 < S_sepsis.P2, :]

# Number of rows in the top 5 % (truncated to an integer when slicing).
top5percent = S_sepsis.shape[0] * 0.05
# Take an explicit copy so the label columns are added to an independent frame.
top5percent_synergy_pair_sepsis = (
    S_sepsis.reset_index(drop=True).iloc[0:int(top5percent), :].copy()
)

# Look the id -> name mapping up once instead of once per row.
# Assumes term_id2name_map() returns the same mapping on every call.
term_names = hpo.term_id2name_map()
top5percent_synergy_pair_sepsis['P1_label'] = [
    term_names.get(termid) for termid in top5percent_synergy_pair_sepsis.P1
]
top5percent_synergy_pair_sepsis['P2_label'] = [
    term_names.get(termid) for termid in top5percent_synergy_pair_sepsis.P2
]



In [None]:
# Show up to the first 100 rows of the sepsis top-5 % table.
top5percent_synergy_pair_sepsis.head(100)

In [None]:
# Write the sepsis top-5 % table to the working directory, without the index column.
top5percent_synergy_pair_sepsis.to_csv(
    'top5percent_synergy_pair_sepsis.csv',
    index=False,
)