# Phenotype Synergy Analysis

This notebook contains code to interpret the results of the synergy score analysis.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
import pickle

In [None]:
# Location of the HPO ontology file. The default is the original author's local
# checkout; set the HPO_OBO_PATH environment variable to point somewhere else.
hpo_obo_path = os.environ.get('HPO_OBO_PATH', '/Users/zhangx/git/human-phenotype-ontology/hp.obo')
hpo = hpoutil.HPO(hpo_obo_path)

## Synergy among Lab-derived Abnormal Phenotypes

In [None]:
# Load the pickled synergy results from the working directory.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('synergies.obj', 'rb') as pickled_synergies:
    deserialized = pickle.load(pickled_synergies)

In [None]:
# Number of entries in the loaded collection (indexed below by codes such as '428').
len(deserialized)

Plot the synergy scores. They are all pretty small.

In [None]:
# Histogram of every entry of the pairwise synergy array for diagnosis code '428'.
heart_failure = deserialized['428']
plt.hist(heart_failure.pairwise_synergy().flat, bins=10)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('Synergy score')
plt.ylabel('Count')
plt.title('Pairwise synergy scores: heart failure (428)')
plt.show()

In [None]:
# Empirically estimated p values for diagnosis code '428'.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
path = '/Users/zhangx/git/MIMIC_HPO/src/main/resources/p_value_map_428.obj'
with open(path, 'rb') as p_value_file:
    p_value_map = pickle.load(p_value_file)
p_heart_failure = p_value_map['428']

In [None]:
# Build the labeled synergy table with the p values attached; the cells below
# read its P1, P2 and synergy columns.
data = heart_failure.pairwise_synergy_labeled_with_p_values(p_heart_failure)
# Display only: the re-indexed frame is not assigned, so `data` keeps its own index.
data.reset_index(drop=True).head()

Remove duplicate records: the pair (HP1, HP2) is the same as (HP2, HP1), so only one ordering is kept.

In [None]:
# Flag pairs that hpo.has_dependency reports as dependent terms. The columns are
# walked positionally with zip: data.P1[i] is a label lookup and only matches the
# row position when the index happens to be exactly 0..n-1.
mask = np.array([hpo.has_dependency(p1, p2) for p1, p2 in zip(data.P1, data.P2)], dtype=bool)
S_heart_failure = data.loc[np.logical_not(mask), :].sort_values(by='synergy', ascending=False)
# (HP1, HP2) and (HP2, HP1) are the same pair, so keep a single ordering. The
# explicit copy gives the label columns below an independent frame to land on.
S_heart_failure = S_heart_failure.loc[S_heart_failure.P1 < S_heart_failure.P2, :].copy()
# Fetch the term id -> name map once rather than once per row.
term_names = hpo.term_id2name_map()
S_heart_failure['P1_label'] = np.array([term_names.get(termid) for termid in S_heart_failure.P1])
S_heart_failure['P2_label'] = np.array([term_names.get(termid) for termid in S_heart_failure.P2])
# Top 5% of the remaining pairs by synergy, rounded up.
top5percent_synergy_pair_heart_failure = S_heart_failure.iloc[0:math.ceil(0.05 * len(S_heart_failure)), :]
top5percent_synergy_pair_heart_failure.head(2)

In [None]:
# Write the top-5% heart-failure pairs to a CSV in the working directory, without the index column.
top5percent_synergy_pair_heart_failure.to_csv('top5percent_synergy_pair_heart_failure.csv', index=False)

In [None]:
# Histogram of every entry of the pairwise synergy array for diagnosis code '584'.
acute_renal_failure = deserialized['584']
plt.hist(acute_renal_failure.pairwise_synergy().flat, bins=10)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('Synergy score')
plt.ylabel('Count')
plt.title('Pairwise synergy scores: acute renal failure (584)')
plt.show()

In [None]:
# Empirically estimated p values for diagnosis code '584'.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
path = '/Users/zhangx/git/MIMIC_HPO/src/main/resources/p_value_map_584.obj'
with open(path, 'rb') as p_value_file:
    p_value_map = pickle.load(p_value_file)
p_renal_failure = p_value_map['584']
# Labeled synergy table with the p values attached.
data = acute_renal_failure.pairwise_synergy_labeled_with_p_values(p_renal_failure)
# Display only: the re-indexed frame is not assigned, so `data` keeps its own index.
data.reset_index(drop=True).head()

In [None]:
# Flag pairs that hpo.has_dependency reports as dependent terms. The columns are
# walked positionally with zip: data.P1[i] is a label lookup and only matches the
# row position when the index happens to be exactly 0..n-1.
mask = np.array([hpo.has_dependency(p1, p2) for p1, p2 in zip(data.P1, data.P2)], dtype=bool)
S_acute_renal_failure = data.loc[np.logical_not(mask), :].sort_values(by='synergy', ascending=False)
# (HP1, HP2) and (HP2, HP1) are the same pair, so keep a single ordering. The
# explicit copy gives the label columns below an independent frame to land on.
S_acute_renal_failure = S_acute_renal_failure.loc[S_acute_renal_failure.P1 < S_acute_renal_failure.P2, :].copy()
# Fetch the term id -> name map once rather than once per row.
term_names = hpo.term_id2name_map()
S_acute_renal_failure['P1_label'] = np.array([term_names.get(termid) for termid in S_acute_renal_failure.P1])
S_acute_renal_failure['P2_label'] = np.array([term_names.get(termid) for termid in S_acute_renal_failure.P2])
# Top 5% of the remaining pairs by synergy, rounded up.
top5percent_synergy_pair_acute_renal_failure = S_acute_renal_failure.iloc[0:math.ceil(0.05 * len(S_acute_renal_failure)), :]
top5percent_synergy_pair_acute_renal_failure.head(20)

In [None]:
# Write the top-5% acute-renal-failure pairs to a CSV in the working directory, without the index column.
top5percent_synergy_pair_acute_renal_failure.to_csv('top5percent_synergy_pair_acute_renal_failure.csv', index=False)

In [None]:
# Histogram of every entry of the pairwise synergy array for diagnosis code '038'.
sepsis = deserialized['038']
plt.hist(sepsis.pairwise_synergy().flat, bins=10)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('Synergy score')
plt.ylabel('Count')
plt.title('Pairwise synergy scores: sepsis (038)')
plt.show()

In [None]:
# Empirically estimated p values for diagnosis code '038'.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
path = '/Users/zhangx/git/MIMIC_HPO/src/main/resources/p_value_map_038.obj'
with open(path, 'rb') as p_value_file:
    p_value_map = pickle.load(p_value_file)
p_sepsis = p_value_map['038']
# Labeled synergy table with the p values attached.
data = sepsis.pairwise_synergy_labeled_with_p_values(p_sepsis)
# Display only: the re-indexed frame is not assigned, so `data` keeps its own index.
data.reset_index(drop=True).head()

In [None]:
# Flag pairs that hpo.has_dependency reports as dependent terms. The columns are
# walked positionally with zip: data.P1[i] is a label lookup and only matches the
# row position when the index happens to be exactly 0..n-1.
mask = np.array([hpo.has_dependency(p1, p2) for p1, p2 in zip(data.P1, data.P2)], dtype=bool)
S_sepsis = data.loc[np.logical_not(mask), :].sort_values(by='synergy', ascending=False)
# (HP1, HP2) and (HP2, HP1) are the same pair, so keep a single ordering.
S_sepsis = S_sepsis.loc[S_sepsis.P1 < S_sepsis.P2, :]
top5percent = S_sepsis.shape[0] * 0.05
# Round the cut-off up with math.ceil, as the heart-failure and renal-failure
# cells do; int() truncated and returned nothing for fewer than 20 pairs.
# The explicit copy gives the label columns below an independent frame to land on.
top5percent_synergy_pair_sepsis = S_sepsis.reset_index(drop=True).iloc[0:math.ceil(top5percent), :].copy()
# Fetch the term id -> name map once rather than once per row.
term_names = hpo.term_id2name_map()
top5percent_synergy_pair_sepsis['P1_label'] = np.array([term_names.get(termid) for termid in top5percent_synergy_pair_sepsis.P1])
top5percent_synergy_pair_sepsis['P2_label'] = np.array([term_names.get(termid) for termid in top5percent_synergy_pair_sepsis.P2])



In [None]:
# Show up to the first 100 of the selected sepsis pairs.
top5percent_synergy_pair_sepsis.head(n = 100)

In [None]:
# Write the selected sepsis pairs to a CSV in the working directory, without the index column.
top5percent_synergy_pair_sepsis.to_csv('top5percent_synergy_pair_sepsis.csv', index=False)

## Synergy between Radiology- and Lab-derived Abnormal Phenotypes

In [None]:
# Load the pickled radiology/lab synergy results from the working directory.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('synergies_radiology_lab_primary_and_secondary.obj', 'rb') as pickled_synergies:
    synergies_rad_lab = pickle.load(pickled_synergies)

In [None]:
# Number of entries in the loaded radiology/lab synergy collection.
len(synergies_rad_lab)

In [None]:
def load_p_values(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened in binary mode and closed before returning.
    NOTE(review): pickle can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def filtered_synergy_dataframe(synergy, p_values=None, percentile_cut=None):
    """Rank the phenotype pairs of one synergy result and attach labels and mutual information.

    Uses the notebook-level ``hpo`` object for the dependency check and for
    term names.

    Parameters
    ----------
    synergy : object
        Must provide ``pairwise_synergy_labeled()`` (or
        ``pairwise_synergy_labeled_with_p_values(p_values)``) returning a
        DataFrame with ``P1``, ``P2`` and ``synergy`` columns, plus
        ``mutual_information()`` and a ``vars_labels`` mapping.
    p_values : optional
        When not None, passed to ``pairwise_synergy_labeled_with_p_values``.
    percentile_cut : number, optional
        Keep the top ``percentile_cut`` percent of pairs by synergy (count
        rounded up). ``None`` keeps every pair.

    Returns
    -------
    pandas.DataFrame
        The selected pairs with ``P1_radiology_label`` / ``P2_lab_label`` name
        columns and the merged ``mf_P1``, ``mf_P2`` and ``mf_d_P1P2`` columns.
    """
    if p_values is not None:
        data = synergy.pairwise_synergy_labeled_with_p_values(p_values)
    else:
        data = synergy.pairwise_synergy_labeled()

    # Remove directly dependent terms. The columns are walked positionally with
    # zip: data.P1[i] is a label lookup and only matches the row position when
    # the index happens to be exactly 0..n-1. dtype=bool keeps an empty input valid.
    dependent = np.array(
        [hpo.has_dependency(p1, p2) for p1, p2 in zip(data.P1, data.P2)],
        dtype=bool,
    )
    data_filtered = data.loc[~dependent, :].sort_values(by='synergy', ascending=False)
    # Keep a single ordering of each pair. The explicit copy gives the label
    # columns below an independent frame to land on.
    data_filtered = data_filtered.loc[data_filtered.P1 < data_filtered.P2, :].copy()

    # Fetch the term id -> name map once rather than once per row.
    term_names = hpo.term_id2name_map()
    data_filtered['P1_radiology_label'] = np.array([term_names.get(termid) for termid in data_filtered.P1])
    data_filtered['P2_lab_label'] = np.array([term_names.get(termid) for termid in data_filtered.P2])

    # percentile_cut is expressed in percent; None means "keep everything".
    fraction = 1 if percentile_cut is None else percentile_cut / 100
    n_keep = math.ceil(fraction * len(data_filtered))
    top_percentile = data_filtered.iloc[0:n_keep, :]

    # mutual_information() yields a mapping with two per-variable arrays (taken
    # in the mapping's iteration order) and a pairwise array.
    single_mi, joint_mi = synergy.mutual_information()
    mi_p1, mi_p2 = single_mi.values()
    p1_labels, p2_labels = synergy.vars_labels.values()
    mf_P1 = pd.DataFrame(data={'P1': p1_labels, 'mf_P1': mi_p1})
    mf_P2 = pd.DataFrame(data={'P2': p2_labels, 'mf_P2': mi_p2})
    # Flatten the pairwise array row by row so that entry (i, j) lines up with
    # (p1_labels[i], p2_labels[j]) from the repeat / tile below.
    mf_d_P1P2 = pd.DataFrame(data={
        'P1': np.repeat(p1_labels, len(p2_labels)),
        'P2': np.tile(p2_labels, [len(p1_labels)]),
        'mf_d_P1P2': np.asarray(joint_mi).ravel(),
    })
    fully_labeled = (
        top_percentile
        .merge(mf_P1, on='P1')
        .merge(mf_P2, on='P2')
        .merge(mf_d_P1P2, on=['P1', 'P2'])
    )

    return fully_labeled

In [None]:
#p_values = load_p_values('p_value_428.obj')
# filtered_synergy_dataframe takes one synergy object and has no icd / icd_label
# parameters (passing them raises TypeError), so select the '428' entry here.
# NOTE(review): assumes synergies_rad_lab is keyed by diagnosis code like the
# other pickled collections in this notebook - confirm.
filtered_data = filtered_synergy_dataframe(synergies_rad_lab['428'], p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology_labtest-428_primary_and_secondary.csv', index=False)

In [None]:
#p_value = load_p_values('p_value_584.obj')
# filtered_synergy_dataframe takes one synergy object and has no icd / icd_label
# parameters (passing them raises TypeError), so select the '584' entry here.
# NOTE(review): assumes synergies_rad_lab is keyed by diagnosis code like the
# other pickled collections in this notebook - confirm.
filtered_data = filtered_synergy_dataframe(synergies_rad_lab['584'], p_values=None, percentile_cut = 5)
# NOTE(review): this file name uses 'radiology-labtest' while the 428 and 038
# exports use 'radiology_labtest'; left unchanged in case something reads it.
filtered_data.to_csv('synergy-radiology-labtest-584_primary_and_secondary.csv', index=False)

In [None]:
#p_values = load_p_values('p_value_038.obj')
# filtered_synergy_dataframe takes one synergy object and has no icd / icd_label
# parameters (passing them raises TypeError), so select the '038' entry here.
# NOTE(review): assumes synergies_rad_lab is keyed by diagnosis code like the
# other pickled collections in this notebook - confirm.
filtered_data = filtered_synergy_dataframe(synergies_rad_lab['038'], p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology_labtest-038_primary_and_secondary.csv', index=False)

### Just look at primary diagnosis


In [None]:
# Read the table in 'mutual_info_textHpo_labHpo.csv'; it is left-merged on P1/P2
# into the per-diagnosis results in the cells below.
mf_all = pd.read_csv('mutual_info_textHpo_labHpo.csv')
mf_all.head()

In [None]:
# Load the pickled primary-diagnosis-only synergy results from the working directory.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('synergies_radiology_lab_primary_diagnosis_only_corrected.obj', 'rb') as pickled_synergies:
    synergies_rad_lab_primary_only = pickle.load(pickled_synergies)

In [None]:
#p_values = load_p_values('p_value_428_primary_only.obj')
# Top 5% of pairs for code '428', computed without p values.
synergy = synergies_rad_lab_primary_only['428']
filtered_data = filtered_synergy_dataframe(synergy, percentile_cut=5, p_values=None)
# Left merge: every selected pair is kept even if mf_all has no matching row.
include_overall_mf = pd.merge(filtered_data, mf_all, how='left', on=['P1', 'P2'])

In [None]:
# Sort once and reuse the result for both the export and the preview.
include_overall_mf_sorted = include_overall_mf.sort_values(by='synergy', ascending=False)
include_overall_mf_sorted.to_csv('synergy-radiology_labtest_primary_only-428_corrected.csv', index=False)
# The preview must be the last expression of the cell, otherwise it is not displayed.
include_overall_mf_sorted.head(20)

In [None]:
#p_values = load_p_values('p_value_584_primary_only.obj')
# Top 5% of pairs for code '584', computed without p values.
synergy = synergies_rad_lab_primary_only['584']
filtered_data = filtered_synergy_dataframe(synergy, percentile_cut=5, p_values=None)
# Left merge: every selected pair is kept even if mf_all has no matching row.
include_overall_mf = pd.merge(filtered_data, mf_all, how='left', on=['P1', 'P2'])

In [None]:
# Sort once and reuse the result for both the export and the preview.
include_overall_mf_sorted = include_overall_mf.sort_values(by='synergy', ascending=False)
include_overall_mf_sorted.to_csv('synergy-radiology_labtest_primary_only-584_corrected.csv', index=False)
# The preview must be the last expression of the cell, otherwise it is not displayed.
include_overall_mf_sorted.head(20)

In [None]:
#p_values = load_p_values('p_value_038_primary_only.obj')
# Top 5% of pairs for code '038', computed without p values.
synergy = synergies_rad_lab_primary_only['038']
filtered_data = filtered_synergy_dataframe(synergy, percentile_cut=5, p_values=None)
# Left merge: every selected pair is kept even if mf_all has no matching row.
include_overall_mf = pd.merge(filtered_data, mf_all, how='left', on=['P1', 'P2'])

In [None]:
# Sort once and reuse the result for both the export and the preview.
include_overall_mf_sorted = include_overall_mf.sort_values(by='synergy', ascending=False)
include_overall_mf_sorted.to_csv('synergy-radiology_labtest_primary_only-038_corrected.csv', index=False)
# The preview must be the last expression of the cell, otherwise it is not displayed.
include_overall_mf_sorted.head(20)