# Phenotype Synergy Analysis

This notebook contains code to interprete results from the synergy score analysis. 

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
import pickle

In [10]:
hpo = hpoutil.HPO('/Users/zhangx/git/human-phenotype-ontology/hp.obo')

## Synergy between Radiology- and Lab-derived Abnormal Phenotypes

In [None]:
with open('synergies_radiology_lab_primary_and_secondary.obj', 'rb') as synergies_file:
    synergies_rad_lab = pickle.load(synergies_file)

In [None]:
len(synergies_rad_lab)

In [11]:
def load_p_values(path):
    with open(path, 'rb') as f:
        p = pickle.load(f)
    return p

def filtered_synergy_dataframe(synergy, p_values=None, percentile_cut=None):
    if p_values is not None:
        data = synergy.pairwise_synergy_labeled_with_p_values(p_values)
    else:
        data = synergy.pairwise_synergy_labeled()
    # remove directly dependent terms 
    mask = np.array([hpo.has_dependency(data.P1[i], data.P2[i]) for i in np.arange(data.shape[0])])
    data_filtered = data.loc[np.logical_not(mask), :].sort_values(by = 'synergy', ascending=False)
    data_filtered = data_filtered.loc[data_filtered.P1 < data_filtered.P2, :]
    data_filtered['P1_radiology_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in data_filtered.P1])
    data_filtered['P2_lab_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in data_filtered.P2])
    
    
    if (percentile_cut == None):
        percentile = 1
    else:
        percentile = percentile_cut / 100
    top_percentile = data_filtered.iloc[0:math.ceil(percentile * len(data_filtered)), :]
    
    
    I, II = synergy.mutual_information()
    Ia, Ib = I.values()
    p1_labels, p2_labels = synergy.vars_labels.values()
    mf_P1 = pd.DataFrame(data={'P1': p1_labels, 'mf_d_P1': Ia})
    mf_P2 = pd.DataFrame(data={'P2': p2_labels, 'mf_d_P2': Ib})
    mf_d_P1P2 = pd.DataFrame(data={'P1': np.repeat(p1_labels, len(p2_labels)), 'P2': np.tile(p2_labels, [len(p1_labels)]), 'mf_d_P1P2': II.flat})
    fully_labeled = top_percentile.merge(mf_P1, on='P1').merge(mf_P2, on='P2').merge(mf_d_P1P2, on=['P1', 'P2'])
    
    return fully_labeled

In [None]:
#p_values = load_p_values('p_value_428.obj')
filtered_data = filtered_synergy_dataframe(synergies_rad_lab, icd = '428', icd_label = 'heart_failure', p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology_labtest-428_primary_and_secondary.csv', index=False)

In [None]:
#p_value = load_p_values('p_value_584.obj')
filtered_data = filtered_synergy_dataframe(synergies_rad_lab, icd = '584', icd_label = 'acute_renal_failure', p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology-labtest-584_primary_and_secondary.csv', index=False)

In [None]:
#p_values = load_p_values('p_value_038.obj')
filtered_data = filtered_synergy_dataframe(synergies_rad_lab, icd = '038', icd_label = 'sepsis', p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology_labtest-038_primary_and_secondary.csv', index=False)

### Just look at primary diagnosis


In [3]:
mf_all = pd.read_csv('mutual_info_textHpo_labHpo.csv')
mf_all.head()

Unnamed: 0,P1,P2,entropy_P1,entropy_P2,mf_P1_P2
0,HP:0000001,HP:0000118,0.367357,0.142722,0.011065
1,HP:0000001,HP:0000001,0.367357,0.142722,0.011065
2,HP:0000001,HP:0001939,0.367357,0.180924,0.014722
3,HP:0000001,HP:0001871,0.367357,0.20622,0.019528
4,HP:0000001,HP:0001877,0.367357,0.239249,0.017066


In [4]:
with open('synergies_radiology_lab_primary_only.obj', 'rb') as synergies_file:
    synergies_rad_lab_primary_only = pickle.load(synergies_file)

In [6]:
#p_values = load_p_values('p_value_428_primary_only.obj')
synergy = synergies_rad_lab_primary_only['428']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']


In [9]:
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']
include_overall_mf.head()


Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,RAD_RAD_HP:0002202,Lab_HP:0004363,0.002496,Pleural effusion,Abnormal circulating calcium concentration,0.002937,4.7e-05,0.005479,0.966136,0.869158,0.102352,0.9632,0.869111
1,RAD_RAD_HP:0001640,Lab_HP:0004363,0.001359,Cardiomegaly,Abnormal circulating calcium concentration,0.007596,4.7e-05,0.009002,0.819058,0.869158,0.036363,0.811462,0.869111
2,RAD_RAD_HP:0001635,Lab_HP:0004363,0.001114,Congestive heart failure,Abnormal circulating calcium concentration,0.011296,4.7e-05,0.012457,0.420952,0.869158,0.013604,0.409657,0.869111
3,RAD_RAD_HP:0000969,Lab_HP:0004363,0.000966,Edema,Abnormal circulating calcium concentration,0.001782,4.7e-05,0.002795,0.930309,0.869158,0.07126,0.928527,0.869111
4,RAD_RAD_HP:0002086,Lab_HP:0004363,0.000759,Abnormality of the respiratory system,Abnormal circulating calcium concentration,0.00193,4.7e-05,0.002736,0.671973,0.869158,0.080085,0.670043,0.869111


In [11]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(20)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-428_corrected.csv', index=False)

In [12]:
#p_values = load_p_values('p_value_584_primary_only.obj')
synergy = synergies_rad_lab_primary_only['584']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']

In [13]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(20)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-584_corrected.csv', index=False)

In [38]:
p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_rad_lab_primary_only['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=p_values, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']

In [41]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-038_corrected.csv', index=False)
edges_cyto = include_overall_mf.loc[:, ['P1', 'P2', 'synergy', 'p', 'mf_d_P1_P2', 'mf_P1_P2']]
edges_cyto.to_csv('synergy_radiology_labtest_primary_only-038-edges.csv')
nodes_cyto = include_overall_mf.loc[:, ['P1', 'P2', 'P1_radiology_label', 'P2_lab_label', 'mf_d_P1', 'mf_d_P2', 'entropy_P1', 'entropy_P2', 'entropy_P1_given_d', 'entropy_P2_given_d']]
nodes_cyto_unique = pd.DataFrame(data={'P': np.concatenate((nodes_cyto.P1, nodes_cyto.P2)), \
                                       'source': np.repeat(['RAD', 'LAB'], len(nodes_cyto)), \
                  'P_label': np.concatenate((nodes_cyto.P1_radiology_label, nodes_cyto.P2_lab_label)), \
                  'mf_d_P': np.concatenate((nodes_cyto.mf_d_P1, nodes_cyto.mf_d_P2)), \
                  'entropy': np.concatenate((nodes_cyto.entropy_P1, nodes_cyto.entropy_P2)), \
                  'conditional_entropy': np.concatenate((nodes_cyto.entropy_P1_given_d, nodes_cyto.entropy_P2_given_d))}).drop_duplicates()
nodes_cyto_unique.to_csv('synergy_radiology_labtest_primary_only-038-nodes.csv')

In [42]:
nodes_cyto_unique.head()

Unnamed: 0,P,source,P_label,mf_d_P,entropy,conditional_entropy
0,RAD_HP:0002107,RAD,Pneumothorax,0.0004283993,0.659875,0.659447
1,RAD_HP:0002097,RAD,Emphysema,9.916451e-07,0.376383,0.376382
42,RAD_HP:0001394,RAD,Cirrhosis,0.001630781,0.230345,0.228714
51,RAD_HP:0002170,RAD,Intracranial hemorrhage,0.0004663867,0.71773,0.717264
69,RAD_HP:0000001,RAD,All,0.003630368,0.367357,0.363727


## Synergy among Lab-derived Abnormal Phenotypes

In [50]:
with open('synergies-intra-labHpo-primary_only.obj', 'rb') as synergies_file:
    synergies_intra_labHpo = pickle.load(synergies_file)

In [51]:
#p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_intra_labHpo['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
filtered_data = filtered_data[filtered_data.P1 < filtered_data.P2]
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,HP:0001911,HP:0020064,0.005565,Abnormal granulocyte morphology,Abnormal eosinophil count,0.022505,0.000197,0.028266,0.176359,0.503569,0.000813,0.153855,0.503372
168,HP:0001880,HP:0001911,0.005565,Eosinophilia,Abnormal granulocyte morphology,0.000197,0.022505,0.028266,,,,,
110,HP:0001880,HP:0032309,0.005565,Eosinophilia,Abnormal granulocyte count,0.000197,0.022505,0.028266,,,,,
109,HP:0001879,HP:0032309,0.005565,Abnormal eosinophil morphology,Abnormal granulocyte count,0.000197,0.022505,0.028266,,,,,
169,HP:0002148,HP:0002905,0.004092,Hypophosphatemia,Hyperphosphatemia,0.009924,0.005304,0.01932,,,,,
117,HP:0001880,HP:0010974,0.003704,Eosinophilia,Abnormal myeloid leukocyte morphology,0.000197,0.026392,0.030292,,,,,
116,HP:0001879,HP:0010974,0.003704,Abnormal eosinophil morphology,Abnormal myeloid leukocyte morphology,0.000197,0.026392,0.030292,,,,,
1,HP:0010974,HP:0020064,0.003704,Abnormal myeloid leukocyte morphology,Abnormal eosinophil count,0.026392,0.000197,0.030292,0.183103,0.503569,0.000842,0.156711,0.503372
71,HP:0002904,HP:0011014,0.003596,Hyperbilirubinemia,Abnormal glucose homeostasis,0.00146,0.011067,0.016122,0.085283,0.656426,0.001573,0.083823,0.645359
76,HP:0002904,HP:0011015,0.003596,Hyperbilirubinemia,Abnormal blood glucose concentration,0.00146,0.011067,0.016122,0.085283,0.656426,0.001573,0.083823,0.645359


In [53]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy_intra_labHpo_038_primary_only.csv', index=False)

In [54]:
with open('synergies-intra-textHpo-primary_only.obj', 'rb') as synergies_file:
    synergies_intra_textHpo = pickle.load(synergies_file)

In [55]:
#p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_intra_textHpo['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
filtered_data = filtered_data[filtered_data.P1 < filtered_data.P2]
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,HP:0002107,HP:0002202,0.002421,Pneumothorax,Pleural effusion,0.000428,0.007408,0.010257,,,,,
2,HP:0002107,HP:0100750,0.001727,Pneumothorax,Atelectasis,0.000428,0.003481,0.005637,,,,,
4,HP:0002107,HP:0100806,0.001193,Pneumothorax,Sepsis,0.000428,0.048477,0.050099,,,,,
610,HP:0002090,HP:0002107,0.00116,Pneumonia,Pneumothorax,0.011828,0.000428,0.013417,,,,,
6,HP:0002107,HP:0100598,0.001033,Pneumothorax,Pulmonary edema,0.000428,0.006642,0.008104,,,,,
611,HP:0001640,HP:0002107,0.000739,Cardiomegaly,Pneumothorax,0.003249,0.000428,0.004416,,,,,
612,HP:0000969,HP:0002107,0.000611,Edema,Pneumothorax,0.004273,0.000428,0.005312,,,,,
8,HP:0002107,HP:0002878,0.000597,Pneumothorax,Respiratory failure,0.000428,0.006298,0.007324,,,,,
613,HP:0001945,HP:0002107,0.000516,Fever,Pneumothorax,0.01215,0.000428,0.013094,,,,,
10,HP:0002107,HP:0002835,0.00044,Pneumothorax,Aspiration,0.000428,0.003259,0.004127,,,,,


In [56]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy_intra_textHpo_038_primary_only.csv', index=False)