# Phenotype Synergy Analysis

This notebook contains code to interprete results from the synergy score analysis. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
from obonet.ontology import Ontology
import pickle

In [2]:
hpo = hpoutil.HPO('/Users/zhangx/git/human-phenotype-ontology/hp.obo')
hpo2 = Ontology('/Users/zhangx/git/human-phenotype-ontology/hp.obo')

# Mutual information without considering diagnosis

In [3]:
with open('../../../data/mf_regardless_of_diseases/summary_textHpo_labHpo.obj', 'rb') as f:
    summary_textHpo_labHpo = pickle.load(f)
with open('../../../data/mf_regardless_of_diseases/summary_textHpo_textHpo.obj', 'rb') as f:
    summary_textHpo_textHpo = pickle.load(f)  
with open('../../../data/mf_regardless_of_diseases/summary_labHpo_labHpo.obj', 'rb') as f:
    summary_labHpo_labHpo = pickle.load(f)

In [4]:
mf_textHpo_labHpo = mf.MutualInfoXY(summary_textHpo_labHpo)
mf_textHpo_textHpo = mf.MutualInfoXY(summary_textHpo_textHpo)
mf_labHpo_labHpo = mf.MutualInfoXY(summary_labHpo_labHpo)

## TextHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [3]:
df_mf_textHpo_labHpo = mf_textHpo_labHpo.mf_labeled()
# add labels
df_mf_textHpo_labHpo['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_labHpo.P1])
df_mf_textHpo_labHpo['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_labHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identifical, so remove one
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[df_mf_textHpo_labHpo.P1 < df_mf_textHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms 
mask = np.array([hpo.has_dependency(df_mf_textHpo_labHpo.P1[i], df_mf_textHpo_labHpo.P2[i]) for i in np.arange(df_mf_textHpo_labHpo.shape[0])])
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
#df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_labHpo.csv')
df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).head(n=50)

NameError: name 'mf_textHpo_labHpo' is not defined

## TextHpo -- TextHpo
Their mutual information tells how much they correlate with each other.

In [6]:
df_mf_textHpo_textHpo = mf_textHpo_textHpo.mf_labeled()
# add labels
df_mf_textHpo_textHpo['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_textHpo.P1])
df_mf_textHpo_textHpo['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_textHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identifical, so remove one
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[df_mf_textHpo_textHpo.P1 < df_mf_textHpo_textHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms 
mask = np.array([hpo.has_dependency(df_mf_textHpo_textHpo.P1[i], df_mf_textHpo_textHpo.P2[i]) for i in np.arange(df_mf_textHpo_textHpo.shape[0])])
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_textHpo.csv')
df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
12173,HP:0001892,HP:0011028,0.706285,Abnormal bleeding,Abnormality of blood circulation
5481,HP:0011947,HP:0012649,0.625522,Respiratory tract infection,Increased inflammatory response
5480,HP:0011947,HP:0012647,0.625481,Respiratory tract infection,Abnormal inflammatory response
3229,HP:0010978,HP:0011947,0.565861,Abnormality of immune system physiology,Respiratory tract infection
11385,HP:0011024,HP:0025033,0.543392,Abnormality of the gastrointestinal tract,Abnormality of digestive system morphology
2329,HP:0002715,HP:0011947,0.481571,Abnormality of the immune system,Respiratory tract infection
1565,HP:0000969,HP:0002103,0.455301,Edema,Abnormality of the pleura
2876,HP:0002103,HP:0011032,0.451888,Abnormality of the pleura,Abnormality of fluid regulation
12690,HP:0011029,HP:0100659,0.450224,Internal hemorrhage,Abnormality of the cerebral vasculature
12524,HP:0011028,HP:0100659,0.447784,Abnormality of blood circulation,Abnormality of the cerebral vasculature


## LabHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [115]:
df_mf_labHpo_labHpo = mf_labHpo_labHpo.mf_labeled()
# add labels
df_mf_labHpo_labHpo['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_labHpo_labHpo.P1])
df_mf_labHpo_labHpo['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_labHpo_labHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identifical, so remove one
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[df_mf_labHpo_labHpo.P1 < df_mf_labHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms 
mask = np.array([hpo.has_dependency(df_mf_labHpo_labHpo.P1[i], df_mf_labHpo_labHpo.P2[i]) for i in np.arange(df_mf_labHpo_labHpo.shape[0])])
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_labHpo_labHpo.csv')
df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
0,HP:0002157,HP:0031970,0.744493,Azotemia,Abnormal blood urea nitrogen concentration
1,HP:0020061,HP:0031850,0.575781,Abnormal hemoglobin concentration,Abnormal hematocrit
2,HP:0020062,HP:0031851,0.546701,Decreased hemoglobin concentration,Reduced hematocrit
3,HP:0020058,HP:0020061,0.543036,Abnormal red blood cell count,Abnormal hemoglobin concentration
4,HP:0020058,HP:0031850,0.533428,Abnormal red blood cell count,Abnormal hematocrit
5,HP:0020060,HP:0020062,0.514094,Decreased red blood cell count,Decreased hemoglobin concentration
6,HP:0500164,HP:0500165,0.499272,Abnormal blood carbon dioxide level,Abnormal blood oxygen level
7,HP:0020060,HP:0031851,0.48535,Decreased red blood cell count,Reduced hematocrit
8,HP:0020061,HP:0031851,0.433088,Abnormal hemoglobin concentration,Reduced hematocrit
9,HP:0001882,HP:0004332,0.417853,Leukopenia,Abnormal lymphocyte morphology


## Mutual information between textHpo and labHpo in respect to diagnoses

## Only consider primary diagnosis

In [3]:
def mf_dataframes(mf_diagnosis_phenotypes, p_mf_Xz, p_mf_Yz, p_mf_XY_z, p_mf_XY_given_z, p_synergy, p_mf_XY_omit_z):
    X_labels, Y_labels = mf_diagnosis_phenotypes.vars_labels.values()
    M1 = len(X_labels)
    M2 = len(Y_labels)

    mf_Xz = mf_diagnosis_phenotypes.mutual_info_Xz()
    mf_Yz = mf_diagnosis_phenotypes.mutual_info_Yz()

    # mutual information between single phenotypes and diagnosis
    df_mf_Xz = pd.DataFrame(data={'X': X_labels, 'mf_Xz': mf_Xz})
    df_mf_Yz = pd.DataFrame(data={'Y': Y_labels, 'mf_Yz': mf_Yz})
    
    # joint and conditional mutual information, and synergy
    mf_XY_z = mf_diagnosis_phenotypes.mutual_info_XY_z()
    mf_XY_given_z = mf_diagnosis_phenotypes.mutual_info_XY_given_z()
    mf_synergy = mf_diagnosis_phenotypes.synergy_XY2z()
    
    # mutual information between phenotypes without considering diagnosis
    mf_XY_omit_z = mf_diagnosis_phenotypes.mutual_info_XY_omit_z()
    
    # mutual information between phenotype pairs and diagnosis
    df_mf_XY_z = pd.DataFrame()
    df_mf_XY_z['X'] = np.repeat(X_labels, M2)
    df_mf_XY_z['Y'] = np.tile(Y_labels, [M1])
    df_mf_XY_z['mf_XY_z'] = mf_XY_z.flat
    df_mf_XY_z['mf_XY_given_z'] = mf_XY_given_z.flat
    df_mf_XY_z['synergy'] = mf_synergy.flat
    
    # mutual information between phenotypes after omiting diagnosis
    df_mf_XY_z['mf_XY_omit_z'] = mf_XY_omit_z.flat
    
    # add p values
    if p_mf_Xz is not None:
        df_mf_Xz['p_mf_Xz'] = p_mf_Xz
    if p_mf_Yz is not None:
        df_mf_Yz['p_mf_Yz'] = p_mf_Yz
    if p_mf_XY_z is not None:
        df_mf_XY_z['p_mf_XY_z'] = p_mf_XY_z.flat
    if p_mf_XY_given_z is not None:
        df_mf_XY_z['p_mf_XY_given_z'] = p_mf_XY_given_z.flat
    if p_synergy is not None:
        df_mf_XY_z['p_synergy'] = p_synergy.flat
    if p_mf_XY_omit_z is not None:
        df_mf_XY_z['p_mf_XY_omit_z'] = p_mf_XY_omit_z.flat

    return df_mf_Xz, df_mf_Yz, df_mf_XY_z

def filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z):
    df_merged = df_mf_XY_z \
        .merge(df_mf_Xz, how='left', on=['X']) \
        .merge(df_mf_Yz, how='left', on=['Y'])

    ## filter out identifical pairs: a, b is the same as b, a
    df_filtered = df_merged.loc[df_merged.X < df_merged.Y, :].reset_index(drop=True)
    mask = np.array([hpo.has_dependency(df_filtered.X[i], df_filtered.Y[i]) for i in np.arange(len(df_filtered))])
    df_filtered = df_filtered.loc[np.logical_not(mask), ].reset_index(drop=True)
    return df_filtered

def entropy(case, control):
    total = case + control 
    h = -(case / total * np.log2(case/total) + control/total * np.log2(control/total))
    return h
    

def load_p_values(path):
    with open(path, 'rb') as f:
        p = pickle.load(f)
    return p

convert_to_percent = np.vectorize(lambda x: ' {:.2f}%'.format(x * 100))

In [4]:
with open('../../../data/mf_regarding_diseases/primary_only/summaries_diagnosis_textHpo_labHpo.obj', 'rb') as f:
    summaries_diagnosis_textHpo_labHpo = pickle.load(f)
with open('../../../data/mf_regarding_diseases/primary_only/summaries_diagnosis_textHpo_textHpo.obj', 'rb') as f:
    summaries_diagnosis_textHpo_textHpo = pickle.load(f)
with open('../../../data/mf_regarding_diseases/primary_only/summaries_diagnosis_labHpo_labHpo.obj', 'rb') as f:
    summaries_diagnosis_labHpo_labHpo = pickle.load(f)

### textHpo-labHpo pairs

In [24]:
disease = '584'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_labHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/primary_only/{}/p_value_textHpo_labHpo_{}_primary_only.obj'.format(disease, disease))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
s = summaries_diagnosis_textHpo_labHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = np.sum(s, axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data = s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# add an extra columns for the sum
s['sum'] = s_sum
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [25]:
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,+--,-++,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz
0,HP:0002086,HP:0032180,0.002105,0.108345,-0.000387,0.108733,0.0,0.0,0.0,0.0,...,4096,59,5135,2,5203,58976,0.000426,0.0,0.002066,0.0
1,HP:0002086,HP:0012337,0.002393,0.110703,-0.000228,0.110931,0.0,0.0,6.2e-05,0.0,...,6511,59,4163,2,6175,58976,0.000426,0.0,0.002194,0.0
2,HP:0002086,HP:0003111,0.002873,0.09911,-0.000362,0.099472,0.0,0.0,0.0,0.0,...,8008,57,3969,4,6369,58976,0.000426,0.0,0.002809,0.0
3,HP:0002086,HP:0004364,0.008218,0.055234,-0.000412,0.055647,0.0,0.0,0.0,0.0,...,20453,54,2256,7,8082,58976,0.000426,0.0,0.008204,0.0
4,HP:0002086,HP:0020061,0.002196,0.099485,-0.00023,0.099715,0.0,0.0,0.0,0.0,...,9022,54,3656,7,6682,58976,0.000426,0.0,0.002,0.0


In [7]:
p_values.keys()

dict_keys(['mf_Xz', 'mf_Yz', 'mf_XY_z', 'mf_XY_given_z', 'synergy', 'mf_XY_omit_z'])

In [7]:
# mutual information between textHpo and diagnosis
df_mf_textHpo_diagnosis = df_mf_Xz \
    .assign(X_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_Xz.X])) \
    .sort_values(by='mf_Xz', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P', 'mf_Xz': 'mf_P_diag', 'p_mf_Xz': 'p_mf_P_diag', 'X_label': 'P_label'})

df_mf_textHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/primary_only/mf_textHpo_diag_{}.csv'.format(disease))
df_mf_textHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0001919,0.009108,0.0,Acute kidney injury
1,HP:0000083,0.005441,0.0,Renal insufficiency
2,HP:0000126,0.002024,0.0,Hydronephrosis
3,HP:0002615,0.001067,0.0,Hypotension
4,HP:0000001,0.000879,0.0,All


In [8]:
# mutual information between labHpo and diagnosis
df_mf_labHpo_diagnosis = df_mf_Yz \
    .assign(Y_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_Yz.Y])) \
    .sort_values(by='mf_Yz', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'Y': 'P', 'mf_Yz': 'mf_P_diag', 'p_mf_Yz': 'p_mf_P_diag', 'Y_label': 'P_label'})

df_mf_labHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/primary_only/mf_labHpo_diag_{}.csv'.format(disease))
df_mf_labHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0003259,0.013763,0.0,Elevated serum creatinine
1,HP:0012100,0.011585,0.0,Abnormal circulating creatinine level
2,HP:0002157,0.009256,0.0,Azotemia
3,HP:0003138,0.008788,0.0,Increased blood urea nitrogen
4,HP:0004364,0.008204,0.0,Abnormal circulating nitrogen compound concent...


In [26]:
# save synergies
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])
df_mf_XY_z_filtered.rename(columns={'X':'P1', 'Y': 'P2'}) \
    .sort_values(by=['synergy'], ascending=False) \
    .reset_index(drop=True) \
    .loc[:, ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'synergy', 'p_mf_Xz', 'p_mf_Yz', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum']] \
    .rename(columns={'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_synergy_textHpo_labHpo_{}.csv'.format(disease))

In [27]:
# save ratios between conditional mutual info and overall mutual info
df_mf_vs_conditional_mf = df_mf_XY_z_filtered \
    .assign(mf_ratio=df_mf_XY_z_filtered.mf_XY_given_z/df_mf_XY_z_filtered.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2'})
df_mf_vs_conditional_mf \
    .rename(columns={'X': 'P1', 'Y': 'P2', 
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
                    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag', 
                    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_textHpo_labHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head(n=10)

Unnamed: 0,P1,P2,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0001399,HP:0032232,0.000139,3.7e-05,3.7e-05,1.63598e-08,0.000938,0.015312,6.2e-05,0.015312,...,579,53865,58976,9.040104e-05,0.001125,1.2e-05,0.90325,Hepatic failure,Increased circulating creatine kinase MB isoform,2265.113013
1,HP:0002097,HP:0025066,0.000409,4.3e-05,4.3e-05,1.983869e-08,0.0,0.03925,0.000437,0.039687,...,534,50204,58976,8.1548e-05,0.004437,0.000284,0.0,Emphysema,Decreased mean corpuscular volume,2173.488437
2,HP:0001677,HP:0040088,0.000585,2.2e-05,2.2e-05,2.189253e-08,0.0,0.039625,0.000563,0.039625,...,478,47612,58976,7.276097e-05,0.007062,0.00049,0.0,Coronary artery atherosclerosis,Abnormal lymphocyte count,1009.716383
3,HP:0001677,HP:0004332,0.000585,2.2e-05,2.2e-05,2.189253e-08,0.0,0.04075,0.000563,0.04075,...,478,47612,58976,7.276097e-05,0.007062,0.00049,0.0,Coronary artery atherosclerosis,Abnormal lymphocyte morphology,1009.716383
4,HP:0001677,HP:0001894,0.000181,2.7e-05,2.7e-05,7.075146e-08,0.000125,0.010563,0.000125,0.011125,...,585,50428,58976,7.276097e-05,0.007062,8.1e-05,0.004,Coronary artery atherosclerosis,Thrombocytosis,382.447648
5,HP:0002595,HP:0032232,5e-05,3.8e-05,3.8e-05,2.904079e-07,0.039188,0.014875,6.2e-05,0.195,...,579,52985,58976,2.127124e-07,0.112437,1.2e-05,0.90325,Ileus,Increased circulating creatine kinase MB isoform,130.346545
6,HP:0002097,HP:0031962,0.003671,1.9e-05,1.9e-05,1.691589e-07,0.0,0.083125,0.001125,0.083125,...,460,51303,58976,8.1548e-05,0.004437,0.00357,0.0,Emphysema,Elevated serum anion gap,114.691333
7,HP:0001744,HP:0500020,0.001114,3.5e-05,3.4e-05,1.575653e-06,0.0,0.057937,0.000563,0.214562,...,443,48950,58976,0.0001682294,0.0,0.000912,0.0,Splenomegaly,Abnormal cardiac biomarker test,22.517344
8,HP:0001744,HP:0500015,0.001114,3.5e-05,3.4e-05,1.575653e-06,0.0,0.061875,0.000687,0.219063,...,443,48950,58976,0.0001682294,0.0,0.000912,0.0,Splenomegaly,Abnormal cardiac test,22.517344
9,HP:0001297,HP:0002153,0.002877,1e-06,1e-06,6.348381e-08,0.0,0.123938,0.0085,0.123938,...,403,48483,58976,1.943955e-05,0.057937,0.002856,0.0,Stroke,Hyperkalemia,18.899975


In [11]:
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

df_4_cytoscape = df_mf_XY_z_filtered \
    .rename(columns={'X':'P1', 'Y': 'P2'}) \
    .assign(P1_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])) \
    .assign(P2_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])) \
    .sort_values(by='synergy', ascending=False) \
    .assign(P1 = lambda x: 'Rad_' + x['P1']) \
    .assign(P2 = lambda x: 'Lab_' + x['P2']) \
    .head(n = n)


# edges
df_4_cytoscape \
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/edges_textHpo_labHpo_{}.csv'.format(disease))

In [12]:
# nodes
nodes = pd.DataFrame(data={'term_id': np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2]), 
                           'term_label': np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label]),
                          'type': np.repeat(['Rad', 'Lab'], len(df_4_cytoscape))}).drop_duplicates()
nodes.drop_duplicates().reset_index(drop=True).to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/nodes_textHpo_labHpo_{}.csv'.format(disease))

### labHpo-labHpo pairs

In [28]:
disease = '584'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_labHpo_labHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/primary_only/{}/p_value_labHpo_labHpo_{}_primary_only.obj'.format(disease, disease))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
s = summaries_diagnosis_labHpo_labHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = np.sum(s, axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data = s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# add an extra columns for the sum
s['sum'] = s_sum
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [29]:
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])
#df_mf_XY_z_filtered.sort_values(by='mf_XY_z', ascending=False).head(n=20)
df_mf_joint_vs_individual = df_mf_XY_z_filtered \
    .sort_values(by='mf_XY_z', ascending=False) \
    .reset_index(drop=True)
    #.rename(columns={'mf_joint': 'mf_P1P2_diag'}) \
    #.loc[:, ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'P1_label', 'P2_label']]

df_mf_joint_vs_individual.loc[:, ['X', 'Y', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'p_mf_Xz', 'p_mf_Yz', 'P1_label', 'P2_label']] \
    .rename(columns={'X': 'P1', 'Y': 'P2', 'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_joint_vs_individual_labHpo_labHpo_{}.csv'.format(disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0003259,HP:0012419,0.015717,0.007693,0.000961,0.006732,0.0,0.0,0.0,0.0,...,10472,111,36068,58976,0.013763,0.0,0.000993,0.0,Elevated serum creatinine,Hyperoxemia
1,HP:0003259,HP:0500165,0.015573,0.017399,0.001335,0.016065,0.0,0.0,0.0,0.0,...,15057,105,31483,58976,0.013763,0.0,0.000475,0.0,Elevated serum creatinine,Abnormal blood oxygen level
2,HP:0003259,HP:0032369,0.014925,0.006692,0.00068,0.006012,0.0,0.0,0.0,0.0,...,4816,115,41724,58976,0.013763,0.0,0.000481,0.0,Elevated serum creatinine,Alkalemia
3,HP:0003259,HP:0012418,0.014851,0.028632,0.001037,0.027595,0.0,0.0,0.0,0.0,...,9155,108,37385,58976,0.013763,0.0,5e-05,0.016194,Elevated serum creatinine,Hypoxemia
4,HP:0003259,HP:0032066,0.01477,0.085752,-0.004779,0.090531,0.0,0.0,0.0,0.0,...,6412,75,40128,58976,0.013763,0.0,0.005786,0.0,Elevated serum creatinine,Decreased serum bicarbonate concentration


In [44]:
entropy(mf_diagnosis_phenotypes.case_N, mf_diagnosis_phenotypes.control_N) 

0.3172947974778557

In [15]:
df_mf_XY_z_filtered \
    .sort_values(by='synergy', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2','mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_synergy_labHpo_labHpo_{}.csv'.format(disease))
df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False).head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
1089,HP:0004360,HP:0500165,0.005816,0.260868,0.002358,0.258509,0.0,0.0,0.0,0.0,...,1890,111,25368,58976,0.002983,0.0,0.000475,0.0,Abnormality of acid-base homeostasis,Abnormal blood oxygen level
1110,HP:0004360,HP:0012419,0.00579,0.159317,0.001815,0.157502,0.0,0.0,0.0,0.0,...,1217,112,26041,58976,0.002983,0.0,0.000993,0.0,Abnormality of acid-base homeostasis,Hyperoxemia
1024,HP:0012100,HP:0500165,0.013639,0.0346,0.001579,0.033021,0.0,0.0,0.0,0.0,...,11656,64,28537,58976,0.011585,0.0,0.000475,0.0,Abnormal circulating creatinine level,Abnormal blood oxygen level
924,HP:0003138,HP:0500165,0.010742,0.047989,0.00148,0.04651,0.0,0.0,0.0,0.0,...,8387,45,24735,58976,0.008788,0.0,0.000475,0.0,Increased blood urea nitrogen,Abnormal blood oxygen level
608,HP:0002157,HP:0500165,0.011201,0.04621,0.00147,0.04474,0.0,0.0,0.0,0.0,...,8079,32,24032,58976,0.009256,0.0,0.000475,0.0,Azotemia,Abnormal blood oxygen level
1082,HP:0004360,HP:0012415,0.004375,0.281864,0.001385,0.280478,0.0,0.0,0.0,0.0,...,3172,107,24086,58976,0.002983,0.0,7e-06,0.844572,Abnormality of acid-base homeostasis,Abnormal blood gas level
3181,HP:0002795,HP:0004360,0.004375,0.281864,0.001385,0.280478,0.0,0.0,0.0,0.0,...,8937,107,24086,58976,7e-06,0.855643,0.002983,0.0,Functional respiratory abnormality,Abnormality of acid-base homeostasis
3282,HP:0002086,HP:0004360,0.004375,0.281864,0.001385,0.280478,0.0,0.0,0.0,0.0,...,8937,107,24086,58976,7e-06,0.831984,0.002983,0.0,Abnormality of the respiratory system,Abnormality of acid-base homeostasis
873,HP:0031970,HP:0500165,0.009906,0.047756,0.001364,0.046392,0.0,0.0,0.0,0.0,...,7791,43,23694,58976,0.008067,0.0,0.000475,0.0,Abnormal blood urea nitrogen concentration,Abnormal blood oxygen level
2761,HP:0032065,HP:0500165,0.006759,0.056839,0.001363,0.055476,0.0,0.0,0.0,0.0,...,10828,174,29314,58976,0.004921,0.0,0.000475,0.0,Abnormal serum bicarbonate concentration,Abnormal blood oxygen level


In [16]:
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0001939,HP:0020061,0.002186,0.204908,-0.000996,0.205904,0.0,0.0,0.0,0.0,...,134,6,6059,58976,0.001183,0.0,0.002,0.0,Abnormality of metabolism/homeostasis,Abnormal hemoglobin concentration
1,HP:0001939,HP:0020058,0.002044,0.205877,-0.000957,0.206834,0.0,0.0,0.0,0.0,...,138,6,6055,58976,0.001183,0.0,0.001819,0.0,Abnormality of metabolism/homeostasis,Abnormal red blood cell count
2,HP:0001939,HP:0031850,0.00207,0.202409,-0.000969,0.203378,0.0,0.0,0.0,0.0,...,192,6,6001,58976,0.001183,0.0,0.001856,0.0,Abnormality of metabolism/homeostasis,Abnormal hematocrit
3,HP:0001939,HP:0031851,0.002175,0.153438,-0.00079,0.154228,0.0,0.0,0.0,0.0,...,85,6,6108,58976,0.001183,0.0,0.001783,0.0,Abnormality of metabolism/homeostasis,Reduced hematocrit
4,HP:0001939,HP:0002715,0.001311,0.078193,-0.000224,0.078417,0.0,0.0,0.0,0.0,...,693,7,5500,58976,0.001183,0.0,0.000353,0.0,Abnormality of metabolism/homeostasis,Abnormality of the immune system


In [30]:
df_mf_vs_conditional_mf = df_mf_XY_z_filtered 
df_mf_vs_conditional_mf = df_mf_vs_conditional_mf \
    .assign(mf_ratio=df_mf_vs_conditional_mf.mf_XY_given_z/df_mf_vs_conditional_mf.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) 
df_mf_vs_conditional_mf \
    .rename(columns={'X': 'P1', 'Y': 'P2', 
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
                    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag', 
                    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_labHpo_labHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0001873,HP:0025066,0.000518,6.9e-05,6.8e-05,3.474868e-07,0.0,0.007465,0.003036,0.039916,...,366,39608,58976,0.000166,0.000316,0.000284,0.0,Thrombocytopenia,Decreased mean corpuscular volume,197.692069
1,HP:0002904,HP:0500015,0.000954,3e-05,3e-05,4.838742e-07,0.0,0.021318,0.00272,0.149924,...,411,43917,58976,1.2e-05,0.963626,0.000912,0.0,Hyperbilirubinemia,Abnormal cardiac test,61.989954
2,HP:0002904,HP:0500020,0.000954,3e-05,3e-05,4.838742e-07,0.0,0.020433,0.002341,0.149987,...,411,43917,58976,1.2e-05,0.963626,0.000912,0.0,Hyperbilirubinemia,Abnormal cardiac biomarker test,61.989954
3,HP:0001626,HP:0002904,0.000954,3e-05,3e-05,4.838742e-07,0.0,0.021698,0.001455,0.146698,...,411,43917,58976,0.000912,0.0,1.2e-05,0.948887,Abnormality of the cardiovascular system,Hyperbilirubinemia,61.989954
4,HP:0002904,HP:0410174,0.001023,3.5e-05,3.5e-05,6.070791e-07,0.0,0.019421,0.002847,0.163209,...,416,44468,58976,1.2e-05,0.963626,0.000975,0.0,Hyperbilirubinemia,Increased troponin T level in blood,58.40817


In [18]:
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

df_4_cytoscape = df_mf_XY_z_filtered \
    .rename(columns={'X':'P1', 'Y': 'P2'}) \
    .assign(P1_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])) \
    .assign(P2_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])) \
    .sort_values(by='synergy', ascending=False) \
    .assign(P1 = lambda x: 'Lab_' + x['P1']) \
    .assign(P2 = lambda x: 'Lab_' + x['P2']) \
    .head(n = n)

# edges
df_4_cytoscape \
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/edges_labHpo_labHpo_{}.csv'.format(disease))

In [19]:
# nodes
nodes = pd.DataFrame(data={'term_id': np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2]), 
                           'term_label': np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label])}).drop_duplicates()
nodes['type'] = np.repeat('Lab', len(nodes))
nodes.to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/nodes_labHpo_labHpo_{}.csv'.format(disease))

### textHpo-textHpo pairs

In [31]:
disease = '584'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_textHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/primary_only/{}/p_value_textHpo_textHpo_{}_primary_only.obj'.format(disease, disease))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
s = summaries_diagnosis_textHpo_textHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = np.sum(s, axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data = s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# add an extra columns for the sum
s['sum'] = s_sum
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [32]:
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])
df_mf_XY_z_filtered.sort_values(by='mf_XY_z', ascending=False).head(n=20)
df_mf_joint_vs_individual = df_mf_XY_z_filtered \
    .sort_values(by='mf_XY_z', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2', 
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag',
                    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'mf_P1P2_diag','p_mf_P1_diag', 'p_mf_P2_diag', 'p_mf_P1P2_diag', 'P1_label', 'P2_label']]

df_mf_joint_vs_individual.to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_joint_vs_individual_textHpo_textHpo_{}.csv'.format(disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,P1,P2,mf_P1_diag,mf_P2_diag,mf_P1P2_diag,p_mf_P1_diag,p_mf_P2_diag,p_mf_P1P2_diag,P1_label,P2_label
0,HP:0001919,HP:0002107,0.009108,0.000814486,0.010247,0.0,0.0,0.0,Acute kidney injury,Pneumothorax
1,HP:0001919,HP:0100750,0.009108,6.134718e-05,0.009818,0.0,0.059207,0.0,Acute kidney injury,Atelectasis
2,HP:0001919,HP:0002202,0.009108,2.90376e-07,0.009705,0.0,0.241532,0.0,Acute kidney injury,Pleural effusion
3,HP:0001919,HP:0002615,0.009108,0.001067433,0.00967,0.0,0.0,0.0,Acute kidney injury,Hypotension
4,HP:0001919,HP:0001945,0.009108,3.598422e-05,0.00966,0.0,0.037903,0.0,Acute kidney injury,Fever


In [7]:
df_mf_XY_z_filtered \
    .sort_values(by='synergy', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2', 
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag',
                    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_synergy_textHpo_textHpo_{}.csv'.format(disease))
df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False).head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
14652,HP:0001919,HP:0100750,0.009818,0.008211,0.000648,0.007562,0.0,0.0,0.0,0.0,...,25939,297,31067,58976,0.009108,0.0,6.134718e-05,0.059207,Acute kidney injury,Atelectasis
14653,HP:0001919,HP:0002202,0.009705,0.011263,0.000596,0.010666,0.0,0.0,0.0,0.0,...,21833,309,35173,58976,0.009108,0.0,2.90376e-07,0.241532,Acute kidney injury,Pleural effusion
7588,HP:0000083,HP:0100750,0.006098,0.024308,0.000595,0.023713,0.0,0.0,0.0,0.0,...,22061,255,29291,58976,0.005441,0.0,6.134718e-05,0.059207,Renal insufficiency,Atelectasis
14689,HP:0001919,HP:0001945,0.00966,0.006197,0.000515,0.005682,0.0,0.0,0.0,0.0,...,8365,396,48641,58976,0.009108,0.0,3.598422e-05,0.037903,Acute kidney injury,Fever
7649,HP:0000083,HP:0002107,0.00674,0.005647,0.000484,0.005163,0.0,0.0,0.0,0.0,...,8177,337,43175,58976,0.005441,0.0,0.000814486,0.0,Renal insufficiency,Pneumothorax
7620,HP:0000083,HP:0011029,0.005878,0.024493,0.000424,0.024068,0.0,0.0,0.0,0.0,...,9384,280,41968,58976,0.005441,0.0,1.26815e-05,0.968011,Renal insufficiency,Internal hemorrhage
7648,HP:0000083,HP:0100545,0.005822,0.019674,0.000374,0.019299,0.0,0.0,0.0,0.0,...,4575,313,46777,58976,0.005441,0.0,5.797212e-06,0.55793,Renal insufficiency,Arterial stenosis
7618,HP:0000083,HP:0011028,0.005839,0.024537,0.000373,0.024165,0.0,0.0,0.0,0.0,...,9436,278,41916,58976,0.005441,0.0,2.461416e-05,0.639718,Renal insufficiency,Abnormality of blood circulation
7617,HP:0000083,HP:0001892,0.00584,0.024757,0.000372,0.024385,0.0,0.0,0.0,0.0,...,9532,277,41820,58976,0.005441,0.0,2.658345e-05,0.454906,Renal insufficiency,Abnormal bleeding
7589,HP:0000083,HP:0002202,0.00581,0.027863,0.000369,0.027494,0.0,0.0,0.0,0.0,...,18313,263,33039,58976,0.005441,0.0,2.90376e-07,0.241532,Renal insufficiency,Pleural effusion


In [20]:
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0001873,HP:0025547,0.007482,0.000192,0.000191,8.506363e-07,0.0,0.000389,6.5e-05,0.375195,...,1560,36897,58976,0.003835,0.0,0.003455,0.0,Thrombocytopenia,Decreased mean corpuscular hemoglobin concentr...,225.832391
1,HP:0001626,HP:0002904,0.007867,2e-05,2e-05,4.838742e-07,0.0,0.755187,0.492414,0.315158,...,1934,42394,58976,0.005199,0.0,0.002648,0.0,Abnormality of the cardiovascular system,Hyperbilirubinemia,41.720508
2,HP:0002904,HP:0500015,0.007867,2e-05,2e-05,4.838742e-07,0.0,0.753112,0.504214,0.31438,...,1934,42394,58976,0.002648,0.0,0.005199,0.0,Hyperbilirubinemia,Abnormal cardiac test,41.720508
3,HP:0002904,HP:0500020,0.007867,2e-05,2e-05,4.838742e-07,0.0,0.764328,0.505511,0.315677,...,1934,42394,58976,0.002648,0.0,0.005199,0.0,Hyperbilirubinemia,Abnormal cardiac biomarker test,41.720508
4,HP:0003573,HP:0020058,0.012599,0.000979,0.000954,2.522429e-05,0.0,0.0,0.0,0.15048,...,281,13646,58976,0.002829,0.0,0.008816,0.0,Increased total bilirubin,Abnormal red blood cell count,38.818698


In [33]:
df_mf_vs_conditional_mf = df_mf_XY_z_filtered 
df_mf_vs_conditional_mf = df_mf_vs_conditional_mf \
    .assign(mf_ratio=df_mf_vs_conditional_mf.mf_XY_given_z/df_mf_vs_conditional_mf.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) 
df_mf_vs_conditional_mf \
    .rename(columns={'X': 'P1', 'Y': 'P2', 
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
                    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag', 
                    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_textHpo_textHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0001541,HP:0001677,0.000592,1.1e-05,1.1e-05,3.757829e-08,0.0,0.024731,0.000202,0.024731,...,540,52138,58976,0.000508,0.0,7.276097e-05,0.007056,Ascites,Coronary artery atherosclerosis,289.755148
1,HP:0001677,HP:0001744,0.000257,1.6e-05,1.6e-05,7.143215e-08,0.0,0.007124,6.7e-05,0.023656,...,603,54901,58976,7.3e-05,0.007392,0.0001682294,0.000134,Coronary artery atherosclerosis,Splenomegaly,222.926598
2,HP:0001297,HP:0001394,0.000527,8.7e-05,8.5e-05,1.436167e-06,0.0,0.001277,0.00047,0.997849,...,553,52463,58976,1.9e-05,0.057728,0.0004222426,0.0,Stroke,Cirrhosis,60.401237
3,HP:0001677,HP:0002013,0.000457,6.4e-05,6.2e-05,2.231312e-06,0.0,0.013172,0.000269,0.993347,...,582,53896,58976,7.3e-05,0.007392,0.0003220032,0.0,Coronary artery atherosclerosis,Vomiting,28.746136
4,HP:0001399,HP:0100749,0.000119,2.4e-05,2.3e-05,1.284525e-06,0.00289,0.010148,0.000134,1.0,...,598,54673,58976,9e-05,0.00168,6.123382e-06,0.274731,Hepatic failure,Chest pain,18.601272
5,HP:0001677,HP:0001945,0.000158,5.9e-05,4.9e-05,1.020238e-05,0.000538,0.021237,0.000269,0.021976,...,552,47426,58976,7.3e-05,0.007392,3.598422e-05,0.037903,Coronary artery atherosclerosis,Fever,5.780615
6,HP:0001297,HP:0100749,6.6e-05,4.9e-05,4e-05,8.854251e-06,0.022782,0.016196,6.7e-05,0.016196,...,583,51877,58976,1.9e-05,0.057728,6.123382e-06,0.274731,Stroke,Chest pain,5.535079
7,HP:0012444,HP:0100749,2.3e-05,2e-05,1.6e-05,4.54418e-06,0.779704,0.008333,6.7e-05,0.008333,...,601,54035,58976,1e-06,0.140524,6.123382e-06,0.274731,Brain atrophy,Chest pain,4.421684
8,HP:0100749,HP:0100806,2.9e-05,3e-05,2.3e-05,7.072709e-06,0.137634,0.012634,0.000202,0.012634,...,592,53068,58976,6e-06,0.278898,7.249043e-09,0.045027,Chest pain,Sepsis,4.262483
9,HP:0001297,HP:0001744,0.000232,6.1e-05,4.4e-05,1.715005e-05,0.0,0.015054,6.7e-05,0.015121,...,583,53455,58976,1.9e-05,0.057728,0.0001682294,0.000134,Stroke,Splenomegaly,3.574789


In [9]:
# save data for cytoscape
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

df_4_cytoscape = df_mf_XY_z_filtered \
    .rename(columns={'X':'P1', 'Y': 'P2'}) \
    .assign(P1_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])) \
    .assign(P2_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])) \
    .sort_values(by='synergy', ascending=False) \
    .assign(P1 = lambda x: 'Rad_' + x['P1']) \
    .assign(P2 = lambda x: 'Rad_' + x['P2']) \
    .head(n = n)

# edges
df_4_cytoscape \
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/edges_textHpo_textHpo_{}.csv'.format(disease))

In [10]:
# nodes
nodes = pd.DataFrame(data={'term_id': np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2]), 
                           'term_label': np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label])}).drop_duplicates()
nodes['type'] = np.repeat('Rad', len(nodes))
nodes.to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/nodes_textHpo_textHpo_{}.csv'.format(disease))

### Just look at primary diagnosis


In [4]:
mf_all = pd.read_csv('mutual_info_textHpo_labHpo.csv')
mf_all.head()

Unnamed: 0,P1,P2,entropy_P1,entropy_P2,mf_P1_P2
0,HP:0000001,HP:0000118,0.367357,0.142722,0.011065
1,HP:0000001,HP:0000001,0.367357,0.142722,0.011065
2,HP:0000001,HP:0001939,0.367357,0.180924,0.014722
3,HP:0000001,HP:0001871,0.367357,0.20622,0.019528
4,HP:0000001,HP:0001877,0.367357,0.239249,0.017066


In [5]:
with open('synergies_radiology_lab_primary_only.obj', 'rb') as synergies_file:
    synergies_rad_lab_primary_only = pickle.load(synergies_file)

In [6]:
#p_values = load_p_values('p_value_428_primary_only.obj')
synergy = synergies_rad_lab_primary_only['428']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']


In [9]:
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']
include_overall_mf.head()


Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,RAD_RAD_HP:0002202,Lab_HP:0004363,0.002496,Pleural effusion,Abnormal circulating calcium concentration,0.002937,4.7e-05,0.005479,0.966136,0.869158,0.102352,0.9632,0.869111
1,RAD_RAD_HP:0001640,Lab_HP:0004363,0.001359,Cardiomegaly,Abnormal circulating calcium concentration,0.007596,4.7e-05,0.009002,0.819058,0.869158,0.036363,0.811462,0.869111
2,RAD_RAD_HP:0001635,Lab_HP:0004363,0.001114,Congestive heart failure,Abnormal circulating calcium concentration,0.011296,4.7e-05,0.012457,0.420952,0.869158,0.013604,0.409657,0.869111
3,RAD_RAD_HP:0000969,Lab_HP:0004363,0.000966,Edema,Abnormal circulating calcium concentration,0.001782,4.7e-05,0.002795,0.930309,0.869158,0.07126,0.928527,0.869111
4,RAD_RAD_HP:0002086,Lab_HP:0004363,0.000759,Abnormality of the respiratory system,Abnormal circulating calcium concentration,0.00193,4.7e-05,0.002736,0.671973,0.869158,0.080085,0.670043,0.869111


In [11]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(20)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-428_corrected.csv', index=False)

In [12]:
#p_values = load_p_values('p_value_584_primary_only.obj')
synergy = synergies_rad_lab_primary_only['584']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']

In [13]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(20)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-584_corrected.csv', index=False)

In [6]:
#p_values = load_p_values('p_value_038_primary_only.obj')
p_values = None
synergy = synergies_rad_lab_primary_only['038']
entropy_diag = entropy(synergy.case_N, synergy.control_N)
filtered_data = filtered_synergy_dataframe(synergy, p_values=p_values, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']
include_overall_mf['synergy_norm'] = include_overall_mf['synergy'] / entropy_diag

In [7]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d,synergy_norm
0,RAD_HP:0002107,Lab_HP:0004363,0.001801,Pneumothorax,Abnormal circulating calcium concentration,0.000428,0.017524,0.019753,0.659875,0.869158,0.037493,0.659447,0.851634,0.005676
2,RAD_HP:0002107,Lab_HP:0004360,0.001728,Pneumothorax,Abnormality of acid-base homeostasis,0.000428,0.017165,0.019322,0.659875,0.836642,0.030492,0.659447,0.819477,0.005448
4,RAD_HP:0002107,Lab_HP:0010927,0.001728,Pneumothorax,Abnormal blood inorganic cation concentration,0.000428,0.018203,0.020359,0.659875,0.836345,0.035441,0.659447,0.818143,0.005446
6,RAD_HP:0002107,Lab_HP:0002901,0.001702,Pneumothorax,Hypocalcemia,0.000428,0.017857,0.019987,0.659875,0.898148,0.034895,0.659447,0.880291,0.005363
8,RAD_HP:0002107,Lab_HP:0002151,0.00162,Pneumothorax,Increased serum lactate,0.000428,0.016012,0.01806,0.659875,0.923926,0.021633,0.659447,0.907914,0.005105
10,RAD_HP:0002107,Lab_HP:0032368,0.001547,Pneumothorax,Acidemia,0.000428,0.007687,0.009662,0.659875,0.933695,0.041994,0.659447,0.926008,0.004876
12,RAD_HP:0002107,Lab_HP:0002795,0.001523,Pneumothorax,Functional respiratory abnormality,0.000428,0.006194,0.008145,0.659875,0.997066,0.037979,0.659447,0.990872,0.004799
14,RAD_HP:0002107,Lab_HP:0012415,0.001523,Pneumothorax,Abnormal blood gas level,0.000428,0.006194,0.008145,0.659875,0.997066,0.037979,0.659447,0.990872,0.004799
16,RAD_HP:0002107,Lab_HP:0010929,0.001512,Pneumothorax,Abnormal blood cation concentration,0.000428,0.01762,0.019561,0.659875,0.695458,0.028991,0.659447,0.677838,0.004765
18,RAD_HP:0002107,Lab_HP:0020062,0.001498,Pneumothorax,Decreased hemoglobin concentration,0.000428,0.007413,0.00934,0.659875,0.938389,0.046457,0.659447,0.930976,0.004722


In [37]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-038_corrected.csv', index=False)
edges_cyto = include_overall_mf.loc[:, ['P1', 'P2', 'synergy', 'p', 'mf_d_P1_P2', 'mf_P1_P2']]
edges_cyto.to_csv('synergy_radiology_labtest_primary_only-038-edges.csv')
nodes_cyto = include_overall_mf.loc[:, ['P1', 'P2', 'P1_radiology_label', 'P2_lab_label', 'mf_d_P1', 'mf_d_P2', 'entropy_P1', 'entropy_P2', 'entropy_P1_given_d', 'entropy_P2_given_d']]
nodes_cyto_unique = pd.DataFrame(data={'P': np.concatenate((nodes_cyto.P1, nodes_cyto.P2)), \
                                       'source': np.repeat(['RAD', 'LAB'], len(nodes_cyto)), \
                  'P_label': np.concatenate((nodes_cyto.P1_radiology_label, nodes_cyto.P2_lab_label)), \
                  'mf_d_P': np.concatenate((nodes_cyto.mf_d_P1, nodes_cyto.mf_d_P2)), \
                  'entropy': np.concatenate((nodes_cyto.entropy_P1, nodes_cyto.entropy_P2)), \
                  'conditional_entropy': np.concatenate((nodes_cyto.entropy_P1_given_d, nodes_cyto.entropy_P2_given_d))}).drop_duplicates()
nodes_cyto_unique.to_csv('synergy_radiology_labtest_primary_only-038-nodes.csv')

In [39]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,p,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d,synergy_norm
0,RAD_HP:0002107,Lab_HP:0002151,0.002119,0.0,Pneumothorax,Increased serum lactate,0.000428,0.021219,0.023766,0.659875,0.923926,0.021633,0.659447,0.902707,0.006677
2,RAD_HP:0002107,Lab_HP:0032368,0.001683,0.0,Pneumothorax,Acidemia,0.000428,0.007872,0.009984,0.659875,0.933695,0.041994,0.659447,0.925823,0.005305
4,RAD_HP:0002107,Lab_HP:0002901,0.001509,0.0,Pneumothorax,Hypocalcemia,0.000428,0.01605,0.017988,0.659875,0.898148,0.034895,0.659447,0.882098,0.004757
6,RAD_HP:0002107,Lab_HP:0012418,0.001443,0.0,Pneumothorax,Hypoxemia,0.000428,0.010167,0.012038,0.659875,0.987873,0.035488,0.659447,0.977707,0.004548
8,RAD_HP:0002107,Lab_HP:0020062,0.00139,0.0,Pneumothorax,Decreased hemoglobin concentration,0.000428,0.010041,0.011859,0.659875,0.938389,0.046457,0.659447,0.928348,0.00438
10,RAD_HP:0002107,Lab_HP:0003256,0.001389,0.0,Pneumothorax,Abnormality of the coagulation cascade,0.000428,0.017719,0.019536,0.659875,0.970688,0.026865,0.659447,0.952969,0.004378
12,RAD_HP:0002107,Lab_HP:0004363,0.001377,0.0,Pneumothorax,Abnormal circulating calcium concentration,0.000428,0.01625,0.018055,0.659875,0.869158,0.037493,0.659447,0.852908,0.004339
14,RAD_HP:0002107,Lab_HP:0012200,0.001372,0.0,Pneumothorax,Abnormality of prothrombin,0.000428,0.016642,0.018443,0.659875,0.980079,0.023958,0.659447,0.963437,0.004325
16,RAD_HP:0002107,Lab_HP:0032199,0.001372,0.0,Pneumothorax,Abnormal prothrombin time,0.000428,0.016642,0.018443,0.659875,0.980079,0.023958,0.659447,0.963437,0.004325
18,RAD_HP:0002107,Lab_HP:0008151,0.001362,0.0,Pneumothorax,Prolonged prothrombin time,0.000428,0.015856,0.017647,0.659875,0.992744,0.024537,0.659447,0.976888,0.004293


In [41]:
idx_text = np.where(synergy.vars_labels['set1'] == 'HP:0002107')[0][0]
idx_lab = np.where(synergy.vars_labels['set2'] == 'HP:0002151')[0][0]
synergy.m2[idx_text, idx_lab, :] / np.sum(synergy.m2[idx_text, idx_lab, :])

array([0.00571419, 0.08383071, 0.00203473, 0.07937127, 0.03331864,
       0.21620659, 0.0163965 , 0.56312737])

In [34]:
entropy(10, 10)

1.0

## Synergy among Lab-derived Abnormal Phenotypes

In [50]:
with open('synergies-intra-labHpo-primary_only.obj', 'rb') as synergies_file:
    synergies_intra_labHpo = pickle.load(synergies_file)

In [51]:
#p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_intra_labHpo['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
filtered_data = filtered_data[filtered_data.P1 < filtered_data.P2]
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,HP:0001911,HP:0020064,0.005565,Abnormal granulocyte morphology,Abnormal eosinophil count,0.022505,0.000197,0.028266,0.176359,0.503569,0.000813,0.153855,0.503372
168,HP:0001880,HP:0001911,0.005565,Eosinophilia,Abnormal granulocyte morphology,0.000197,0.022505,0.028266,,,,,
110,HP:0001880,HP:0032309,0.005565,Eosinophilia,Abnormal granulocyte count,0.000197,0.022505,0.028266,,,,,
109,HP:0001879,HP:0032309,0.005565,Abnormal eosinophil morphology,Abnormal granulocyte count,0.000197,0.022505,0.028266,,,,,
169,HP:0002148,HP:0002905,0.004092,Hypophosphatemia,Hyperphosphatemia,0.009924,0.005304,0.01932,,,,,
117,HP:0001880,HP:0010974,0.003704,Eosinophilia,Abnormal myeloid leukocyte morphology,0.000197,0.026392,0.030292,,,,,
116,HP:0001879,HP:0010974,0.003704,Abnormal eosinophil morphology,Abnormal myeloid leukocyte morphology,0.000197,0.026392,0.030292,,,,,
1,HP:0010974,HP:0020064,0.003704,Abnormal myeloid leukocyte morphology,Abnormal eosinophil count,0.026392,0.000197,0.030292,0.183103,0.503569,0.000842,0.156711,0.503372
71,HP:0002904,HP:0011014,0.003596,Hyperbilirubinemia,Abnormal glucose homeostasis,0.00146,0.011067,0.016122,0.085283,0.656426,0.001573,0.083823,0.645359
76,HP:0002904,HP:0011015,0.003596,Hyperbilirubinemia,Abnormal blood glucose concentration,0.00146,0.011067,0.016122,0.085283,0.656426,0.001573,0.083823,0.645359


In [53]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy_intra_labHpo_038_primary_only.csv', index=False)

In [54]:
with open('synergies-intra-textHpo-primary_only.obj', 'rb') as synergies_file:
    synergies_intra_textHpo = pickle.load(synergies_file)

In [55]:
#p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_intra_textHpo['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
filtered_data = filtered_data[filtered_data.P1 < filtered_data.P2]
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,HP:0002107,HP:0002202,0.002421,Pneumothorax,Pleural effusion,0.000428,0.007408,0.010257,,,,,
2,HP:0002107,HP:0100750,0.001727,Pneumothorax,Atelectasis,0.000428,0.003481,0.005637,,,,,
4,HP:0002107,HP:0100806,0.001193,Pneumothorax,Sepsis,0.000428,0.048477,0.050099,,,,,
610,HP:0002090,HP:0002107,0.00116,Pneumonia,Pneumothorax,0.011828,0.000428,0.013417,,,,,
6,HP:0002107,HP:0100598,0.001033,Pneumothorax,Pulmonary edema,0.000428,0.006642,0.008104,,,,,
611,HP:0001640,HP:0002107,0.000739,Cardiomegaly,Pneumothorax,0.003249,0.000428,0.004416,,,,,
612,HP:0000969,HP:0002107,0.000611,Edema,Pneumothorax,0.004273,0.000428,0.005312,,,,,
8,HP:0002107,HP:0002878,0.000597,Pneumothorax,Respiratory failure,0.000428,0.006298,0.007324,,,,,
613,HP:0001945,HP:0002107,0.000516,Fever,Pneumothorax,0.01215,0.000428,0.013094,,,,,
10,HP:0002107,HP:0002835,0.00044,Pneumothorax,Aspiration,0.000428,0.003259,0.004127,,,,,


In [56]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy_intra_textHpo_038_primary_only.csv', index=False)