# Phenotype Synergy Analysis

This notebook contains code to interprete results from the synergy score analysis. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
import pickle

In [2]:
hpo = hpoutil.HPO('/Users/zhangx/git/human-phenotype-ontology/hp.obo')

# Mutual information without considering diagnosis

In [3]:
with open('../../../data/mf_regardless_of_diseases/summary_textHpo_labHpo.obj', 'rb') as f:
    summary_textHpo_labHpo = pickle.load(f)
with open('../../../data/mf_regardless_of_diseases/summary_textHpo_textHpo.obj', 'rb') as f:
    summary_textHpo_textHpo = pickle.load(f)  
with open('../../../data/mf_regardless_of_diseases/summary_labHpo_labHpo.obj', 'rb') as f:
    summary_labHpo_labHpo = pickle.load(f)

In [4]:
mf_textHpo_labHpo = mf.MutualInfoXY(summary_textHpo_labHpo)
mf_textHpo_textHpo = mf.MutualInfoXY(summary_textHpo_textHpo)
mf_labHpo_labHpo = mf.MutualInfoXY(summary_labHpo_labHpo)

## TextHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [128]:
df_mf_textHpo_labHpo = mf_textHpo_labHpo.mf_labeled()
# add labels
df_mf_textHpo_labHpo['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_labHpo.P1])
df_mf_textHpo_labHpo['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_labHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identifical, so remove one
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[df_mf_textHpo_labHpo.P1 < df_mf_textHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms 
mask = np.array([hpo.has_dependency(df_mf_textHpo_labHpo.P1[i], df_mf_textHpo_labHpo.P2[i]) for i in np.arange(df_mf_textHpo_labHpo.shape[0])])
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
#df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_labHpo.csv')
df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
1968,HP:0002202,HP:0020062,0.12566,Pleural effusion,Decreased hemoglobin concentration
1958,HP:0002202,HP:0020061,0.121226,Pleural effusion,Abnormal hemoglobin concentration
1959,HP:0002202,HP:0011015,0.12101,Pleural effusion,Abnormal blood glucose concentration
1960,HP:0002202,HP:0011014,0.12101,Pleural effusion,Abnormal glucose homeostasis
1962,HP:0002202,HP:0031851,0.117206,Pleural effusion,Reduced hematocrit
1957,HP:0002202,HP:0020058,0.115497,Pleural effusion,Abnormal red blood cell count
1961,HP:0002202,HP:0010929,0.115105,Pleural effusion,Abnormal blood cation concentration
1972,HP:0002202,HP:0004363,0.113134,Pleural effusion,Abnormal circulating calcium concentration
1967,HP:0002202,HP:0010927,0.112096,Pleural effusion,Abnormal blood inorganic cation concentration
1956,HP:0002202,HP:0031850,0.111859,Pleural effusion,Abnormal hematocrit


## TextHpo -- TextHpo
Their mutual information tells how much they correlate with each other.

In [6]:
df_mf_textHpo_textHpo = mf_textHpo_textHpo.mf_labeled()
# add labels
df_mf_textHpo_textHpo['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_textHpo.P1])
df_mf_textHpo_textHpo['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_textHpo_textHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identifical, so remove one
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[df_mf_textHpo_textHpo.P1 < df_mf_textHpo_textHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms 
mask = np.array([hpo.has_dependency(df_mf_textHpo_textHpo.P1[i], df_mf_textHpo_textHpo.P2[i]) for i in np.arange(df_mf_textHpo_textHpo.shape[0])])
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_textHpo.csv')
df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
12173,HP:0001892,HP:0011028,0.706285,Abnormal bleeding,Abnormality of blood circulation
5481,HP:0011947,HP:0012649,0.625522,Respiratory tract infection,Increased inflammatory response
5480,HP:0011947,HP:0012647,0.625481,Respiratory tract infection,Abnormal inflammatory response
3229,HP:0010978,HP:0011947,0.565861,Abnormality of immune system physiology,Respiratory tract infection
11385,HP:0011024,HP:0025033,0.543392,Abnormality of the gastrointestinal tract,Abnormality of digestive system morphology
2329,HP:0002715,HP:0011947,0.481571,Abnormality of the immune system,Respiratory tract infection
1565,HP:0000969,HP:0002103,0.455301,Edema,Abnormality of the pleura
2876,HP:0002103,HP:0011032,0.451888,Abnormality of the pleura,Abnormality of fluid regulation
12690,HP:0011029,HP:0100659,0.450224,Internal hemorrhage,Abnormality of the cerebral vasculature
12524,HP:0011028,HP:0100659,0.447784,Abnormality of blood circulation,Abnormality of the cerebral vasculature


## LabHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [115]:
df_mf_labHpo_labHpo = mf_labHpo_labHpo.mf_labeled()
# add labels
df_mf_labHpo_labHpo['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_labHpo_labHpo.P1])
df_mf_labHpo_labHpo['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_labHpo_labHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identifical, so remove one
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[df_mf_labHpo_labHpo.P1 < df_mf_labHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms 
mask = np.array([hpo.has_dependency(df_mf_labHpo_labHpo.P1[i], df_mf_labHpo_labHpo.P2[i]) for i in np.arange(df_mf_labHpo_labHpo.shape[0])])
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_labHpo_labHpo.csv')
df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
0,HP:0002157,HP:0031970,0.744493,Azotemia,Abnormal blood urea nitrogen concentration
1,HP:0020061,HP:0031850,0.575781,Abnormal hemoglobin concentration,Abnormal hematocrit
2,HP:0020062,HP:0031851,0.546701,Decreased hemoglobin concentration,Reduced hematocrit
3,HP:0020058,HP:0020061,0.543036,Abnormal red blood cell count,Abnormal hemoglobin concentration
4,HP:0020058,HP:0031850,0.533428,Abnormal red blood cell count,Abnormal hematocrit
5,HP:0020060,HP:0020062,0.514094,Decreased red blood cell count,Decreased hemoglobin concentration
6,HP:0500164,HP:0500165,0.499272,Abnormal blood carbon dioxide level,Abnormal blood oxygen level
7,HP:0020060,HP:0031851,0.48535,Decreased red blood cell count,Reduced hematocrit
8,HP:0020061,HP:0031851,0.433088,Abnormal hemoglobin concentration,Reduced hematocrit
9,HP:0001882,HP:0004332,0.417853,Leukopenia,Abnormal lymphocyte morphology


## Mutual information between textHpo and labHpo in respect to diagnoses

## Only consider primary diagnosis

In [129]:
def mf_dataframes(mf_diagnosis_phenotypes, p_mf_Xz, p_mf_Yz, p_mf_XY_z, p_mf_XY_given_z, p_synergy, p_mf_XY_omit_z):
    X_labels, Y_labels = mf_diagnosis_phenotypes.vars_labels.values()
    M1 = len(X_labels)
    M2 = len(Y_labels)

    mf_Xz = mf_diagnosis_phenotypes.mutual_info_Xz()
    mf_Yz = mf_diagnosis_phenotypes.mutual_info_Yz()

    # mutual information between single phenotypes and diagnosis
    df_mf_Xz = pd.DataFrame(data={'X': X_labels, 'mf_Xz': mf_Xz})
    df_mf_Yz = pd.DataFrame(data={'Y': Y_labels, 'mf_Yz': mf_Yz})
    
    # joint and conditional mutual information, and synergy
    mf_XY_z = mf_diagnosis_phenotypes.mutual_info_XY_z()
    mf_XY_given_z = mf_diagnosis_phenotypes.mutual_info_XY_given_z()
    mf_synergy = mf_diagnosis_phenotypes.synergy_XY2z()
    
    # mutual information between phenotypes without considering diagnosis
    mf_XY_omit_z = mf_diagnosis_phenotypes.mutual_info_XY_omit_z()
    
    # mutual information between phenotype pairs and diagnosis
    df_mf_XY_z = pd.DataFrame()
    df_mf_XY_z['X'] = np.repeat(X_labels, M2)
    df_mf_XY_z['Y'] = np.tile(Y_labels, [M1])
    df_mf_XY_z['mf_XY_z'] = mf_XY_z.flat
    df_mf_XY_z['mf_XY_given_z'] = mf_XY_given_z.flat
    df_mf_XY_z['synergy'] = mf_synergy.flat
    
    # mutual information between phenotypes after omiting diagnosis
    df_mf_XY_z['mf_XY_omit_z'] = mf_XY_omit_z.flat
    
    # add p values
    if p_mf_Xz is not None:
        df_mf_Xz['p_mf_Xz'] = p_mf_Xz
    if p_mf_Yz is not None:
        df_mf_Yz['p_mf_Yz'] = p_mf_Yz
    if p_mf_XY_z is not None:
        df_mf_XY_z['p_mf_XY_z'] = p_mf_XY_z.flat
    if p_mf_XY_given_z is not None:
        df_mf_XY_z['p_mf_XY_given_z'] = p_mf_XY_given_z.flat
    if p_synergy is not None:
        df_mf_XY_z['p_synergy'] = p_synergy.flat
    if p_mf_XY_omit_z is not None:
        df_mf_XY_z['p_mf_XY_omit_z'] = p_mf_XY_omit_z.flat

    return df_mf_Xz, df_mf_Yz, df_mf_XY_z

def filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z):
    df_merged = df_mf_XY_z \
        .merge(df_mf_Xz, how='left', on=['X']) \
        .merge(df_mf_Yz, how='left', on=['Y'])

    ## filter out identifical pairs: a, b is the same as b, a
    df_filtered = df_merged.loc[df_merged.X < df_merged.Y, :].reset_index(drop=True)
    mask = np.array([hpo.has_dependency(df_filtered.X[i], df_filtered.Y[i]) for i in np.arange(len(df_filtered))])
    df_filtered = df_filtered.loc[np.logical_not(mask), ].reset_index(drop=True)
    return df_filtered

def entropy(case, control):
    total = case + control 
    h = -(case / total * np.log2(case/total) + control/total * np.log2(control/total))
    return h
    

def load_p_values(path):
    with open(path, 'rb') as f:
        p = pickle.load(f)
    return p

In [52]:
with open('../../../data/mf_regarding_diseases/primary_only/summaries_diagnosis_textHpo_labHpo.obj', 'rb') as f:
    summaries_diagnosis_textHpo_labHpo = pickle.load(f)
with open('../../../data/mf_regarding_diseases/primary_only/summaries_diagnosis_textHpo_textHpo.obj', 'rb') as f:
    summaries_diagnosis_textHpo_textHpo = pickle.load(f)
with open('../../../data/mf_regarding_diseases/primary_only/summaries_diagnosis_labHpo_labHpo.obj', 'rb') as f:
    summaries_diagnosis_labHpo_labHpo = pickle.load(f)

### textHpo-labHpo pairs

In [130]:
disease = '038'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_labHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/primary_only/{}/p_value_textHpo_textHpo_038_primary_only.obj'.format(disease))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [131]:
df_mf_textHpo_labHpo.head()

Unnamed: 0,P1,P2,mf,P1_label,P2_label
0,HP:0002086,HP:0032180,0.108733,Abnormality of the respiratory system,Abnormal circulating metabolite concentration
1,HP:0002086,HP:0012337,0.110931,Abnormality of the respiratory system,Abnormal homeostasis
2,HP:0002086,HP:0003111,0.099472,Abnormality of the respiratory system,Abnormal blood ion concentration
3,HP:0002086,HP:0031850,0.102414,Abnormality of the respiratory system,Abnormal hematocrit
4,HP:0002086,HP:0020058,0.102433,Abnormality of the respiratory system,Abnormal red blood cell count


In [107]:
# mutual information between textHpo and diagnosis
df_mf_textHpo_diagnosis = df_mf_Xz \
    .assign(X_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_Xz.X])) \
    .sort_values(by='mf_Xz', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P', 'mf_Xz': 'mf_P_diag', 'p_mf_Xz': 'p_mf_P_diag', 'X_label': 'P_label'})

#df_mf_textHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/primary_only/mf_textHpo_diag_{}.csv'.format(disease))
df_mf_textHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0100806,0.048477,0.0,Sepsis
1,HP:0001945,0.01215,0.0,Fever
2,HP:0002615,0.011971,0.0,Hypotension
3,HP:0002090,0.011828,0.0,Pneumonia
4,HP:0031273,0.008683,0.0,Shock


In [102]:
# mutual information between labHpo and diagnosis
df_mf_labHpo_diagnosis = df_mf_Yz \
    .assign(Y_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_Yz.Y])) \
    .sort_values(by='mf_Yz', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'Y': 'P', 'mf_Yz': 'mf_P_diag', 'p_mf_Yz': 'p_mf_P_diag', 'Y_label': 'P_label'})

#df_mf_labHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/primary_only/mf_labHpo_diag_{}.csv'.format(disease))
df_mf_labHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0012614,0.03057,0.0,Abnormal urine cytology
1,HP:0000119,0.030042,0.0,Abnormality of the genitourinary system
2,HP:0000079,0.030042,0.0,Abnormality of the urinary system
3,HP:0011277,0.030042,0.0,Abnormality of the urinary system physiology
4,HP:0003110,0.030042,0.0,Abnormality of urine homeostasis


In [137]:
# save synergies
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])
df_mf_XY_z_filtered.rename(columns={'X':'P1', 'Y': 'P2'}) \
    .sort_values(by=['synergy'], ascending=False) \
    .reset_index(drop=True) \
    .loc[:, ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'synergy', 'p_mf_Xz', 'p_mf_Yz', 'p_synergy', 'P1_label', 'P2_label']] \
    .rename(columns={'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_synergy_textHpo_labHpo_{}.csv'.format(disease))

In [139]:
# save ratios between conditional mutual info and overall mutual info
df_mf_vs_conditional_mf = df_mf_XY_z_filtered \
    .assign(mf_ratio=df_mf_XY_z_filtered.mf_XY_given_z/df_mf_XY_z_filtered.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2'})
df_mf_vs_conditional_mf \
    .loc[:, ['P1', 'P2', 'mf_XY_omit_z', 'mf_XY_given_z', 'mf_ratio','synergy', 'p_mf_XY_omit_z', 'p_mf_XY_given_z', 'p_synergy', 'P1_label', 'P2_label']] \
    .rename(columns={'mf_XY_given_z': 'mf_P1P2_given_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_textHpo_labHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head(n=10)

Unnamed: 0,P1,P2,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0002097,HP:0031962,0.003477,1.9e-05,1.9e-05,1.691589e-07,0.0,0.997649,0.082297,0.331842,9.916451e-07,0.391914,0.003457,0.0,Emphysema,Elevated serum anion gap,113.917167
1,HP:0002107,HP:0002904,0.003212,0.00016,0.000135,2.445166e-05,0.0,0.000834,0.000531,0.440003,0.0004283993,0.0,0.002648,0.0,Pneumothorax,Hyperbilirubinemia,6.523978
2,HP:0002107,HP:0410174,0.006334,0.000225,0.000167,5.745239e-05,0.0,0.0,0.000228,0.035042,0.0004283993,0.0,0.005738,0.0,Pneumothorax,Increased troponin T level in blood,3.914375
3,HP:0001394,HP:0001626,0.006922,0.000125,9.2e-05,3.281417e-05,0.0,0.003186,0.002048,0.119691,0.001630781,0.0,0.005199,0.0,Cirrhosis,Abnormality of the cardiovascular system,3.808927
4,HP:0001394,HP:0500015,0.006922,0.000125,9.2e-05,3.281417e-05,0.0,0.003034,0.003565,0.11294,0.001630781,0.0,0.005199,0.0,Cirrhosis,Abnormal cardiac test,3.808927
5,HP:0001394,HP:0500020,0.006922,0.000125,9.2e-05,3.281417e-05,0.0,0.0022,0.002275,0.11696,0.001630781,0.0,0.005199,0.0,Cirrhosis,Abnormal cardiac biomarker test,3.808927
6,HP:0002107,HP:0005518,0.003412,7.3e-05,4.6e-05,2.683312e-05,0.0,0.035953,0.021541,0.136833,0.0004283993,0.0,0.002937,0.0,Pneumothorax,Increased mean corpuscular volume,2.725378
7,HP:0001394,HP:0410174,0.00744,0.000142,7.2e-05,7.071828e-05,0.0,0.002579,0.007812,0.022679,0.001630781,0.0,0.005738,0.0,Cirrhosis,Increased troponin T level in blood,2.011113
8,HP:0002097,HP:0003259,0.011764,1.3e-05,6e-06,6.247333e-06,0.0,0.538835,0.456538,0.743249,9.916451e-07,0.391914,0.011756,0.0,Emphysema,Elevated serum creatinine,2.006758
9,HP:0002107,HP:0003573,0.003385,0.000257,0.000128,0.0001291306,0.0,0.0,0.001062,0.000986,0.0004283993,0.0,0.002829,0.0,Pneumothorax,Increased total bilirubin,1.987725


In [120]:
df_mf_vs_conditional_mf = df_mf_XY_z_filtered
    .merge(df_mf_textHpo_labHpo.loc[:, ['P1', 'P2', 'mf', 'P1_label', 'P2_label']] \
            .rename(columns={'mf': 'mf_overall'}), left_on=['X', 'Y'], right_on = ['P1', 'P2'], how = 'left')
df_mf_vs_conditional_mf = df_mf_vs_conditional_mf \
    .assign(mf_ratio=df_mf_vs_conditional_mf.mf_XY_z/df_mf_vs_conditional_mf.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) 
df_mf_vs_conditional_mf \
    .loc[:, ['P1', 'P2', 'mf_XY_omit_z', 'mf_XY_given_z', 'mf_ratio','synergy', 'p_mf_XY_omit_z', 'p_mf_XY_given_z', 'p_synergy', 'P1_label', 'P2_label']] \
    .rename(columns={'mf_XY_given_z': 'mf_P1P2_given_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_textHpo_labHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head(n=10)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,mf_ratio
0,HP:0002097,HP:0031962,0.003477,1.9e-05,1.9e-05,1.691589e-07,0.0,0.997649,0.082297,0.331842,9.916451e-07,0.391914,0.003457,0.0,20553.006879
1,HP:0002097,HP:0003259,0.011764,1.3e-05,6e-06,6.247333e-06,0.0,0.538835,0.456538,0.743249,9.916451e-07,0.391914,0.011756,0.0,1883.007111
2,HP:0001394,HP:0001626,0.006922,0.000125,9.2e-05,3.281417e-05,0.0,0.003186,0.002048,0.119691,0.001630781,0.0,0.005199,0.0,210.954076
3,HP:0001394,HP:0500020,0.006922,0.000125,9.2e-05,3.281417e-05,0.0,0.0022,0.002275,0.11696,0.001630781,0.0,0.005199,0.0,210.954076
4,HP:0001394,HP:0500015,0.006922,0.000125,9.2e-05,3.281417e-05,0.0,0.003034,0.003565,0.11294,0.001630781,0.0,0.005199,0.0,210.954076
5,HP:0002107,HP:0002904,0.003212,0.00016,0.000135,2.445166e-05,0.0,0.000834,0.000531,0.440003,0.0004283993,0.0,0.002648,0.0,131.341275
6,HP:0002107,HP:0005518,0.003412,7.3e-05,4.6e-05,2.683312e-05,0.0,0.035953,0.021541,0.136833,0.0004283993,0.0,0.002937,0.0,127.156505
7,HP:0002107,HP:0410174,0.006334,0.000225,0.000167,5.745239e-05,0.0,0.0,0.000228,0.035042,0.0004283993,0.0,0.005738,0.0,110.239646
8,HP:0002097,HP:0003573,0.002854,5.1e-05,2.4e-05,2.670975e-05,0.0,0.11074,0.073043,0.124242,9.916451e-07,0.391914,0.002829,0.0,106.863586
9,HP:0001394,HP:0003236,0.003254,3.9e-05,9e-06,3.06872e-05,0.0,0.20669,0.990974,0.12189,0.001630781,0.0,0.001614,0.0,106.024281


In [84]:
df_4_cytoscape = df_mf_XY_z_filtered.merge(df_mf_textHpo_labHpo, how = 'left', left_on=['X', 'Y'], right_on=['P1', 'P2']) \
    .rename(columns={'mf': 'mf_overall'}) \
    .sort_values(by=['synergy'], ascending=False) \
    .reset_index(drop=True) \

df_4_cytoscape.head()
    #.loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'P1_label', 'P2_label']] \
    
n = math.floor(len(df_4_cytoscape) * 0.01)
# edges
df_4_cytoscape \
    .assign(P1 = lambda x: 'Rad_' + x['P1']) \
    .assign(P2 = lambda x: 'Lab_' + x['P2']) \
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']] \
    .head(n = n) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/edges_textHpo_labHpo_{}.csv'.format(disease))
# nodes


In [85]:
node_labels = df_4_cytoscape \
    .assign(P1 = lambda x: 'Rad_' + x['P1']) \
    .assign(P2 = lambda x: 'Lab_' + x['P2']) \
    .loc[:, ['P1', 'P2', 'P1_label', 'P2_label']] \
    .head(n = n)
nodes = pd.DataFrame()
nodes['term_id'] = np.concatenate([node_labels.P1, node_labels.P2])
nodes['term_labels'] = np.concatenate([node_labels.P1_label, node_labels.P2_label])
nodes['type'] = np.repeat(["Rad", "Lab"], len(node_labels))
nodes.drop_duplicates().reset_index(drop=True).head()
nodes.drop_duplicates().reset_index(drop=True).to_csv('../../../data/mf_regarding_diseases/primary_only/cytoscape/nodes_textHpo_labHpo_{}.csv'.format(disease))

### labHpo-labHpo pairs

In [121]:
disease = '038'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_labHpo_labHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/primary_only/{}/p_value_labHpo_labHpo_038_primary_only.obj'.format(disease))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [123]:
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])
#df_mf_XY_z_filtered.sort_values(by='mf_XY_z', ascending=False).head(n=20)
df_mf_joint_vs_individual = df_mf_XY_z_filtered \
    .sort_values(by='mf_XY_z', ascending=False) \
    .reset_index(drop=True)
    #.rename(columns={'mf_joint': 'mf_P1P2_diag'}) \
    #.loc[:, ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'P1_label', 'P2_label']]

df_mf_joint_vs_individual.loc[:, ['X', 'Y', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'p_mf_Xz', 'p_mf_Yz', 'P1_label', 'P2_label']] \
    .rename(columns={'X': 'P1', 'Y': 'P2', 'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_joint_vs_individual_labHpo_labHpo_{}.csv'.format(disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0010974,HP:0012614,0.041284,0.075577,-0.011386,0.086964,0.0,0.0,0.0,0.0,0.0221,0.0,0.03057,0.0,Abnormal myeloid leukocyte morphology,Abnormal urine cytology
1,HP:0001874,HP:0012614,0.040905,0.045185,-0.007999,0.053184,0.0,0.0,0.0,0.0,0.018334,0.0,0.03057,0.0,Abnormality of neutrophils,Abnormal urine cytology
2,HP:0011991,HP:0012614,0.040905,0.045185,-0.007999,0.053184,0.0,0.0,0.0,0.0,0.018334,0.0,0.03057,0.0,Abnormal neutrophil count,Abnormal urine cytology
3,HP:0012614,HP:0032236,0.040905,0.045185,-0.007999,0.053184,0.0,0.0,0.0,0.0,0.03057,0.0,0.018334,0.0,Abnormal urine cytology,Increased circulating immature neutrophil count
4,HP:0000079,HP:0010974,0.040803,0.073298,-0.011339,0.084637,0.0,0.0,0.0,0.0,0.030042,0.0,0.0221,0.0,Abnormality of the urinary system,Abnormal myeloid leukocyte morphology


In [46]:
entropy(mf_diagnosis_phenotypes.case_N, mf_diagnosis_phenotypes.control_N) 

0.3172947974778557

In [94]:
df_mf_XY_z_filtered \
    .sort_values(by='synergy', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy', 'P1_label', 'P2_label']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_synergy_labHpo_labHpo_{}.csv'.format(disease))
df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False).head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,p_mf_XY_z,p_mf_XY_given_z,p_synergy,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
1358,HP:0020062,HP:0020063,0.011461,0.010987,0.004039,0.0,0.0,0.0,0.007413,0.0,9e-06,0.883104,Decreased hemoglobin concentration,Increased hemoglobin concentration
5399,HP:0012419,HP:0500164,0.010003,0.277064,0.003415,0.0,0.0,0.0,7e-06,0.787928,0.00658,0.0,Hyperoxemia,Abnormal blood carbon dioxide level
4874,HP:0001941,HP:0012419,0.019428,0.072473,0.003409,0.0,0.0,0.0,0.016012,0.0,7e-06,0.795643,Acidosis,Hyperoxemia
4974,HP:0002151,HP:0012419,0.019428,0.072473,0.003409,0.0,0.0,0.0,0.016012,0.0,7e-06,0.795643,Increased serum lactate,Hyperoxemia
4786,HP:0012417,HP:0012419,0.012985,0.149472,0.003316,0.0,0.0,0.0,0.009662,0.0,7e-06,0.795643,Hypocapnia,Hyperoxemia
5405,HP:0012419,HP:0032368,0.010558,0.174678,0.002864,0.0,0.0,0.0,7e-06,0.787928,0.007687,0.0,Hyperoxemia,Acidemia
3555,HP:0020063,HP:0031851,0.010694,0.041159,0.002584,0.0,0.0,0.0,9e-06,0.89069,0.008101,0.0,Increased hemoglobin concentration,Reduced hematocrit
1090,HP:0004360,HP:0012419,0.019735,0.160065,0.002563,0.0,0.0,0.0,0.017165,0.0,7e-06,0.795643,Abnormality of acid-base homeostasis,Hyperoxemia
1613,HP:0020060,HP:0020063,0.010415,0.013912,0.002498,0.0,0.0,0.0,0.007907,0.0,9e-06,0.883104,Decreased red blood cell count,Increased hemoglobin concentration
3843,HP:0012418,HP:0012419,0.010168,0.163692,0.002441,0.0,0.0,0.0,0.007719,0.0,7e-06,0.795643,Hypoxemia,Hyperoxemia


In [124]:
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0001939,HP:0020061,0.013182,0.199813,-0.00609,0.205904,0.0,0.0,0.0,0.0,0.008728,0.0,0.010544,0.0,Abnormality of metabolism/homeostasis,Abnormal hemoglobin concentration
1,HP:0001939,HP:0020058,0.011988,0.201278,-0.005557,0.206834,0.0,0.0,0.0,0.0,0.008728,0.0,0.008816,0.0,Abnormality of metabolism/homeostasis,Abnormal red blood cell count
2,HP:0001939,HP:0031850,0.011544,0.198075,-0.005303,0.203378,0.0,0.0,0.0,0.0,0.008728,0.0,0.008118,0.0,Abnormality of metabolism/homeostasis,Abnormal hematocrit
3,HP:0001939,HP:0002715,0.026861,0.07272,-0.005697,0.078417,0.0,0.0,0.0,0.0,0.008728,0.0,0.023831,0.0,Abnormality of metabolism/homeostasis,Abnormality of the immune system
4,HP:0001939,HP:0011893,0.02689,0.072282,-0.005684,0.077966,0.0,0.0,0.0,0.0,0.008728,0.0,0.023846,0.0,Abnormality of metabolism/homeostasis,Abnormal leukocyte count


In [127]:
df_mf_vs_conditional_mf = df_mf_XY_z_filtered 
df_mf_vs_conditional_mf = df_mf_vs_conditional_mf \
    .assign(mf_ratio=df_mf_vs_conditional_mf.mf_XY_given_z/df_mf_vs_conditional_mf.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) 
df_mf_vs_conditional_mf \
    .loc[:, ['X', 'Y', 'mf_XY_omit_z', 'mf_XY_given_z', 'p_mf_XY_omit_z', 'p_mf_XY_given_z', 'mf_ratio', 'P1_label', 'P2_label']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_labHpo_labHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0001873,HP:0025547,0.007482,0.000192,0.000191,8.506363e-07,0.0,0.000389,6.5e-05,0.375195,0.003835,0.0,0.003455,0.0,Thrombocytopenia,Decreased mean corpuscular hemoglobin concentr...,225.832391
1,HP:0001626,HP:0002904,0.007867,2e-05,2e-05,4.838742e-07,0.0,0.755187,0.492414,0.315158,0.005199,0.0,0.002648,0.0,Abnormality of the cardiovascular system,Hyperbilirubinemia,41.720508
2,HP:0002904,HP:0500015,0.007867,2e-05,2e-05,4.838742e-07,0.0,0.753112,0.504214,0.31438,0.002648,0.0,0.005199,0.0,Hyperbilirubinemia,Abnormal cardiac test,41.720508
3,HP:0002904,HP:0500020,0.007867,2e-05,2e-05,4.838742e-07,0.0,0.764328,0.505511,0.315677,0.002648,0.0,0.005199,0.0,Hyperbilirubinemia,Abnormal cardiac biomarker test,41.720508
4,HP:0003573,HP:0020058,0.012599,0.000979,0.000954,2.522429e-05,0.0,0.0,0.0,0.15048,0.002829,0.0,0.008816,0.0,Increased total bilirubin,Abnormal red blood cell count,38.818698


### textHpo-textHpo pairs

In [52]:
disease = '038'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_textHpo[disease])

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dateframes(mf_diagnosis_phenotypes)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [53]:
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.P1])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.P2])
df_mf_XY_z_filtered.sort_values(by='mf_joint', ascending=False).head(n=20)
df_mf_joint_vs_individual = df_mf_XY_z_filtered \
    .sort_values(by='mf_joint', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'mf_joint': 'mf_P1P2_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'mf_P1P2_diag', 'P1_label', 'P2_label']]

#df_mf_joint_vs_individual.to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_joint_vs_individual_labHpo_labHpo_{}.csv'.format(disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,P1,P2,mf_P1_diag,mf_P2_diag,mf_P1P2_diag,P1_label,P2_label
0,HP:0002615,HP:0100806,0.011971,0.048477,0.05362,Hypotension,Sepsis
1,HP:0001945,HP:0100806,0.01215,0.048477,0.053482,Fever,Sepsis
2,HP:0002090,HP:0100806,0.011828,0.048477,0.052257,Pneumonia,Sepsis
3,HP:0031864,HP:0100806,0.007931,0.048477,0.051703,Bacteremia,Sepsis
4,HP:0031273,HP:0100806,0.008683,0.048477,0.051284,Shock,Sepsis


In [54]:
df_mf_XY_z_filtered \
    .sort_values(by='synergy', ascending=False) \
    .reset_index(drop=True) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'P1_label', 'P2_label']] \
    .to_csv('../../../data/mf_regarding_diseases/primary_only/df_synergy_textHpo_textHpo_{}.csv'.format(disease))
df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False).head(n=20)

Unnamed: 0,P1,P2,mf_joint,mf_conditional,synergy,mf_P1_diag,mf_P2_diag,P1_label,P2_label
7049,HP:0002107,HP:0002202,0.010257,0.102225,0.002421,0.000428,0.007408,Pneumothorax,Pleural effusion
7047,HP:0002107,HP:0100750,0.005637,0.103158,0.001727,0.000428,0.003481,Pneumothorax,Atelectasis
7076,HP:0002107,HP:0100806,0.050099,0.00287,0.001193,0.000428,0.048477,Pneumothorax,Sepsis
1836,HP:0002090,HP:0002107,0.013417,0.016823,0.00116,0.011828,0.000428,Pneumonia,Pneumothorax
7064,HP:0002107,HP:0100598,0.008104,0.024275,0.001033,0.000428,0.006642,Pneumothorax,Pulmonary edema
3816,HP:0001640,HP:0002107,0.004416,0.029927,0.000739,0.003249,0.000428,Cardiomegaly,Pneumothorax
470,HP:0000969,HP:0002107,0.005312,0.026408,0.000611,0.004273,0.000428,Edema,Pneumothorax
7107,HP:0002107,HP:0002878,0.007324,0.006544,0.000597,0.000428,0.006298,Pneumothorax,Respiratory failure
3959,HP:0001945,HP:0002107,0.013094,0.00443,0.000516,0.01215,0.000428,Fever,Pneumothorax
7078,HP:0002107,HP:0002835,0.004127,0.012274,0.00044,0.000428,0.003259,Pneumothorax,Aspiration


In [55]:
df_mf_vs_conditional_mf = df_mf_XY_z_filtered \
    .merge(df_mf_labHpo_labHpo.loc[:, ['P1', 'P2', 'mf']] \
            .rename(columns={'mf': 'mf_overall'}), on = ['P1', 'P2'], how = 'left')
df_mf_vs_conditional_mf = df_mf_vs_conditional_mf \
    .assign(mf_ratio=df_mf_vs_conditional_mf.mf_conditional/df_mf_vs_conditional_mf.mf_overall) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) 
df_mf_vs_conditional_mf.loc[:, ['P1', 'P2', 'mf_overall', 'mf_conditional', 'mf_ratio', 'P1_label', 'P2_label']].to_csv('../../../data/mf_regarding_diseases/primary_only/df_mf_ratio_textHpo_textHpo_{}.csv'.format(disease))
df_mf_vs_conditional_mf.head(n=20)

Unnamed: 0,P1,P2,mf_joint,mf_conditional,synergy,mf_P1_diag,mf_P2_diag,P1_label,P2_label,mf_overall,mf_ratio
0,HP:0001871,HP:0010935,0.002212,0.057406,-0.000507,0.001183,0.001536,Abnormality of blood and blood-forming tissues,Abnormality of the upper urinary tract,0.003004,19.107635
1,HP:0000077,HP:0001871,0.002282,0.056714,-0.000537,0.001636,0.001183,Abnormality of the kidney,Abnormality of blood and blood-forming tissues,0.003004,18.87729
2,HP:0001626,HP:0001871,0.002357,0.123085,-0.000775,0.001949,0.001183,Abnormality of the cardiovascular system,Abnormality of blood and blood-forming tissues,0.008298,14.833288
3,HP:0001871,HP:0012211,0.002051,0.039039,-0.000479,0.001183,0.001347,Abnormality of blood and blood-forming tissues,Abnormal renal physiology,0.003004,12.994205
4,HP:0000818,HP:0001871,0.001585,0.02142,-0.00022,0.000622,0.001183,Abnormality of the endocrine system,Abnormality of blood and blood-forming tissues,0.001865,11.487598
5,HP:0001626,HP:0025031,0.002577,0.085178,-0.000813,0.001949,0.001442,Abnormality of the cardiovascular system,Abnormality of the digestive system,0.011093,7.678759
6,HP:0001871,HP:0025031,0.002019,0.068752,-0.000606,0.001183,0.001442,Abnormality of blood and blood-forming tissues,Abnormality of the digestive system,0.009852,6.978418
7,HP:0001939,HP:0010935,0.003223,0.051591,-0.000765,0.002453,0.001536,Abnormality of metabolism/homeostasis,Abnormality of the upper urinary tract,0.007846,6.575722
8,HP:0000077,HP:0001939,0.003302,0.050514,-0.000786,0.001636,0.002453,Abnormality of the kidney,Abnormality of metabolism/homeostasis,0.007846,6.438485
9,HP:0001626,HP:0002012,0.00242,0.067943,-0.000653,0.001949,0.001124,Abnormality of the cardiovascular system,Abnormality of the abdominal organs,0.011093,6.125025


In [None]:
with open('synergies_radiology_lab_primary_and_secondary.obj', 'rb') as synergies_file:
    synergies_rad_lab = pickle.load(synergies_file)

In [None]:
len(synergies_rad_lab)

In [3]:
def entropy(case, control):
    total = case + control 
    h = -(case / total * np.log2(case/total) + control/total * np.log2(control/total))
    return h
    

def load_p_values(path):
    with open(path, 'rb') as f:
        p = pickle.load(f)
    return p

def filtered_synergy_dataframe(synergy, p_values=None, percentile_cut=None):
    if p_values is not None:
        data = synergy.pairwise_synergy_labeled_with_p_values(p_values)
    else:
        data = synergy.pairwise_synergy_labeled()
    # remove directly dependent terms 
    mask = np.array([hpo.has_dependency(data.P1[i], data.P2[i]) for i in np.arange(data.shape[0])])
    data_filtered = data.loc[np.logical_not(mask), :].sort_values(by = 'synergy', ascending=False)
    data_filtered = data_filtered.loc[data_filtered.P1 < data_filtered.P2, :]
    data_filtered['P1_radiology_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in data_filtered.P1])
    data_filtered['P2_lab_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in data_filtered.P2])
    
    
    if (percentile_cut == None):
        percentile = 1
    else:
        percentile = percentile_cut / 100
    top_percentile = data_filtered.iloc[0:math.ceil(percentile * len(data_filtered)), :]
    
    
    I, II = synergy.mutual_information()
    Ia, Ib = I.values()
    p1_labels, p2_labels = synergy.vars_labels.values()
    mf_P1 = pd.DataFrame(data={'P1': p1_labels, 'mf_d_P1': Ia})
    mf_P2 = pd.DataFrame(data={'P2': p2_labels, 'mf_d_P2': Ib})
    mf_d_P1P2 = pd.DataFrame(data={'P1': np.repeat(p1_labels, len(p2_labels)), 'P2': np.tile(p2_labels, [len(p1_labels)]), 'mf_d_P1P2': II.flat})
    fully_labeled = top_percentile.merge(mf_P1, on='P1').merge(mf_P2, on='P2').merge(mf_d_P1P2, on=['P1', 'P2'])
    
    return fully_labeled

In [None]:
#p_values = load_p_values('p_value_428.obj')
filtered_data = filtered_synergy_dataframe(synergies_rad_lab, icd = '428', icd_label = 'heart_failure', p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology_labtest-428_primary_and_secondary.csv', index=False)

In [None]:
#p_value = load_p_values('p_value_584.obj')
filtered_data = filtered_synergy_dataframe(synergies_rad_lab, icd = '584', icd_label = 'acute_renal_failure', p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology-labtest-584_primary_and_secondary.csv', index=False)

In [None]:
#p_values = load_p_values('p_value_038.obj')
filtered_data = filtered_synergy_dataframe(synergies_rad_lab, icd = '038', icd_label = 'sepsis', p_values=None, percentile_cut = 5)
filtered_data.to_csv('synergy-radiology_labtest-038_primary_and_secondary.csv', index=False)

### Just look at primary diagnosis


In [4]:
mf_all = pd.read_csv('mutual_info_textHpo_labHpo.csv')
mf_all.head()

Unnamed: 0,P1,P2,entropy_P1,entropy_P2,mf_P1_P2
0,HP:0000001,HP:0000118,0.367357,0.142722,0.011065
1,HP:0000001,HP:0000001,0.367357,0.142722,0.011065
2,HP:0000001,HP:0001939,0.367357,0.180924,0.014722
3,HP:0000001,HP:0001871,0.367357,0.20622,0.019528
4,HP:0000001,HP:0001877,0.367357,0.239249,0.017066


In [5]:
with open('synergies_radiology_lab_primary_only.obj', 'rb') as synergies_file:
    synergies_rad_lab_primary_only = pickle.load(synergies_file)

In [6]:
#p_values = load_p_values('p_value_428_primary_only.obj')
synergy = synergies_rad_lab_primary_only['428']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']


In [9]:
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']
include_overall_mf.head()


Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,RAD_RAD_HP:0002202,Lab_HP:0004363,0.002496,Pleural effusion,Abnormal circulating calcium concentration,0.002937,4.7e-05,0.005479,0.966136,0.869158,0.102352,0.9632,0.869111
1,RAD_RAD_HP:0001640,Lab_HP:0004363,0.001359,Cardiomegaly,Abnormal circulating calcium concentration,0.007596,4.7e-05,0.009002,0.819058,0.869158,0.036363,0.811462,0.869111
2,RAD_RAD_HP:0001635,Lab_HP:0004363,0.001114,Congestive heart failure,Abnormal circulating calcium concentration,0.011296,4.7e-05,0.012457,0.420952,0.869158,0.013604,0.409657,0.869111
3,RAD_RAD_HP:0000969,Lab_HP:0004363,0.000966,Edema,Abnormal circulating calcium concentration,0.001782,4.7e-05,0.002795,0.930309,0.869158,0.07126,0.928527,0.869111
4,RAD_RAD_HP:0002086,Lab_HP:0004363,0.000759,Abnormality of the respiratory system,Abnormal circulating calcium concentration,0.00193,4.7e-05,0.002736,0.671973,0.869158,0.080085,0.670043,0.869111


In [11]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(20)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-428_corrected.csv', index=False)

In [12]:
#p_values = load_p_values('p_value_584_primary_only.obj')
synergy = synergies_rad_lab_primary_only['584']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']

In [13]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(20)
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-584_corrected.csv', index=False)

In [6]:
#p_values = load_p_values('p_value_038_primary_only.obj')
p_values = None
synergy = synergies_rad_lab_primary_only['038']
entropy_diag = entropy(synergy.case_N, synergy.control_N)
filtered_data = filtered_synergy_dataframe(synergy, p_values=p_values, percentile_cut = 5)
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf['P1'] = 'RAD_' + include_overall_mf['P1']
include_overall_mf['P2'] = 'Lab_' + include_overall_mf['P2']
include_overall_mf['synergy_norm'] = include_overall_mf['synergy'] / entropy_diag

In [7]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d,synergy_norm
0,RAD_HP:0002107,Lab_HP:0004363,0.001801,Pneumothorax,Abnormal circulating calcium concentration,0.000428,0.017524,0.019753,0.659875,0.869158,0.037493,0.659447,0.851634,0.005676
2,RAD_HP:0002107,Lab_HP:0004360,0.001728,Pneumothorax,Abnormality of acid-base homeostasis,0.000428,0.017165,0.019322,0.659875,0.836642,0.030492,0.659447,0.819477,0.005448
4,RAD_HP:0002107,Lab_HP:0010927,0.001728,Pneumothorax,Abnormal blood inorganic cation concentration,0.000428,0.018203,0.020359,0.659875,0.836345,0.035441,0.659447,0.818143,0.005446
6,RAD_HP:0002107,Lab_HP:0002901,0.001702,Pneumothorax,Hypocalcemia,0.000428,0.017857,0.019987,0.659875,0.898148,0.034895,0.659447,0.880291,0.005363
8,RAD_HP:0002107,Lab_HP:0002151,0.00162,Pneumothorax,Increased serum lactate,0.000428,0.016012,0.01806,0.659875,0.923926,0.021633,0.659447,0.907914,0.005105
10,RAD_HP:0002107,Lab_HP:0032368,0.001547,Pneumothorax,Acidemia,0.000428,0.007687,0.009662,0.659875,0.933695,0.041994,0.659447,0.926008,0.004876
12,RAD_HP:0002107,Lab_HP:0002795,0.001523,Pneumothorax,Functional respiratory abnormality,0.000428,0.006194,0.008145,0.659875,0.997066,0.037979,0.659447,0.990872,0.004799
14,RAD_HP:0002107,Lab_HP:0012415,0.001523,Pneumothorax,Abnormal blood gas level,0.000428,0.006194,0.008145,0.659875,0.997066,0.037979,0.659447,0.990872,0.004799
16,RAD_HP:0002107,Lab_HP:0010929,0.001512,Pneumothorax,Abnormal blood cation concentration,0.000428,0.01762,0.019561,0.659875,0.695458,0.028991,0.659447,0.677838,0.004765
18,RAD_HP:0002107,Lab_HP:0020062,0.001498,Pneumothorax,Decreased hemoglobin concentration,0.000428,0.007413,0.00934,0.659875,0.938389,0.046457,0.659447,0.930976,0.004722


In [37]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy-radiology_labtest_primary_only-038_corrected.csv', index=False)
edges_cyto = include_overall_mf.loc[:, ['P1', 'P2', 'synergy', 'p', 'mf_d_P1_P2', 'mf_P1_P2']]
edges_cyto.to_csv('synergy_radiology_labtest_primary_only-038-edges.csv')
nodes_cyto = include_overall_mf.loc[:, ['P1', 'P2', 'P1_radiology_label', 'P2_lab_label', 'mf_d_P1', 'mf_d_P2', 'entropy_P1', 'entropy_P2', 'entropy_P1_given_d', 'entropy_P2_given_d']]
nodes_cyto_unique = pd.DataFrame(data={'P': np.concatenate((nodes_cyto.P1, nodes_cyto.P2)), \
                                       'source': np.repeat(['RAD', 'LAB'], len(nodes_cyto)), \
                  'P_label': np.concatenate((nodes_cyto.P1_radiology_label, nodes_cyto.P2_lab_label)), \
                  'mf_d_P': np.concatenate((nodes_cyto.mf_d_P1, nodes_cyto.mf_d_P2)), \
                  'entropy': np.concatenate((nodes_cyto.entropy_P1, nodes_cyto.entropy_P2)), \
                  'conditional_entropy': np.concatenate((nodes_cyto.entropy_P1_given_d, nodes_cyto.entropy_P2_given_d))}).drop_duplicates()
nodes_cyto_unique.to_csv('synergy_radiology_labtest_primary_only-038-nodes.csv')

In [39]:
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,p,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d,synergy_norm
0,RAD_HP:0002107,Lab_HP:0002151,0.002119,0.0,Pneumothorax,Increased serum lactate,0.000428,0.021219,0.023766,0.659875,0.923926,0.021633,0.659447,0.902707,0.006677
2,RAD_HP:0002107,Lab_HP:0032368,0.001683,0.0,Pneumothorax,Acidemia,0.000428,0.007872,0.009984,0.659875,0.933695,0.041994,0.659447,0.925823,0.005305
4,RAD_HP:0002107,Lab_HP:0002901,0.001509,0.0,Pneumothorax,Hypocalcemia,0.000428,0.01605,0.017988,0.659875,0.898148,0.034895,0.659447,0.882098,0.004757
6,RAD_HP:0002107,Lab_HP:0012418,0.001443,0.0,Pneumothorax,Hypoxemia,0.000428,0.010167,0.012038,0.659875,0.987873,0.035488,0.659447,0.977707,0.004548
8,RAD_HP:0002107,Lab_HP:0020062,0.00139,0.0,Pneumothorax,Decreased hemoglobin concentration,0.000428,0.010041,0.011859,0.659875,0.938389,0.046457,0.659447,0.928348,0.00438
10,RAD_HP:0002107,Lab_HP:0003256,0.001389,0.0,Pneumothorax,Abnormality of the coagulation cascade,0.000428,0.017719,0.019536,0.659875,0.970688,0.026865,0.659447,0.952969,0.004378
12,RAD_HP:0002107,Lab_HP:0004363,0.001377,0.0,Pneumothorax,Abnormal circulating calcium concentration,0.000428,0.01625,0.018055,0.659875,0.869158,0.037493,0.659447,0.852908,0.004339
14,RAD_HP:0002107,Lab_HP:0012200,0.001372,0.0,Pneumothorax,Abnormality of prothrombin,0.000428,0.016642,0.018443,0.659875,0.980079,0.023958,0.659447,0.963437,0.004325
16,RAD_HP:0002107,Lab_HP:0032199,0.001372,0.0,Pneumothorax,Abnormal prothrombin time,0.000428,0.016642,0.018443,0.659875,0.980079,0.023958,0.659447,0.963437,0.004325
18,RAD_HP:0002107,Lab_HP:0008151,0.001362,0.0,Pneumothorax,Prolonged prothrombin time,0.000428,0.015856,0.017647,0.659875,0.992744,0.024537,0.659447,0.976888,0.004293


In [41]:
idx_text = np.where(synergy.vars_labels['set1'] == 'HP:0002107')[0][0]
idx_lab = np.where(synergy.vars_labels['set2'] == 'HP:0002151')[0][0]
synergy.m2[idx_text, idx_lab, :] / np.sum(synergy.m2[idx_text, idx_lab, :])

array([0.00571419, 0.08383071, 0.00203473, 0.07937127, 0.03331864,
       0.21620659, 0.0163965 , 0.56312737])

In [34]:
entropy(10, 10)

1.0

## Synergy among Lab-derived Abnormal Phenotypes

In [50]:
with open('synergies-intra-labHpo-primary_only.obj', 'rb') as synergies_file:
    synergies_intra_labHpo = pickle.load(synergies_file)

In [51]:
#p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_intra_labHpo['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
filtered_data = filtered_data[filtered_data.P1 < filtered_data.P2]
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,HP:0001911,HP:0020064,0.005565,Abnormal granulocyte morphology,Abnormal eosinophil count,0.022505,0.000197,0.028266,0.176359,0.503569,0.000813,0.153855,0.503372
168,HP:0001880,HP:0001911,0.005565,Eosinophilia,Abnormal granulocyte morphology,0.000197,0.022505,0.028266,,,,,
110,HP:0001880,HP:0032309,0.005565,Eosinophilia,Abnormal granulocyte count,0.000197,0.022505,0.028266,,,,,
109,HP:0001879,HP:0032309,0.005565,Abnormal eosinophil morphology,Abnormal granulocyte count,0.000197,0.022505,0.028266,,,,,
169,HP:0002148,HP:0002905,0.004092,Hypophosphatemia,Hyperphosphatemia,0.009924,0.005304,0.01932,,,,,
117,HP:0001880,HP:0010974,0.003704,Eosinophilia,Abnormal myeloid leukocyte morphology,0.000197,0.026392,0.030292,,,,,
116,HP:0001879,HP:0010974,0.003704,Abnormal eosinophil morphology,Abnormal myeloid leukocyte morphology,0.000197,0.026392,0.030292,,,,,
1,HP:0010974,HP:0020064,0.003704,Abnormal myeloid leukocyte morphology,Abnormal eosinophil count,0.026392,0.000197,0.030292,0.183103,0.503569,0.000842,0.156711,0.503372
71,HP:0002904,HP:0011014,0.003596,Hyperbilirubinemia,Abnormal glucose homeostasis,0.00146,0.011067,0.016122,0.085283,0.656426,0.001573,0.083823,0.645359
76,HP:0002904,HP:0011015,0.003596,Hyperbilirubinemia,Abnormal blood glucose concentration,0.00146,0.011067,0.016122,0.085283,0.656426,0.001573,0.083823,0.645359


In [53]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy_intra_labHpo_038_primary_only.csv', index=False)

In [54]:
with open('synergies-intra-textHpo-primary_only.obj', 'rb') as synergies_file:
    synergies_intra_textHpo = pickle.load(synergies_file)

In [55]:
#p_values = load_p_values('p_value_038_primary_only.obj')
synergy = synergies_intra_textHpo['038']
filtered_data = filtered_synergy_dataframe(synergy, p_values=None, percentile_cut = 5)
filtered_data = filtered_data[filtered_data.P1 < filtered_data.P2]
include_overall_mf = filtered_data.merge(mf_all, on=['P1', 'P2'], how='left')
include_overall_mf['entropy_P1_given_d'] = include_overall_mf['entropy_P1'] - include_overall_mf['mf_d_P1']
include_overall_mf['entropy_P2_given_d'] = include_overall_mf['entropy_P2'] - include_overall_mf['mf_d_P2']
include_overall_mf.sort_values(by='synergy', ascending=False).head(10)

Unnamed: 0,P1,P2,synergy,P1_radiology_label,P2_lab_label,mf_d_P1,mf_d_P2,mf_d_P1P2,entropy_P1,entropy_P2,mf_P1_P2,entropy_P1_given_d,entropy_P2_given_d
0,HP:0002107,HP:0002202,0.002421,Pneumothorax,Pleural effusion,0.000428,0.007408,0.010257,,,,,
2,HP:0002107,HP:0100750,0.001727,Pneumothorax,Atelectasis,0.000428,0.003481,0.005637,,,,,
4,HP:0002107,HP:0100806,0.001193,Pneumothorax,Sepsis,0.000428,0.048477,0.050099,,,,,
610,HP:0002090,HP:0002107,0.00116,Pneumonia,Pneumothorax,0.011828,0.000428,0.013417,,,,,
6,HP:0002107,HP:0100598,0.001033,Pneumothorax,Pulmonary edema,0.000428,0.006642,0.008104,,,,,
611,HP:0001640,HP:0002107,0.000739,Cardiomegaly,Pneumothorax,0.003249,0.000428,0.004416,,,,,
612,HP:0000969,HP:0002107,0.000611,Edema,Pneumothorax,0.004273,0.000428,0.005312,,,,,
8,HP:0002107,HP:0002878,0.000597,Pneumothorax,Respiratory failure,0.000428,0.006298,0.007324,,,,,
613,HP:0001945,HP:0002107,0.000516,Fever,Pneumothorax,0.01215,0.000428,0.013094,,,,,
10,HP:0002107,HP:0002835,0.00044,Pneumothorax,Aspiration,0.000428,0.003259,0.004127,,,,,


In [56]:
include_overall_mf.sort_values(by='synergy', ascending=False).to_csv('synergy_intra_textHpo_038_primary_only.csv', index=False)