# Phenotype Synergy Analysis

This notebook contains code to interpret results from the synergy score analysis. 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
from obonet.ontology import Ontology
import pickle

In [3]:
# Location of the HPO ontology file (hp.obo). It can be overridden with the
# HPO_OBO_PATH environment variable so the notebook is not tied to a single
# machine; the fallback is the path used originally.
HPO_OBO_PATH = os.environ.get('HPO_OBO_PATH', '/Users/zhangx/git/human-phenotype-ontology/hp.obo')
hpo = hpoutil.HPO(HPO_OBO_PATH)
#TODO: replace the above with the obonet tool
#hpo2 = Ontology(HPO_OBO_PATH)

# Mutual information without considering diagnosis

In [3]:
# Load the three pre-computed pairwise summaries (text-lab, text-text, lab-lab)
# that were produced without regard to diagnosis.
# NOTE: pickle.load can execute arbitrary code; only open files produced by
# this project.
def _unpickle(path):
    """Return the object pickled in the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


summary_textHpo_labHpo = _unpickle('../../../data/mf_regardless_of_diseases/summary_textHpo_labHpo.obj')
summary_textHpo_textHpo = _unpickle('../../../data/mf_regardless_of_diseases/summary_textHpo_textHpo.obj')
summary_labHpo_labHpo = _unpickle('../../../data/mf_regardless_of_diseases/summary_labHpo_labHpo.obj')

In [4]:
# Wrap each loaded summary in an mf.MutualInfoXY object; the cells below call
# .mf_labeled() on these to obtain the pairwise mutual-information tables.
mf_textHpo_labHpo, mf_textHpo_textHpo, mf_labHpo_labHpo = [
    mf.MutualInfoXY(summary)
    for summary in (summary_textHpo_labHpo, summary_textHpo_textHpo, summary_labHpo_labHpo)
]

## TextHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [5]:
# Pairwise mutual information between textHpo and labHpo terms.
df_mf_textHpo_labHpo = mf_textHpo_labHpo.mf_labeled()

# add labels; the id -> name map is fetched once instead of once per row
hpo_names = hpo.term_id2name_map()
df_mf_textHpo_labHpo['P1_label'] = np.array([hpo_names.get(termid) for termid in df_mf_textHpo_labHpo.P1])
df_mf_textHpo_labHpo['P2_label'] = np.array([hpo_names.get(termid) for termid in df_mf_textHpo_labHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identical, so keep only the P1 < P2 orientation
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[df_mf_textHpo_labHpo.P1 < df_mf_textHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms (pairs for which hpo.has_dependency is true)
mask = np.array(
    [hpo.has_dependency(p1, p2) for p1, p2 in zip(df_mf_textHpo_labHpo.P1, df_mf_textHpo_labHpo.P2)],
    dtype=bool,
)
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[~mask, :].reset_index(drop=True)
#df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_labHpo.csv')
df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
1968,HP:0002202,HP:0020062,0.12566,Pleural effusion,Decreased hemoglobin concentration
1958,HP:0002202,HP:0020061,0.121226,Pleural effusion,Abnormal hemoglobin concentration
1959,HP:0002202,HP:0011015,0.12101,Pleural effusion,Abnormal blood glucose concentration
1960,HP:0002202,HP:0011014,0.12101,Pleural effusion,Abnormal glucose homeostasis
1962,HP:0002202,HP:0031851,0.117206,Pleural effusion,Reduced hematocrit
1957,HP:0002202,HP:0020058,0.115497,Pleural effusion,Abnormal red blood cell count
1961,HP:0002202,HP:0010929,0.115105,Pleural effusion,Abnormal blood cation concentration
1972,HP:0002202,HP:0004363,0.113134,Pleural effusion,Abnormal circulating calcium concentration
1967,HP:0002202,HP:0010927,0.112096,Pleural effusion,Abnormal blood inorganic cation concentration
1956,HP:0002202,HP:0031850,0.111859,Pleural effusion,Abnormal hematocrit


## TextHpo -- TextHpo
Their mutual information tells how much they correlate with each other.

In [6]:
# Pairwise mutual information among textHpo terms.
df_mf_textHpo_textHpo = mf_textHpo_textHpo.mf_labeled()

# add labels; the id -> name map is fetched once instead of once per row
hpo_names = hpo.term_id2name_map()
df_mf_textHpo_textHpo['P1_label'] = np.array([hpo_names.get(termid) for termid in df_mf_textHpo_textHpo.P1])
df_mf_textHpo_textHpo['P2_label'] = np.array([hpo_names.get(termid) for termid in df_mf_textHpo_textHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identical, so keep only the P1 < P2 orientation
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[df_mf_textHpo_textHpo.P1 < df_mf_textHpo_textHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms (pairs for which hpo.has_dependency is true)
mask = np.array(
    [hpo.has_dependency(p1, p2) for p1, p2 in zip(df_mf_textHpo_textHpo.P1, df_mf_textHpo_textHpo.P2)],
    dtype=bool,
)
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[~mask, :].reset_index(drop=True)

# sort once; the CSV gets a fresh 0..n-1 index, the display keeps the pre-sort index
df_mf_textHpo_textHpo_ranked = df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False)
df_mf_textHpo_textHpo_ranked.reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_textHpo.csv')
df_mf_textHpo_textHpo_ranked.head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
12173,HP:0001892,HP:0011028,0.706285,Abnormal bleeding,Abnormality of blood circulation
5481,HP:0011947,HP:0012649,0.625522,Respiratory tract infection,Increased inflammatory response
5480,HP:0011947,HP:0012647,0.625481,Respiratory tract infection,Abnormal inflammatory response
3229,HP:0010978,HP:0011947,0.565861,Abnormality of immune system physiology,Respiratory tract infection
11385,HP:0011024,HP:0025033,0.543392,Abnormality of the gastrointestinal tract,Abnormality of digestive system morphology
2329,HP:0002715,HP:0011947,0.481571,Abnormality of the immune system,Respiratory tract infection
1565,HP:0000969,HP:0002103,0.455301,Edema,Abnormality of the pleura
2876,HP:0002103,HP:0011032,0.451888,Abnormality of the pleura,Abnormality of fluid regulation
12690,HP:0011029,HP:0100659,0.450224,Internal hemorrhage,Abnormality of the cerebral vasculature
12524,HP:0011028,HP:0100659,0.447784,Abnormality of blood circulation,Abnormality of the cerebral vasculature


## LabHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [115]:
# Pairwise mutual information among labHpo terms.
df_mf_labHpo_labHpo = mf_labHpo_labHpo.mf_labeled()

# add labels; the id -> name map is fetched once instead of once per row
hpo_names = hpo.term_id2name_map()
df_mf_labHpo_labHpo['P1_label'] = np.array([hpo_names.get(termid) for termid in df_mf_labHpo_labHpo.P1])
df_mf_labHpo_labHpo['P2_label'] = np.array([hpo_names.get(termid) for termid in df_mf_labHpo_labHpo.P2])
# add p values if they exist

# P1-P2 and P2-P1 are identical, so keep only the P1 < P2 orientation
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[df_mf_labHpo_labHpo.P1 < df_mf_labHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms (pairs for which hpo.has_dependency is true)
mask = np.array(
    [hpo.has_dependency(p1, p2) for p1, p2 in zip(df_mf_labHpo_labHpo.P1, df_mf_labHpo_labHpo.P2)],
    dtype=bool,
)
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[~mask, :].reset_index(drop=True)

# sort and re-index once; the same ranked frame is written to CSV and displayed
df_mf_labHpo_labHpo_ranked = df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True)
df_mf_labHpo_labHpo_ranked.to_csv('../../../data/mf_regardless_of_diseases/mf_labHpo_labHpo.csv')
df_mf_labHpo_labHpo_ranked.head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
0,HP:0002157,HP:0031970,0.744493,Azotemia,Abnormal blood urea nitrogen concentration
1,HP:0020061,HP:0031850,0.575781,Abnormal hemoglobin concentration,Abnormal hematocrit
2,HP:0020062,HP:0031851,0.546701,Decreased hemoglobin concentration,Reduced hematocrit
3,HP:0020058,HP:0020061,0.543036,Abnormal red blood cell count,Abnormal hemoglobin concentration
4,HP:0020058,HP:0031850,0.533428,Abnormal red blood cell count,Abnormal hematocrit
5,HP:0020060,HP:0020062,0.514094,Decreased red blood cell count,Decreased hemoglobin concentration
6,HP:0500164,HP:0500165,0.499272,Abnormal blood carbon dioxide level,Abnormal blood oxygen level
7,HP:0020060,HP:0031851,0.48535,Decreased red blood cell count,Reduced hematocrit
8,HP:0020061,HP:0031851,0.433088,Abnormal hemoglobin concentration,Reduced hematocrit
9,HP:0001882,HP:0004332,0.417853,Leukopenia,Abnormal lymphocyte morphology


## Mutual information between textHpo and labHpo with respect to diagnoses

At each admission, patients could receive multiple diagnosis codes. One of them is designated as "primary" (in MIMIC, it has a rank of 1) and the others as secondary (rank 2, 3...). Therefore, the analysis was run under two scenarios: 
1. Only primary diagnosis is considered. 
2. All diagnoses are considered equally. 

Under the first scenario, a patient is considered to be a case only if the corresponding billing code is listed as "primary". Under the second scenario, a patient is considered to be a case when the corresponding billing code is listed as either primary or secondary.   

In [4]:
def mf_dataframes(mf_diagnosis_phenotypes, p_mf_Xz=None, p_mf_Yz=None, p_mf_XY_z=None,
                  p_mf_XY_given_z=None, p_synergy=None, p_mf_XY_omit_z=None):
    """Collect the mutual-information results of one diagnosis into data frames.

    Parameters
    ----------
    mf_diagnosis_phenotypes
        Object exposing ``vars_labels`` (a mapping whose two values are the X
        and Y label sequences, in that order) and the methods
        ``mutual_info_Xz``, ``mutual_info_Yz``, ``mutual_info_XY_z``,
        ``mutual_info_XY_given_z``, ``synergy_XY2z`` and
        ``mutual_info_XY_omit_z``.
    p_mf_Xz, p_mf_Yz
        Optional p-values for the single-phenotype statistics; added as a
        column to the corresponding frame when not None.
    p_mf_XY_z, p_mf_XY_given_z, p_synergy, p_mf_XY_omit_z
        Optional p-values for the pairwise statistics; flattened in row-major
        order and added as columns when not None.

    Returns
    -------
    (df_mf_Xz, df_mf_Yz, df_mf_XY_z)
        One row per X label, one row per Y label, and one row per (X, Y)
        pair with X varying slowest.
    """
    def _flatten(values):
        # Row-major 1-D view, matching the X-slowest / Y-fastest row order below.
        return np.asarray(values).ravel()

    X_labels, Y_labels = mf_diagnosis_phenotypes.vars_labels.values()
    M1 = len(X_labels)
    M2 = len(Y_labels)

    # mutual information between single phenotypes and diagnosis
    df_mf_Xz = pd.DataFrame(data={'X': X_labels, 'mf_Xz': mf_diagnosis_phenotypes.mutual_info_Xz()})
    df_mf_Yz = pd.DataFrame(data={'Y': Y_labels, 'mf_Yz': mf_diagnosis_phenotypes.mutual_info_Yz()})

    # joint and conditional mutual information, and synergy
    mf_XY_z = mf_diagnosis_phenotypes.mutual_info_XY_z()
    mf_XY_given_z = mf_diagnosis_phenotypes.mutual_info_XY_given_z()
    mf_synergy = mf_diagnosis_phenotypes.synergy_XY2z()

    # mutual information between phenotypes without considering diagnosis
    mf_XY_omit_z = mf_diagnosis_phenotypes.mutual_info_XY_omit_z()

    # one row per (X, Y) pair: X repeats M2 times, Y cycles M1 times
    df_mf_XY_z = pd.DataFrame({
        'X': np.repeat(X_labels, M2),
        'Y': np.tile(Y_labels, M1),
        'mf_XY_z': _flatten(mf_XY_z),
        'mf_XY_given_z': _flatten(mf_XY_given_z),
        'synergy': _flatten(mf_synergy),
        'mf_XY_omit_z': _flatten(mf_XY_omit_z),
    })

    # add p values for whichever statistics they were supplied
    if p_mf_Xz is not None:
        df_mf_Xz['p_mf_Xz'] = p_mf_Xz
    if p_mf_Yz is not None:
        df_mf_Yz['p_mf_Yz'] = p_mf_Yz
    pairwise_p_values = (
        ('p_mf_XY_z', p_mf_XY_z),
        ('p_mf_XY_given_z', p_mf_XY_given_z),
        ('p_synergy', p_synergy),
        ('p_mf_XY_omit_z', p_mf_XY_omit_z),
    )
    for column, p_value in pairwise_p_values:
        if p_value is not None:
            df_mf_XY_z[column] = _flatten(p_value)

    return df_mf_Xz, df_mf_Yz, df_mf_XY_z

def filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z, ontology=None):
    """Merge single-phenotype statistics into the pair table and drop redundant pairs.

    Parameters
    ----------
    df_mf_Xz, df_mf_Yz
        Per-phenotype frames keyed by column ``X`` and ``Y`` respectively.
    df_mf_XY_z
        Pair frame with columns ``X`` and ``Y``.
    ontology
        Object with a ``has_dependency(term_a, term_b)`` method. Defaults to
        the notebook-level ``hpo`` instance.

    Returns
    -------
    pandas.DataFrame
        Pairs with ``X < Y`` for which ``has_dependency`` is falsy, left-joined
        with the X and Y statistics and re-indexed from 0.
    """
    if ontology is None:
        ontology = hpo  # fall back to the global HPO object created at the top of the notebook

    df_merged = (
        df_mf_XY_z
        .merge(df_mf_Xz, how='left', on=['X'])
        .merge(df_mf_Yz, how='left', on=['Y'])
    )

    # filter out identical pairs: (a, b) is the same as (b, a); X < Y also drops (a, a)
    df_filtered = df_merged.loc[df_merged.X < df_merged.Y, :].reset_index(drop=True)

    # drop pairs that the ontology reports as dependent
    mask = np.array(
        [ontology.has_dependency(x, y) for x, y in zip(df_filtered.X, df_filtered.Y)],
        dtype=bool,
    )
    return df_filtered.loc[~mask, :].reset_index(drop=True)

def entropy(case, control):
    """Shannon entropy, in bits, of a two-class split.

    Parameters
    ----------
    case, control
        Counts of the two classes (scalars or broadcastable arrays).

    Returns
    -------
    The entropy ``-(p*log2(p) + q*log2(q))`` with ``p = case / (case + control)``
    and ``q = control / (case + control)``. A class with zero count contributes
    0 (the limit of ``p*log2(p)`` as ``p -> 0``) instead of NaN. When both
    counts are zero the result is NaN, because the proportions are undefined.
    """
    case = np.asarray(case, dtype=float)
    control = np.asarray(control, dtype=float)
    total = case + control
    # The masked-out branches of np.where are still evaluated, so silence the
    # divide-by-zero / invalid warnings they would otherwise raise.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_case = case / total
        p_control = control / total
        term_case = np.where(case > 0, p_case * np.log2(p_case), 0.0)
        term_control = np.where(control > 0, p_control * np.log2(p_control), 0.0)
    h = np.where(total > 0, 0.0 - (term_case + term_control), np.nan)
    # [()] unwraps a 0-d array to a scalar and leaves n-d arrays unchanged
    return h[()]
    

def load_p_values(path):
    """Return the object pickled in the file at ``path``.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files produced by this project.
    """
    with open(path, 'rb') as stream:
        return pickle.load(stream)

def _format_percent(x):
    """Format a fraction as a percentage with two decimals and a leading space."""
    return ' {:.2f}%'.format(x * 100)


# Element-wise version of _format_percent for arrays of fractions.
convert_to_percent = np.vectorize(_format_percent)

In [37]:
# Choose the diagnosis scenario; diag_dir is the sub-directory that holds the
# corresponding pre-computed results.
primary_only = True
diag_dir = "primary_only" if primary_only else "primary_and_secondary"

# Load the per-diagnosis summaries for the three kinds of phenotype pairs.
# NOTE: pickle.load can execute arbitrary code; only open project-generated files.
textHpo_labHpo_path = '../../../data/mf_regarding_diseases/{}/summaries_diagnosis_textHpo_labHpo.obj'.format(diag_dir)
with open(textHpo_labHpo_path, 'rb') as f:
    summaries_diagnosis_textHpo_labHpo = pickle.load(f)

textHpo_textHpo_path = '../../../data/mf_regarding_diseases/{}/summaries_diagnosis_textHpo_textHpo.obj'.format(diag_dir)
with open(textHpo_textHpo_path, 'rb') as f:
    summaries_diagnosis_textHpo_textHpo = pickle.load(f)

labHpo_labHpo_path = '../../../data/mf_regarding_diseases/{}/summaries_diagnosis_labHpo_labHpo.obj'.format(diag_dir)
with open(labHpo_labHpo_path, 'rb') as f:
    summaries_diagnosis_labHpo_labHpo = pickle.load(f)

In [38]:
# List the keys of the loaded summaries; the cells below index them with `disease`.
summaries_diagnosis_textHpo_labHpo.keys()

dict_keys(['428', '584', '038', '493'])

### textHpo-labHpo pairs

In [39]:
# Build the textHpo-labHpo result frames for one diagnosis code.
disease = '493'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_labHpo[disease])
p_values = load_p_values(
    '../../../data/mf_regarding_diseases/{}/{}/p_value_textHpo_labHpo_{}_{}.obj'.format(diag_dir, disease, disease, diag_dir)
)

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(
    mf_diagnosis_phenotypes,
    p_values['mf_Xz'],
    p_values['mf_Yz'],
    p_values['mf_XY_z'],
    p_values['mf_XY_given_z'],
    p_values['synergy'],
    p_values['mf_XY_omit_z'],
)

# Flatten the summary's m2 array to one row of 8 integer counts per phenotype pair.
# NOTE(review): the +/- column names presumably encode presence/absence of
# X, Y and the diagnosis -- confirm against the summary class.
s = summaries_diagnosis_textHpo_labHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = s.sum(axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data=s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# extra column holding the row total
s['sum'] = s_sum

# Append the counts column-wise (rows are matched by position), then merge and filter.
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

FileNotFoundError: [Errno 2] No such file or directory: '../../../data/mf_regarding_diseases/primary_only/493/p_value_textHpo_labHpo_493_primary_only.obj'

In [14]:
# Preview the first rows of the merged and filtered pair table.
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,+--,-++,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz
0,HP:0002086,HP:0032180,0.002743,0.109609,0.000876,0.108733,0.0,0.0,0.0,0.0,...,3828,348,4846,98,5107,58976,0.000832,0.0,0.001035,0.0
1,HP:0002086,HP:0012337,0.004215,0.111467,0.000536,0.110931,0.0,0.0,0.0,0.0,...,6218,340,3882,106,6071,58976,0.000832,0.0,0.002847,0.0
2,HP:0002086,HP:0020058,0.003292,0.103277,0.000844,0.102433,0.0,0.0,0.0,0.0,...,8410,299,3385,147,6568,58976,0.000832,0.0,0.001616,0.0
3,HP:0002086,HP:0003111,0.002853,0.100583,0.001111,0.099472,0.0,0.0,0.0,0.0,...,7515,304,3722,142,6231,58976,0.000832,0.0,0.000911,0.0
4,HP:0002086,HP:0031850,0.002826,0.103273,0.000858,0.102414,0.0,0.0,0.0,0.0,...,8177,290,3457,156,6496,58976,0.000832,0.0,0.001136,0.0


In [15]:
# Show which statistics have p-values in the loaded file.
p_values.keys()

dict_keys(['mf_Xz', 'mf_Yz', 'mf_XY_z', 'mf_XY_given_z', 'synergy', 'mf_XY_omit_z'])

In [16]:
# mutual information between textHpo and diagnosis
# mutual information between textHpo and diagnosis
# The id -> name map is fetched once instead of once per row.
hpo_names = hpo.term_id2name_map()
df_mf_textHpo_diagnosis = (
    df_mf_Xz
    .assign(X_label=np.array([hpo_names.get(termid) for termid in df_mf_Xz.X]))
    .sort_values(by='mf_Xz', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'X': 'P', 'mf_Xz': 'mf_P_diag', 'p_mf_Xz': 'p_mf_P_diag', 'X_label': 'P_label'})
)

df_mf_textHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/{}/mf_textHpo_diag_{}.csv'.format(diag_dir,disease))
df_mf_textHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0000001,0.004155,0.0,All
1,HP:0000118,0.004155,0.0,Phenotypic abnormality
2,HP:0002086,0.000832,0.0,Abnormality of the respiratory system
3,HP:0002098,0.000831,0.0,Respiratory distress
4,HP:0002088,0.000544,0.0,Abnormal lung morphology


In [17]:
# mutual information between labHpo and diagnosis
# mutual information between labHpo and diagnosis
# The id -> name map is fetched once instead of once per row.
hpo_names = hpo.term_id2name_map()
df_mf_labHpo_diagnosis = (
    df_mf_Yz
    .assign(Y_label=np.array([hpo_names.get(termid) for termid in df_mf_Yz.Y]))
    .sort_values(by='mf_Yz', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'Y': 'P', 'mf_Yz': 'mf_P_diag', 'p_mf_Yz': 'p_mf_P_diag', 'Y_label': 'P_label'})
)

df_mf_labHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/{}/mf_labHpo_diag_{}.csv'.format(diag_dir, disease))
df_mf_labHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0001939,0.003839,0.0,Abnormality of metabolism/homeostasis
1,HP:0012337,0.002847,0.0,Abnormal homeostasis
2,HP:0011014,0.002267,0.0,Abnormal glucose homeostasis
3,HP:0011015,0.002267,0.0,Abnormal blood glucose concentration
4,HP:0020058,0.001616,0.0,Abnormal red blood cell count


In [18]:
# save synergies
# save synergies
# Attach human-readable labels to the filtered pair table (later cells read
# these columns); the id -> name map is fetched once instead of once per row.
hpo_names = hpo.term_id2name_map()
df_mf_XY_z_filtered['P1_label'] = np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.Y])

synergy_columns = ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'synergy', 'p_mf_Xz', 'p_mf_Yz', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum']
(
    df_mf_XY_z_filtered
    .rename(columns={'X':'P1', 'Y': 'P2'})
    .sort_values(by=['synergy'], ascending=False)
    .reset_index(drop=True)
    .loc[:, synergy_columns]
    .rename(columns={'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'})
    .to_csv('../../../data/mf_regarding_diseases/{}/df_synergy_textHpo_labHpo_{}.csv'.format(diag_dir, disease))
)

In [19]:
# save ratios between conditional mutual info and overall mutual info
# save ratios between conditional mutual info and overall mutual info
# mf_ratio = MI(P1; P2 | diagnosis) / MI(P1; P2), ranked from largest to smallest
df_mf_vs_conditional_mf = (
    df_mf_XY_z_filtered
    .assign(mf_ratio=df_mf_XY_z_filtered.mf_XY_given_z / df_mf_XY_z_filtered.mf_XY_omit_z)
    .sort_values(by='mf_ratio', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'X': 'P1', 'Y': 'P2'})
)

# column names used in the exported CSV
ratio_csv_names = {
    'X': 'P1', 'Y': 'P2',
    'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag',
    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag',
}
ratio_csv_columns = ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']
(
    df_mf_vs_conditional_mf
    .rename(columns=ratio_csv_names)
    .loc[:, ratio_csv_columns]
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_ratio_textHpo_labHpo_{}.csv'.format(diag_dir, disease))
)
df_mf_vs_conditional_mf.head(n=10)

Unnamed: 0,P1,P2,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0002107,HP:0031965,0.000208,0.002613,0.000149,0.002464,0.000437,0.0,0.000375,0.0,...,1911,31718,58976,9e-06,0.882187,5e-05,0.046312,Pneumothorax,Increased RBC distribution width,1.060586
1,HP:0002094,HP:0025546,0.000916,0.010196,0.000543,0.009654,0.0,0.0,0.0,0.0,...,1451,25611,58976,0.00037,0.0,3e-06,0.573875,Dyspnea,Abnormal mean corpuscular hemoglobin concentra...,1.056198
2,HP:0002098,HP:0031851,0.001683,0.009019,0.000357,0.008662,0.0,0.0,0.0,0.0,...,891,18411,58976,0.000831,0.0,0.000494,0.0,Respiratory distress,Reduced hematocrit,1.041258
3,HP:0002086,HP:0025546,0.002205,0.035473,0.00137,0.034103,0.0,0.0,0.0,0.0,...,265,7948,58976,0.000832,0.0,3e-06,0.573875,Abnormality of the respiratory system,Abnormal mean corpuscular hemoglobin concentra...,1.040166
4,HP:0002098,HP:0020060,0.001297,0.010772,0.000406,0.010365,0.0,0.0,0.0,0.0,...,1356,24625,58976,0.000831,0.0,5.9e-05,0.027937,Respiratory distress,Decreased red blood cell count,1.039192
5,HP:0011458,HP:0025546,0.000485,0.01052,0.000353,0.010166,0.0,0.0,0.0,0.0,...,1488,25449,58976,0.000129,0.000938,3e-06,0.573875,Abdominal symptom,Abnormal mean corpuscular hemoglobin concentra...,1.034748
6,HP:0002795,HP:0025546,0.000802,0.018817,0.000625,0.018193,0.0,0.0,0.0,0.0,...,1063,19601,58976,0.000174,6.2e-05,3e-06,0.573875,Functional respiratory abnormality,Abnormal mean corpuscular hemoglobin concentra...,1.034332
7,HP:0002098,HP:0020062,0.001396,0.00958,0.000316,0.009264,0.0,0.0,0.0,0.0,...,1185,22633,58976,0.000831,0.0,0.000249,0.0,Respiratory distress,Decreased hemoglobin concentration,1.034076
8,HP:0001626,HP:0025546,0.001216,0.028766,0.000945,0.02782,0.0,0.0,0.0,0.0,...,616,12990,58976,0.000267,0.0,3e-06,0.573875,Abnormality of the cardiovascular system,Abnormal mean corpuscular hemoglobin concentra...,1.033985
9,HP:0011793,HP:0025546,0.000512,0.011683,0.000381,0.011302,0.0,0.0,0.0,0.0,...,1406,24294,58976,0.000128,0.000687,3e-06,0.573875,Neoplasm by anatomical site,Abnormal mean corpuscular hemoglobin concentra...,1.033727


In [20]:
# Export the top `percentile` fraction of pairs, ranked by synergy, as Cytoscape edges.
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

# The id -> name map is fetched once instead of once per row.
hpo_names = hpo.term_id2name_map()
df_4_cytoscape = (
    df_mf_XY_z_filtered
    .rename(columns={'X':'P1', 'Y': 'P2'})
    .assign(P1_label=np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.X]))
    .assign(P2_label=np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.Y]))
    .sort_values(by='synergy', ascending=False)
    # prefix node ids with their source so text- and lab-derived terms stay distinct
    .assign(P1=lambda x: 'Rad_' + x['P1'])
    .assign(P2=lambda x: 'Lab_' + x['P2'])
    .head(n=n)
)

# make sure the output directory exists before writing into it
os.makedirs('../../../data/mf_regarding_diseases/{}/cytoscape'.format(diag_dir), exist_ok=True)

# edges
(
    df_4_cytoscape
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']]
    .to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/edges_textHpo_labHpo_{}.csv'.format(diag_dir, disease))
)

In [21]:
# nodes
# nodes
# Stack the P1 nodes (type 'Rad') on top of the P2 nodes (type 'Lab') and keep
# each distinct (id, label, type) row once.
edge_count = len(df_4_cytoscape)
node_ids = np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2])
node_labels = np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label])
node_types = np.repeat(['Rad', 'Lab'], edge_count)
nodes = pd.DataFrame(data={'term_id': node_ids,
                           'term_label': node_labels,
                           'type': node_types}).drop_duplicates()
nodes.reset_index(drop=True).to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/nodes_textHpo_labHpo_{}.csv'.format(diag_dir, disease))

### labHpo-labHpo pairs

In [22]:
# Build the labHpo-labHpo result frames for one diagnosis code.
disease = '493'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_labHpo_labHpo[disease])
p_values = load_p_values(
    '../../../data/mf_regarding_diseases/{}/{}/p_value_labHpo_labHpo_{}_{}.obj'.format(diag_dir, disease, disease, diag_dir)
)

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(
    mf_diagnosis_phenotypes,
    p_values['mf_Xz'],
    p_values['mf_Yz'],
    p_values['mf_XY_z'],
    p_values['mf_XY_given_z'],
    p_values['synergy'],
    p_values['mf_XY_omit_z'],
)

# Flatten the summary's m2 array to one row of 8 integer counts per phenotype pair.
# NOTE(review): the +/- column names presumably encode presence/absence of
# X, Y and the diagnosis -- confirm against the summary class.
s = summaries_diagnosis_labHpo_labHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = s.sum(axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data=s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# extra column holding the row total
s['sum'] = s_sum

# Append the counts column-wise (rows are matched by position), then merge and filter.
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [23]:
# Attach human-readable labels (later cells read these columns); the
# id -> name map is fetched once instead of once per row.
hpo_names = hpo.term_id2name_map()
df_mf_XY_z_filtered['P1_label'] = np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.Y])

# Rank pairs by their joint mutual information with the diagnosis.
df_mf_joint_vs_individual = (
    df_mf_XY_z_filtered
    .sort_values(by='mf_XY_z', ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): p_mf_Xz / p_mf_Yz are exported under their original names here,
# while the other exports rename them to p_mf_P1_diag / p_mf_P2_diag. Left
# unchanged in case downstream readers depend on these headers -- confirm.
(
    df_mf_joint_vs_individual
    .loc[:, ['X', 'Y', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'p_mf_Xz', 'p_mf_Yz', 'P1_label', 'P2_label']]
    .rename(columns={'X': 'P1', 'Y': 'P2', 'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag'})
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_joint_vs_individual_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
)
df_mf_joint_vs_individual.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0001872,HP:0001939,0.004497,0.06556,0.00052,0.06504,0.0,0.0,0.0,0.0,...,29746,98,6060,58976,0.000138,0.00075,0.003839,0.0,Abnormal thrombocyte morphology,Abnormality of metabolism/homeostasis
1,HP:0001939,HP:0011873,0.004497,0.06556,0.00052,0.06504,0.0,0.0,0.0,0.0,...,40,98,6060,58976,0.003839,0.0,0.000138,0.000625,Abnormality of metabolism/homeostasis,Abnormal platelet count
2,HP:0011014,HP:0020058,0.004447,0.30116,0.000565,0.300595,0.0,0.0,0.0,0.0,...,4922,354,12180,58976,0.002267,0.0,0.001616,0.0,Abnormal glucose homeostasis,Abnormal red blood cell count
3,HP:0011015,HP:0020058,0.004447,0.30116,0.000565,0.300595,0.0,0.0,0.0,0.0,...,4922,354,12180,58976,0.002267,0.0,0.001616,0.0,Abnormal blood glucose concentration,Abnormal red blood cell count
4,HP:0012337,HP:0031850,0.004356,0.255591,0.000373,0.255218,0.0,0.0,0.0,0.0,...,2593,236,9696,58976,0.002847,0.0,0.001136,0.0,Abnormal homeostasis,Abnormal hematocrit


In [24]:
# Entropy (in bits) of the case/control split for the current diagnosis.
entropy(mf_diagnosis_phenotypes.case_N, mf_diagnosis_phenotypes.control_N) 

0.3258570694530306

In [25]:
# Rank the labHpo-labHpo pairs by synergy once; the CSV gets a fresh
# 0..n-1 index, the preview keeps the pre-sort index.
df_by_synergy = df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False)

synergy_csv_columns = ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---']
(
    df_by_synergy
    .reset_index(drop=True)
    .rename(columns={'X': 'P1', 'Y': 'P2','mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'})
    .loc[:, synergy_csv_columns]
    .to_csv('../../../data/mf_regarding_diseases/{}/df_synergy_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
)
df_by_synergy.head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
506,HP:0020062,HP:0020063,0.001586,0.008273,0.001325,0.006948,0.0,0.0,0.0,0.0,...,9170,789,16377,58976,0.000249,0.0,1.2e-05,0.973375,Decreased hemoglobin concentration,Increased hemoglobin concentration
247,HP:0010929,HP:0012337,0.004283,0.283318,0.001264,0.282054,0.0,0.0,0.0,0.0,...,7373,329,10992,58976,0.000172,0.000188,0.002847,0.0,Abnormal blood cation concentration,Abnormal homeostasis
133,HP:0020058,HP:0025546,0.002785,0.104413,0.001166,0.103247,0.0,0.0,0.0,0.0,...,2002,492,12976,58976,0.001616,0.0,3e-06,0.56725,Abnormal red blood cell count,Abnormal mean corpuscular hemoglobin concentra...
966,HP:0003138,HP:0031850,0.002263,0.147619,0.001121,0.146498,0.0,0.0,0.0,0.0,...,17747,560,13464,58976,6e-06,0.7235,0.001136,0.0,Increased blood urea nitrogen,Abnormal hematocrit
965,HP:0003138,HP:0020058,0.002713,0.161694,0.001091,0.160603,0.0,0.0,0.0,0.0,...,17323,550,13888,58976,6e-06,0.7235,0.001616,0.0,Increased blood urea nitrogen,Abnormal red blood cell count
616,HP:0020060,HP:0020063,0.001161,0.012504,0.00109,0.011414,0.0,0.0,0.0,0.0,...,9749,950,18073,58976,5.9e-05,0.0275,1.2e-05,0.973375,Decreased red blood cell count,Increased hemoglobin concentration
1001,HP:0025546,HP:0031850,0.002224,0.103692,0.001085,0.102607,0.0,0.0,0.0,0.0,...,18573,517,12749,58976,3e-06,0.569375,0.001136,0.0,Abnormal mean corpuscular hemoglobin concentra...,Abnormal hematocrit
728,HP:0002901,HP:0011014,0.003348,0.214282,0.001069,0.213213,0.0,0.0,0.0,0.0,...,12730,605,15403,58976,1.3e-05,0.993563,0.002267,0.0,Hypocalcemia,Abnormal glucose homeostasis
729,HP:0002901,HP:0011015,0.003348,0.214282,0.001069,0.213213,0.0,0.0,0.0,0.0,...,12730,605,15403,58976,1.3e-05,0.993563,0.002267,0.0,Hypocalcemia,Abnormal blood glucose concentration
132,HP:0020058,HP:0031970,0.002704,0.176991,0.00106,0.175931,0.0,0.0,0.0,0.0,...,1175,542,13803,58976,0.001616,0.0,2.9e-05,0.123687,Abnormal red blood cell count,Abnormal blood urea nitrogen concentration


In [26]:
# Preview the first rows of the filtered labHpo-labHpo pair table, in its unsorted order.
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0001871,HP:0001939,0.004046,0.078262,-0.000275,0.078537,0.0,0.0,0.0,0.0,...,552,49,1826,58976,0.000483,0.0,0.003839,0.0,Abnormality of blood and blood-forming tissues,Abnormality of metabolism/homeostasis
1,HP:0001871,HP:0032180,0.001196,0.071541,-0.000322,0.071863,0.0,0.0,0.0,0.0,...,398,68,1980,58976,0.000483,0.0,0.001035,0.0,Abnormality of blood and blood-forming tissues,Abnormal circulating metabolite concentration
2,HP:0001871,HP:0012337,0.002898,0.072351,-0.000432,0.072783,0.0,0.0,0.0,0.0,...,184,62,2194,58976,0.000483,0.0,0.002847,0.0,Abnormality of blood and blood-forming tissues,Abnormal homeostasis
3,HP:0001871,HP:0003111,0.001075,0.072144,-0.000319,0.072462,0.0,0.0,0.0,0.0,...,123,74,2255,58976,0.000483,0.0,0.000911,0.0,Abnormality of blood and blood-forming tissues,Abnormal blood ion concentration
4,HP:0001871,HP:0011014,0.002343,0.063991,-0.000407,0.064398,0.0,0.0,0.0,0.0,...,75,74,2303,58976,0.000483,0.0,0.002267,0.0,Abnormality of blood and blood-forming tissues,Abnormal glucose homeostasis


In [27]:
# Ratio between conditional mutual info and overall mutual info:
# mf_ratio = MI(P1; P2 | diagnosis) / MI(P1; P2), ranked from largest to smallest.
df_mf_vs_conditional_mf = (
    df_mf_XY_z_filtered
    .assign(mf_ratio=df_mf_XY_z_filtered.mf_XY_given_z / df_mf_XY_z_filtered.mf_XY_omit_z)
    .sort_values(by='mf_ratio', ascending=False)
    .reset_index(drop=True)
)

# column names used in the exported CSV
ratio_csv_names = {
    'X': 'P1', 'Y': 'P2',
    'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag',
    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag',
}
ratio_csv_columns = ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']
(
    df_mf_vs_conditional_mf
    .rename(columns=ratio_csv_names)
    .loc[:, ratio_csv_columns]
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_ratio_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
)
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0020059,HP:0020060,0.001481,0.00106,0.000943,0.000117,0.0,0.0,0.0,0.001625,...,784,16542,58976,0.000478,0.0,5.9e-05,0.027062,Increased red blood cell count,Decreased red blood cell count,9.08173
1,HP:0020062,HP:0020063,0.001586,0.008273,0.001325,0.006948,0.0,0.0,0.0,0.0,...,789,16377,58976,0.000249,0.0,1.2e-05,0.973375,Decreased hemoglobin concentration,Increased hemoglobin concentration,1.190746
2,HP:0020060,HP:0020063,0.001161,0.012504,0.00109,0.011414,0.0,0.0,0.0,0.0,...,950,18073,58976,5.9e-05,0.0275,1.2e-05,0.973375,Decreased red blood cell count,Increased hemoglobin concentration,1.095522
3,HP:0020063,HP:0031965,0.000454,0.004641,0.000393,0.004248,0.0,0.0,0.0,0.0,...,1320,22927,58976,1.2e-05,0.976562,5e-05,0.045438,Increased hemoglobin concentration,Increased RBC distribution width,1.092491
4,HP:0020059,HP:0020062,0.001499,0.010138,0.000772,0.009365,0.0,0.0,0.0,0.0,...,767,16577,58976,0.000478,0.0,0.000249,0.0,Increased red blood cell count,Decreased hemoglobin concentration,1.082462


In [28]:
# Export the top `percentile` fraction of pairs, ranked by synergy, as Cytoscape edges.
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

# The id -> name map is fetched once instead of once per row.
hpo_names = hpo.term_id2name_map()
df_4_cytoscape = (
    df_mf_XY_z_filtered
    .rename(columns={'X':'P1', 'Y': 'P2'})
    .assign(P1_label=np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.X]))
    .assign(P2_label=np.array([hpo_names.get(termid) for termid in df_mf_XY_z_filtered.Y]))
    .sort_values(by='synergy', ascending=False)
    # both endpoints are lab-derived terms, so both get the 'Lab_' prefix
    .assign(P1=lambda x: 'Lab_' + x['P1'])
    .assign(P2=lambda x: 'Lab_' + x['P2'])
    .head(n=n)
)

# make sure the output directory exists before writing into it
os.makedirs('../../../data/mf_regarding_diseases/{}/cytoscape'.format(diag_dir), exist_ok=True)

# edges
(
    df_4_cytoscape
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']]
    .to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/edges_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
)

In [29]:
# nodes
# nodes
# Stack the P1 and P2 endpoints, keep each distinct (id, label) row once and
# tag every node as 'Lab'.
node_ids = np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2])
node_labels = np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label])
nodes = pd.DataFrame(data={'term_id': node_ids, 'term_label': node_labels}).drop_duplicates()
nodes['type'] = 'Lab'
nodes.to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/nodes_labHpo_labHpo_{}.csv'.format(diag_dir, disease))

### textHpo-textHpo pairs

In [30]:
# Build the textHpo-textHpo result frames for one diagnosis code.
disease = '493'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_textHpo[disease])
p_values = load_p_values(
    '../../../data/mf_regarding_diseases/{}/{}/p_value_textHpo_textHpo_{}_{}.obj'.format(diag_dir, disease, disease, diag_dir)
)

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(
    mf_diagnosis_phenotypes,
    p_values['mf_Xz'],
    p_values['mf_Yz'],
    p_values['mf_XY_z'],
    p_values['mf_XY_given_z'],
    p_values['synergy'],
    p_values['mf_XY_omit_z'],
)

# Flatten the summary's m2 array to one row of 8 integer counts per phenotype pair.
# NOTE(review): the +/- column names presumably encode presence/absence of
# X, Y and the diagnosis -- confirm against the summary class.
s = summaries_diagnosis_textHpo_textHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = s.sum(axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data=s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# extra column holding the row total
s['sum'] = s_sum

# Append the counts column-wise (rows are matched by position), then merge and filter.
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [31]:
# Attach human-readable HPO labels to the filtered pairs. Later cells select
# P1_label / P2_label from df_mf_XY_z_filtered, so the columns are added in place.
# The id -> name lookup is built once rather than once per row.
# NOTE(review): assumes hpo.term_id2name_map() yields an equivalent mapping on
# every call -- confirm against hpoutil.
term_names = hpo.term_id2name_map()
df_mf_XY_z_filtered['P1_label'] = np.array([term_names.get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([term_names.get(termid) for termid in df_mf_XY_z_filtered.Y])

# Joint vs. individual mutual information with the diagnosis, strongest joint
# value first, with X/Y/z renamed to P1/P2/diag for the exported table.
df_mf_joint_vs_individual = (
    df_mf_XY_z_filtered
    .sort_values(by='mf_XY_z', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'X': 'P1', 'Y': 'P2',
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag',
                     'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag'})
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'mf_P1P2_diag','p_mf_P1_diag', 'p_mf_P2_diag', 'p_mf_P1P2_diag', 'P1_label', 'P2_label']]
)

df_mf_joint_vs_individual.to_csv('../../../data/mf_regarding_diseases/{}/df_mf_joint_vs_individual_textHpo_textHpo_{}.csv'.format(diag_dir, disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,P1,P2,mf_P1_diag,mf_P2_diag,mf_P1P2_diag,p_mf_P1_diag,p_mf_P2_diag,p_mf_P1P2_diag,P1_label,P2_label
0,HP:0001626,HP:0002086,0.000267,0.000832,0.001822,0.0,0.0,0.0,Abnormality of the cardiovascular system,Abnormality of the respiratory system
1,HP:0002086,HP:0025031,0.000832,0.000192,0.00181,0.0,0.000125,0.0,Abnormality of the respiratory system,Abnormality of the digestive system
2,HP:0001626,HP:0002088,0.000267,0.000544,0.001462,0.0,0.0,0.0,Abnormality of the cardiovascular system,Abnormal lung morphology
3,HP:0002086,HP:0030680,0.000832,0.000149,0.001415,0.0,0.000563,0.0,Abnormality of the respiratory system,Abnormality of cardiovascular system morphology
4,HP:0001626,HP:0012252,0.000267,0.000523,0.001405,0.0,0.0,0.0,Abnormality of the cardiovascular system,Abnormal respiratory system morphology


In [32]:
# Write the synergy table (highest synergy first) with P1/P2/diag column names,
# then preview the 20 most synergistic pairs from the un-renamed frame.
synergy_renames = {
    'X': 'P1', 'Y': 'P2',
    'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag',
    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag',
}
synergy_columns = [
    'P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy',
    'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy',
    'P1_label', 'P2_label',
    '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum',
]
(
    df_mf_XY_z_filtered
    .sort_values(by='synergy', ascending=False)
    .reset_index(drop=True)
    .rename(columns=synergy_renames)
    .loc[:, synergy_columns]
    .to_csv('../../../data/mf_regarding_diseases/{}/df_synergy_textHpo_textHpo_{}.csv'.format(diag_dir, disease))
)
df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False).head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
2,HP:0002086,HP:0025031,0.00181,0.062216,0.000786,0.06143,0.0,0.0,0.0,0.0,...,2176,263,7777,58976,0.000832,0.0,0.000192,0.000125,Abnormality of the respiratory system,Abnormality of the digestive system
196,HP:0001626,HP:0002086,0.001822,0.085992,0.000723,0.085269,0.0,0.0,0.0,0.0,...,11413,219,6937,58976,0.000267,0.0,0.000832,0.0,Abnormality of the cardiovascular system,Abnormality of the respiratory system
48,HP:0012252,HP:0025031,0.001392,0.064586,0.000677,0.063908,0.0,0.0,0.0,0.0,...,2904,352,8945,58976,0.000523,0.0,0.000192,0.000125,Abnormal respiratory system morphology,Abnormality of the digestive system
198,HP:0001626,HP:0002088,0.001462,0.086665,0.000651,0.086014,0.0,0.0,0.0,0.0,...,10494,287,7856,58976,0.000267,0.0,0.000544,0.0,Abnormality of the cardiovascular system,Abnormal lung morphology
550,HP:0000924,HP:0100750,0.000952,0.051986,0.000638,0.051348,0.0,0.0,0.0,0.0,...,9522,1002,19326,58976,0.000206,0.0,0.000108,0.003,Abnormality of the skeletal system,Atelectasis
69,HP:0002088,HP:0025031,0.001368,0.064917,0.000632,0.064285,0.0,0.0,0.0,0.0,...,2941,356,9013,58976,0.000544,0.0,0.000192,0.000125,Abnormal lung morphology,Abnormality of the digestive system
197,HP:0001626,HP:0012252,0.001405,0.085616,0.000615,0.085001,0.0,0.0,0.0,0.0,...,10565,287,7785,58976,0.000267,0.0,0.000523,0.0,Abnormality of the cardiovascular system,Abnormal respiratory system morphology
346,HP:0025031,HP:0100750,0.000872,0.054939,0.000572,0.054367,0.0,0.0,0.0,0.0,...,8048,916,17790,58976,0.000192,0.000125,0.000108,0.003,Abnormality of the digestive system,Atelectasis
797,HP:0011842,HP:0100750,0.000729,0.045068,0.000491,0.044577,0.0,0.0,0.0,0.0,...,12313,1186,21838,58976,0.00013,0.00125,0.000108,0.003,Abnormality of skeletal morphology,Atelectasis
1220,HP:0002090,HP:0011032,0.001179,0.046704,0.000471,0.046233,0.0,0.0,0.0,0.0,...,21788,799,16420,58976,0.000324,0.0,0.000384,0.0,Pneumonia,Abnormality of fluid regulation


In [33]:
# NOTE(review): df_mf_vs_conditional_mf is only rebuilt from the textHpo-textHpo
# frame in the next cell, so on a top-to-bottom run this shows whatever the name
# was bound to earlier. The saved output lists blood-count terms with an
# mf_ratio column, which looks like the earlier labHpo-labHpo result -- confirm,
# and consider moving this preview after the assignment.
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0020059,HP:0020060,0.001481,0.00106,0.000943,0.000117,0.0,0.0,0.0,0.001625,...,784,16542,58976,0.000478,0.0,5.9e-05,0.027062,Increased red blood cell count,Decreased red blood cell count,9.08173
1,HP:0020062,HP:0020063,0.001586,0.008273,0.001325,0.006948,0.0,0.0,0.0,0.0,...,789,16377,58976,0.000249,0.0,1.2e-05,0.973375,Decreased hemoglobin concentration,Increased hemoglobin concentration,1.190746
2,HP:0020060,HP:0020063,0.001161,0.012504,0.00109,0.011414,0.0,0.0,0.0,0.0,...,950,18073,58976,5.9e-05,0.0275,1.2e-05,0.973375,Decreased red blood cell count,Increased hemoglobin concentration,1.095522
3,HP:0020063,HP:0031965,0.000454,0.004641,0.000393,0.004248,0.0,0.0,0.0,0.0,...,1320,22927,58976,1.2e-05,0.976562,5e-05,0.045438,Increased hemoglobin concentration,Increased RBC distribution width,1.092491
4,HP:0020059,HP:0020062,0.001499,0.010138,0.000772,0.009365,0.0,0.0,0.0,0.0,...,767,16577,58976,0.000478,0.0,0.000249,0.0,Increased red blood cell count,Decreased hemoglobin concentration,1.082462


In [34]:
# Ratio of mf_XY_given_z to mf_XY_omit_z for every filtered pair, largest
# ratio first.
df_mf_vs_conditional_mf = (
    df_mf_XY_z_filtered
    .assign(mf_ratio=lambda frame: frame.mf_XY_given_z / frame.mf_XY_omit_z)
    .sort_values(by='mf_ratio', ascending=False)
    .reset_index(drop=True)
)

# Export with P1/P2/diag column names; the in-memory frame keeps X/Y/z names.
ratio_renames = {
    'X': 'P1', 'Y': 'P2',
    'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag',
    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag',
}
ratio_columns = [
    'P1', 'P2',
    'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag',
    'mf_ratio', 'P1_label', 'P2_label',
    '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum',
    'synergy', 'p_synergy',
]
(
    df_mf_vs_conditional_mf
    .rename(columns=ratio_renames)
    .loc[:, ratio_columns]
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_ratio_textHpo_textHpo_{}.csv'.format(diag_dir, disease))
)
df_mf_vs_conditional_mf.head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0002098,HP:0002617,0.001037,0.0087,0.000183,0.008518,0.0,0.0,0.000125,0.0,...,2346,39544,58976,0.000831,0.0,2.3e-05,0.4505,Respiratory distress,Dilatation,1.021468
1,HP:0002107,HP:0410042,0.000271,0.007761,0.000154,0.007607,6.2e-05,0.0,0.000188,0.0,...,2334,38504,58976,9e-06,0.884188,0.000107,0.003563,Pneumothorax,Abnormal liver morphology,1.020296
2,HP:0001945,HP:0002107,9.6e-05,0.003974,6e-05,0.003915,0.040188,0.0,0.010875,0.0,...,2448,39610,58976,2.7e-05,0.141625,9e-06,0.884125,Fever,Pneumothorax,1.015233
3,HP:0002098,HP:0410042,0.001068,0.008804,0.00013,0.008674,0.0,0.0,0.000625,0.0,...,2233,38993,58976,0.000831,0.0,0.000107,0.003563,Respiratory distress,Abnormal liver morphology,1.014965
4,HP:0001627,HP:0002107,0.000326,0.014761,0.000217,0.014544,0.000188,0.0,0.0,0.0,...,1845,31126,58976,9.9e-05,0.004875,9e-06,0.884125,Abnormal heart morphology,Pneumothorax,1.014948
5,HP:0000924,HP:0002098,0.001317,0.019068,0.00028,0.018788,0.0,0.0,0.0,0.0,...,1399,25894,58976,0.000206,0.0,0.000831,0.0,Abnormality of the skeletal system,Respiratory distress,1.014905
6,HP:0002098,HP:0002107,0.000941,0.006906,0.000101,0.006806,0.0,0.0,0.001625,0.0,...,2303,39410,58976,0.000831,0.0,9e-06,0.884125,Respiratory distress,Pneumothorax,1.014775
7,HP:0002242,HP:0002835,0.00028,0.012797,0.000186,0.012611,6.2e-05,0.0,0.000125,0.0,...,2527,41268,58976,9.1e-05,0.006062,3e-06,0.556812,Abnormal intestine morphology,Aspiration,1.014749
8,HP:0002835,HP:0030680,0.000492,0.023709,0.00034,0.023369,0.0,0.0,0.0,0.0,...,1341,23518,58976,3e-06,0.556438,0.000149,0.000563,Aspiration,Abnormality of cardiovascular system morphology,1.014541
9,HP:0001626,HP:0002835,0.000599,0.023775,0.000329,0.023446,0.0,0.0,0.0,0.0,...,918,17098,58976,0.000267,0.0,3e-06,0.556812,Abnormality of the cardiovascular system,Aspiration,1.014022


In [35]:
# save data for cytoscape
# Only the top 1% of the filtered textHpo-textHpo pairs, ranked by synergy, are kept.
# Note: with fewer than 100 filtered pairs, n is 0 and the export is empty.
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

# Build the HPO id -> name lookup once instead of once per row.
# NOTE(review): assumes hpo.term_id2name_map() yields an equivalent mapping on
# every call -- confirm against hpoutil.
term_names = hpo.term_id2name_map()

df_4_cytoscape = (
    df_mf_XY_z_filtered
    .rename(columns={'X':'P1', 'Y': 'P2'})
    # labels are looked up from the un-renamed X / Y columns, row order unchanged
    .assign(P1_label = np.array([term_names.get(termid) for termid in df_mf_XY_z_filtered.X]))
    .assign(P2_label = np.array([term_names.get(termid) for termid in df_mf_XY_z_filtered.Y]))
    .sort_values(by='synergy', ascending=False)
    # prefix every term id with 'Rad_'
    .assign(P1 = lambda x: 'Rad_' + x['P1'])
    .assign(P2 = lambda x: 'Rad_' + x['P2'])
    .head(n = n)
)

# edges
(
    df_4_cytoscape
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']]
    .to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/edges_textHpo_textHpo_{}.csv'.format(diag_dir, disease))
)

In [36]:
# Cytoscape node table for the text network: every distinct (id, label) pair
# that occurs on either end of an exported edge, tagged with type 'Rad'.
node_ids = np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2])
node_labels = np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label])
nodes = (
    pd.DataFrame(data={'term_id': node_ids, 'term_label': node_labels})
    .drop_duplicates()
    .assign(type='Rad')
)
nodes.to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/nodes_textHpo_textHpo_{}.csv'.format(diag_dir, disease))