# Phenotype Synergy Analysis

This notebook contains code to interpret the results of the synergy score analysis. 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
from obonet.ontology import Ontology
import pickle

In [3]:
# Load the Human Phenotype Ontology used throughout this notebook for term names
# and dependency checks. The location can be overridden with the HPO_OBO_PATH
# environment variable; otherwise the original author's local checkout is used.
hpo = hpoutil.HPO(os.environ.get('HPO_OBO_PATH', '/Users/zhangx/git/human-phenotype-ontology/hp.obo'))
#TODO: replace the above with the new onbonet tool: https://github.com/kingmanzhang/obonet
#hpo2 = Ontology('/Users/zhangx/git/human-phenotype-ontology/hp.obo')

# Mutual information without considering diagnosis

This section analyzes the mutual information between phenotype pairs (labHpo-labHpo, textHpo-labHpo, textHpo-textHpo) regardless of diagnosis. 

Note that the same information is also produced when we calculate mutual information with respect to a disease. 

In [3]:
# Load the three pickled pairwise summaries (text-lab, text-text, lab-lab) that
# were computed without regard to diagnosis.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../../../data/mf_regardless_of_diseases/summary_textHpo_labHpo.obj', 'rb') as f:
    summary_textHpo_labHpo = pickle.load(f)
with open('../../../data/mf_regardless_of_diseases/summary_textHpo_textHpo.obj', 'rb') as f:
    summary_textHpo_textHpo = pickle.load(f)  
with open('../../../data/mf_regardless_of_diseases/summary_labHpo_labHpo.obj', 'rb') as f:
    summary_labHpo_labHpo = pickle.load(f)

In [4]:
# Wrap each loaded summary in a MutualInfoXY object (one per pair type).
mf_textHpo_labHpo, mf_textHpo_textHpo, mf_labHpo_labHpo = (
    mf.MutualInfoXY(summary)
    for summary in (summary_textHpo_labHpo, summary_textHpo_textHpo, summary_labHpo_labHpo)
)

## TextHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [5]:
df_mf_textHpo_labHpo = mf_textHpo_labHpo.mf_labeled()
# add labels (the id -> name map is fetched once per column rather than once per row)
df_mf_textHpo_labHpo['P1_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_textHpo_labHpo.P1)))
df_mf_textHpo_labHpo['P2_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_textHpo_labHpo.P2)))
# add p values if they exist

# P1-P2 and P2-P1 are identical, so keep only the ordering with P1 < P2
# NOTE(review): here P1 and P2 presumably come from different sources (text vs. lab),
# in which case (a, b) and (b, a) are not the same pair and this filter drops every
# pair whose text term id sorts after its lab term id -- confirm this is intended.
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[df_mf_textHpo_labHpo.P1 < df_mf_textHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms (pairs for which hpo.has_dependency is truthy)
mask = np.array([hpo.has_dependency(p1, p2) for p1, p2 in zip(df_mf_textHpo_labHpo.P1, df_mf_textHpo_labHpo.P2)])
df_mf_textHpo_labHpo = df_mf_textHpo_labHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
#df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_labHpo.csv')
df_mf_textHpo_labHpo.sort_values(by='mf', ascending=False).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
1968,HP:0002202,HP:0020062,0.12566,Pleural effusion,Decreased hemoglobin concentration
1958,HP:0002202,HP:0020061,0.121226,Pleural effusion,Abnormal hemoglobin concentration
1959,HP:0002202,HP:0011015,0.12101,Pleural effusion,Abnormal blood glucose concentration
1960,HP:0002202,HP:0011014,0.12101,Pleural effusion,Abnormal glucose homeostasis
1962,HP:0002202,HP:0031851,0.117206,Pleural effusion,Reduced hematocrit
1957,HP:0002202,HP:0020058,0.115497,Pleural effusion,Abnormal red blood cell count
1961,HP:0002202,HP:0010929,0.115105,Pleural effusion,Abnormal blood cation concentration
1972,HP:0002202,HP:0004363,0.113134,Pleural effusion,Abnormal circulating calcium concentration
1967,HP:0002202,HP:0010927,0.112096,Pleural effusion,Abnormal blood inorganic cation concentration
1956,HP:0002202,HP:0031850,0.111859,Pleural effusion,Abnormal hematocrit


## TextHpo -- TextHpo
Their mutual information tells how much they correlate with each other.

In [6]:
df_mf_textHpo_textHpo = mf_textHpo_textHpo.mf_labeled()
# add labels (the id -> name map is fetched once per column rather than once per row)
df_mf_textHpo_textHpo['P1_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_textHpo_textHpo.P1)))
df_mf_textHpo_textHpo['P2_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_textHpo_textHpo.P2)))
# add p values if they exist

# P1-P2 and P2-P1 are identical, so keep only the ordering with P1 < P2
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[df_mf_textHpo_textHpo.P1 < df_mf_textHpo_textHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms (pairs for which hpo.has_dependency is truthy)
mask = np.array([hpo.has_dependency(p1, p2) for p1, p2 in zip(df_mf_textHpo_textHpo.P1, df_mf_textHpo_textHpo.P2)])
df_mf_textHpo_textHpo = df_mf_textHpo_textHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
# export the full ranking, then display the 50 pairs with the highest mutual information
df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_textHpo_textHpo.csv')
df_mf_textHpo_textHpo.sort_values(by='mf', ascending=False).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
12173,HP:0001892,HP:0011028,0.706285,Abnormal bleeding,Abnormality of blood circulation
5481,HP:0011947,HP:0012649,0.625522,Respiratory tract infection,Increased inflammatory response
5480,HP:0011947,HP:0012647,0.625481,Respiratory tract infection,Abnormal inflammatory response
3229,HP:0010978,HP:0011947,0.565861,Abnormality of immune system physiology,Respiratory tract infection
11385,HP:0011024,HP:0025033,0.543392,Abnormality of the gastrointestinal tract,Abnormality of digestive system morphology
2329,HP:0002715,HP:0011947,0.481571,Abnormality of the immune system,Respiratory tract infection
1565,HP:0000969,HP:0002103,0.455301,Edema,Abnormality of the pleura
2876,HP:0002103,HP:0011032,0.451888,Abnormality of the pleura,Abnormality of fluid regulation
12690,HP:0011029,HP:0100659,0.450224,Internal hemorrhage,Abnormality of the cerebral vasculature
12524,HP:0011028,HP:0100659,0.447784,Abnormality of blood circulation,Abnormality of the cerebral vasculature


## LabHpo -- LabHpo
Their mutual information tells how much they correlate with each other.

In [115]:
df_mf_labHpo_labHpo = mf_labHpo_labHpo.mf_labeled()
# add labels (the id -> name map is fetched once per column rather than once per row)
df_mf_labHpo_labHpo['P1_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_labHpo_labHpo.P1)))
df_mf_labHpo_labHpo['P2_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_labHpo_labHpo.P2)))
# add p values if they exist

# P1-P2 and P2-P1 are identical, so keep only the ordering with P1 < P2
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[df_mf_labHpo_labHpo.P1 < df_mf_labHpo_labHpo.P2, :].reset_index(drop=True)

# remove directly dependent terms (pairs for which hpo.has_dependency is truthy)
mask = np.array([hpo.has_dependency(p1, p2) for p1, p2 in zip(df_mf_labHpo_labHpo.P1, df_mf_labHpo_labHpo.P2)])
df_mf_labHpo_labHpo = df_mf_labHpo_labHpo.loc[np.logical_not(mask), :].reset_index(drop=True)
# export the full ranking, then display the 50 pairs with the highest mutual information
df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).to_csv('../../../data/mf_regardless_of_diseases/mf_labHpo_labHpo.csv')
df_mf_labHpo_labHpo.sort_values(by='mf', ascending=False).reset_index(drop=True).head(n=50)

Unnamed: 0,P1,P2,mf,P1_label,P2_label
0,HP:0002157,HP:0031970,0.744493,Azotemia,Abnormal blood urea nitrogen concentration
1,HP:0020061,HP:0031850,0.575781,Abnormal hemoglobin concentration,Abnormal hematocrit
2,HP:0020062,HP:0031851,0.546701,Decreased hemoglobin concentration,Reduced hematocrit
3,HP:0020058,HP:0020061,0.543036,Abnormal red blood cell count,Abnormal hemoglobin concentration
4,HP:0020058,HP:0031850,0.533428,Abnormal red blood cell count,Abnormal hematocrit
5,HP:0020060,HP:0020062,0.514094,Decreased red blood cell count,Decreased hemoglobin concentration
6,HP:0500164,HP:0500165,0.499272,Abnormal blood carbon dioxide level,Abnormal blood oxygen level
7,HP:0020060,HP:0031851,0.48535,Decreased red blood cell count,Reduced hematocrit
8,HP:0020061,HP:0031851,0.433088,Abnormal hemoglobin concentration,Reduced hematocrit
9,HP:0001882,HP:0004332,0.417853,Leukopenia,Abnormal lymphocyte morphology


## Mutual information between textHpo and labHpo with respect to diagnoses

At each admission, patients could receive multiple diagnosis codes. One of them is designated as "primary" (in MIMIC, it has a rank of 1) and the others as secondary (rank 2, 3...). Therefore, the analysis was run under two scenarios: 
1. Only primary diagnosis is considered. 
2. All diagnoses are considered equally. 

Under the first scenario, a patient is considered to be a case only if the corresponding billing code is listed as "primary". Under the second scenario, a patient is considered to be a case when the corresponding billing code is listed as either primary or secondary.   

In [None]:
# define a function that takes in a MutualInfoXYz instance and p values and returns nicely formatted dataframes
def render_dataframes():
    """Placeholder: not implemented yet; takes no arguments and returns None."""
    pass


# There will be several dataframes, one of which is the master  



# also call a Java program to render a nice html file
def render_html():
    """Placeholder: not implemented yet; takes no arguments and returns None."""
    pass

In [4]:
def mf_dataframes(mf_diagnosis_phenotypes, p_mf_Xz, p_mf_Yz, p_mf_XY_z, p_mf_XY_given_z, p_synergy, p_mf_XY_omit_z):
    """Tabulate mutual-information results, with optional p values, as dataframes.

    Parameters
    ----------
    mf_diagnosis_phenotypes
        Object exposing ``vars_labels`` (a mapping whose two values are the X
        and Y phenotype labels, in that order) and the methods
        ``mutual_info_Xz``, ``mutual_info_Yz``, ``mutual_info_XY_z``,
        ``mutual_info_XY_given_z``, ``synergy_XY2z`` and
        ``mutual_info_XY_omit_z``.
    p_mf_Xz, p_mf_Yz : array-like or None
        p values for the per-phenotype statistics (one per X label / Y label).
    p_mf_XY_z, p_mf_XY_given_z, p_synergy, p_mf_XY_omit_z : array-like or None
        p values for the pairwise statistics; flattened in row-major order, so
        they are assumed to be laid out like the pairwise statistics
        themselves (X along rows, Y along columns) -- TODO confirm.

    Any p-value argument that is ``None`` is replaced by a column of ``-1``.

    Returns
    -------
    (df_mf_Xz, df_mf_Yz, df_mf_XY_z)
        Per-X table (``X``, ``mf_Xz``, ``p_mf_Xz``), per-Y table (``Y``,
        ``mf_Yz``, ``p_mf_Yz``) and the pairwise table with one row per
        (X, Y) combination.
    """
    X_labels, Y_labels = mf_diagnosis_phenotypes.vars_labels.values()
    M1 = len(X_labels)
    M2 = len(Y_labels)

    def _pairwise_p(values):
        # pairwise p values flattened row-major; -1 marks "not available"
        if values is None:
            return np.full(M1 * M2, -1)
        return np.asarray(values).ravel()

    mf_Xz = mf_diagnosis_phenotypes.mutual_info_Xz()
    mf_Yz = mf_diagnosis_phenotypes.mutual_info_Yz()

    # mutual information between single phenotypes and diagnosis
    df_mf_Xz = pd.DataFrame(data={'X': X_labels, 'mf_Xz': mf_Xz})
    df_mf_Yz = pd.DataFrame(data={'Y': Y_labels, 'mf_Yz': mf_Yz})

    # joint and conditional mutual information, and synergy
    mf_XY_z = mf_diagnosis_phenotypes.mutual_info_XY_z()
    mf_XY_given_z = mf_diagnosis_phenotypes.mutual_info_XY_given_z()
    mf_synergy = mf_diagnosis_phenotypes.synergy_XY2z()

    # mutual information between phenotypes without considering diagnosis
    mf_XY_omit_z = mf_diagnosis_phenotypes.mutual_info_XY_omit_z()

    # one row per (X, Y) pair: X varies slowest, Y fastest, matching the
    # row-major flattening of the pairwise matrices
    df_mf_XY_z = pd.DataFrame()
    df_mf_XY_z['X'] = np.repeat(X_labels, M2)
    df_mf_XY_z['Y'] = np.tile(Y_labels, M1)
    df_mf_XY_z['mf_XY_z'] = np.asarray(mf_XY_z).ravel()
    df_mf_XY_z['mf_XY_given_z'] = np.asarray(mf_XY_given_z).ravel()
    df_mf_XY_z['synergy'] = np.asarray(mf_synergy).ravel()
    df_mf_XY_z['mf_XY_omit_z'] = np.asarray(mf_XY_omit_z).ravel()

    # add p values; otherwise, assign -1.
    # The per-phenotype fallbacks must have M1 / M2 entries (not M1*M2) to
    # match the number of rows of their tables.
    df_mf_Xz['p_mf_Xz'] = p_mf_Xz if p_mf_Xz is not None else np.full(M1, -1)
    df_mf_Yz['p_mf_Yz'] = p_mf_Yz if p_mf_Yz is not None else np.full(M2, -1)
    df_mf_XY_z['p_mf_XY_z'] = _pairwise_p(p_mf_XY_z)
    df_mf_XY_z['p_mf_XY_given_z'] = _pairwise_p(p_mf_XY_given_z)
    df_mf_XY_z['p_synergy'] = _pairwise_p(p_synergy)
    df_mf_XY_z['p_mf_XY_omit_z'] = _pairwise_p(p_mf_XY_omit_z)

    return df_mf_Xz, df_mf_Yz, df_mf_XY_z

def filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z):
    """Attach per-phenotype columns to the pairwise table and drop redundant pairs.

    The pairwise table is left-joined with the X table (on ``X``) and the Y
    table (on ``Y``). Rows are then kept only when ``X < Y`` and when the
    module-level ``hpo.has_dependency(X, Y)`` is falsy. The returned frame has
    a fresh 0..n-1 index.
    """
    with_x = df_mf_XY_z.merge(df_mf_Xz, how='left', on=['X'])
    df_merged = with_x.merge(df_mf_Yz, how='left', on=['Y'])

    # (a, b) is treated as the same pair as (b, a): keep the ordering with X < Y
    ordered = df_merged.loc[df_merged.X < df_merged.Y, :].reset_index(drop=True)

    # flag pairs whose two terms depend on each other in the ontology
    dependent = np.array([hpo.has_dependency(x_term, y_term)
                          for x_term, y_term in zip(ordered.X, ordered.Y)])
    df_filtered = ordered.loc[np.logical_not(dependent), :].reset_index(drop=True)
    return df_filtered

def entropy(case, control):
    """Return the binary entropy, in bits, of a case/control split.

    Parameters
    ----------
    case, control : number or array-like
        Counts of cases and controls (broadcast against each other).

    Returns
    -------
    Entropy ``-(p*log2(p) + q*log2(q))`` with ``p = case / (case + control)``
    and ``q = control / (case + control)``. A zero count contributes 0 (the
    ``0 * log2(0) = 0`` convention) instead of NaN. The result is NaN only
    when ``case + control`` is 0.
    """
    case = np.asarray(case, dtype=float)
    control = np.asarray(control, dtype=float)
    total = case + control
    # log2(0) and 0 * -inf are expected below and handled by np.where
    with np.errstate(divide='ignore', invalid='ignore'):
        h = 0.0
        for count in (case, control):
            p = count / total
            h = h - np.where(p == 0, 0.0, p * np.log2(p))
    return h
    

def load_p_values(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    handle = open(path, 'rb')
    try:
        return pickle.load(handle)
    finally:
        handle.close()

# Element-wise formatter: fraction -> percentage string with two decimals and a
# leading space, e.g. 0.5 -> ' 50.00%'. `otypes` is given explicitly because
# np.vectorize cannot infer the output type from an empty input and would raise.
convert_to_percent = np.vectorize(lambda x: ' {:.2f}%'.format(x * 100), otypes=[str])

In [5]:
# Choose which diagnosis scenario to analyse; this selects the data sub-directory.
primary_only = True
diag_dir = "primary_only" if primary_only else "primary_and_secondary"

# Load the per-diagnosis summaries for each pair type.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../../../data/mf_regarding_diseases/{}/summaries_diagnosis_textHpo_labHpo.obj'.format(diag_dir), 'rb') as f:
    summaries_diagnosis_textHpo_labHpo = pickle.load(f)

with open('../../../data/mf_regarding_diseases/{}/summaries_diagnosis_textHpo_textHpo.obj'.format(diag_dir), 'rb') as f:
    summaries_diagnosis_textHpo_textHpo = pickle.load(f)

with open('../../../data/mf_regarding_diseases/{}/summaries_diagnosis_labHpo_labHpo.obj'.format(diag_dir), 'rb') as f:
    summaries_diagnosis_labHpo_labHpo = pickle.load(f)

In [6]:
# list the diagnosis codes for which summaries were loaded
summaries_diagnosis_textHpo_labHpo.keys()

dict_keys(['428', '584', '038', '493'])

### textHpo-labHpo pairs

In [7]:
# Build the per-phenotype and pairwise tables for one diagnosis code
# (textHpo as X, labHpo as Y). `disease` must be a key of the summaries dict.
disease = '493'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_labHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/{}/{}/p_value_textHpo_labHpo_{}_{}.obj'.format(diag_dir, disease, disease, diag_dir))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
# Flatten the summary counts to one row per pair with 8 count columns.
# NOTE(review): the +/- column names presumably encode presence/absence of
# (X, Y, diagnosis), and the rows are assumed to be in the same order as
# df_mf_XY_z -- confirm against the summary class.
s = summaries_diagnosis_textHpo_labHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = np.sum(s, axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data = s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# add an extra column for the sum of the 8 counts
s['sum'] = s_sum
# column-wise concat aligns on the (default integer) row index of both frames
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [8]:
# preview the filtered pairwise table
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,+--,-++,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz
0,HP:0002086,HP:0032180,0.000589,0.109064,0.000331,0.108733,0.0,0.0,0.0,0.0,...,4055,16,5178,12,5193,58976,4e-05,0.0305,0.000217,0.0
1,HP:0002086,HP:0012337,0.000166,0.110932,1e-06,0.110931,0.001188,0.0,0.008813,0.0,...,6512,19,4203,9,6168,58976,4e-05,0.0305,0.000125,0.00075
2,HP:0002086,HP:0020058,0.000781,0.102745,0.000312,0.102433,0.0,0.0,0.0,0.0,...,8833,11,3673,17,6698,58976,4e-05,0.0305,0.000428,0.0
3,HP:0002086,HP:0003111,0.000742,0.099781,0.000309,0.099472,0.0,0.0,0.0,0.0,...,7950,12,4014,16,6357,58976,4e-05,0.0305,0.000393,0.0
4,HP:0002086,HP:0031850,0.001024,0.102712,0.000298,0.102414,0.0,0.0,0.0,0.0,...,8614,7,3740,21,6631,58976,4e-05,0.0305,0.000686,0.0


In [9]:
# names of the statistics for which p values were loaded
p_values.keys()

dict_keys(['mf_Xz', 'mf_Yz', 'mf_XY_z', 'mf_XY_given_z', 'synergy', 'mf_XY_omit_z'])

In [10]:
# mutual information between textHpo and diagnosis, ranked from highest to lowest
# (the id -> name map is fetched once rather than once per row)
df_mf_textHpo_diagnosis = (
    df_mf_Xz
    .assign(X_label=np.array(list(map(hpo.term_id2name_map().get, df_mf_Xz.X))))
    .sort_values(by='mf_Xz', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'X': 'P', 'mf_Xz': 'mf_P_diag', 'p_mf_Xz': 'p_mf_P_diag', 'X_label': 'P_label'})
)

df_mf_textHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/{}/mf_textHpo_diag_{}.csv'.format(diag_dir,disease))
df_mf_textHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0002202,0.000951,0.0,Pleural effusion
1,HP:0002098,0.000842,0.0,Respiratory distress
2,HP:0100598,0.000388,0.0,Pulmonary edema
3,HP:0000001,0.00029,0.0,All
4,HP:0000118,0.00029,0.0,Phenotypic abnormality


In [11]:
# mutual information between labHpo and diagnosis, ranked from highest to lowest
# (the id -> name map is fetched once rather than once per row)
df_mf_labHpo_diagnosis = (
    df_mf_Yz
    .assign(Y_label=np.array(list(map(hpo.term_id2name_map().get, df_mf_Yz.Y))))
    .sort_values(by='mf_Yz', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'Y': 'P', 'mf_Yz': 'mf_P_diag', 'p_mf_Yz': 'p_mf_P_diag', 'Y_label': 'P_label'})
)

df_mf_labHpo_diagnosis.to_csv('../../../data/mf_regarding_diseases/{}/mf_labHpo_diag_{}.csv'.format(diag_dir, disease))
df_mf_labHpo_diagnosis.head()

Unnamed: 0,P,mf_P_diag,p_mf_P_diag,P_label
0,HP:0004363,0.001605,0.0,Abnormal circulating calcium concentration
1,HP:0002901,0.001551,0.0,Hypocalcemia
2,HP:0010927,0.001442,0.0,Abnormal blood inorganic cation concentration
3,HP:0031851,0.001371,0.0,Reduced hematocrit
4,HP:0020062,0.001325,0.0,Decreased hemoglobin concentration


In [12]:
# save synergies
# Label columns are added to the filtered table in place (later cells read them);
# the id -> name map is fetched once per column rather than once per row.
df_mf_XY_z_filtered['P1_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_XY_z_filtered.X)))
df_mf_XY_z_filtered['P2_label'] = np.array(list(map(hpo.term_id2name_map().get, df_mf_XY_z_filtered.Y)))
# Export the pairs ranked by synergy, highest first, with renamed columns.
(
    df_mf_XY_z_filtered
    .rename(columns={'X':'P1', 'Y': 'P2'})
    .sort_values(by=['synergy'], ascending=False)
    .reset_index(drop=True)
    .loc[:, ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'synergy', 'p_mf_Xz', 'p_mf_Yz', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum']]
    .rename(columns={'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'})
    .to_csv('../../../data/mf_regarding_diseases/{}/df_synergy_textHpo_labHpo_{}.csv'.format(diag_dir, disease))
)

In [13]:
# save ratios between conditional mutual info and overall mutual info
# mf_ratio = I(X;Y | diagnosis) / I(X;Y ignoring diagnosis), ranked from highest to lowest.
# NOTE(review): rows with mf_XY_omit_z == 0 yield inf/NaN ratios; none are
# filtered out here -- confirm whether that can occur in the data.
df_mf_vs_conditional_mf = df_mf_XY_z_filtered \
    .assign(mf_ratio=df_mf_XY_z_filtered.mf_XY_given_z/df_mf_XY_z_filtered.mf_XY_omit_z) \
    .sort_values(by='mf_ratio', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2'})
# The export renames the remaining columns and selects a subset. The 'X'/'Y'
# entries in this second rename are no-ops because those columns were already
# renamed above. This relies on the P1_label/P2_label columns already existing
# on df_mf_XY_z_filtered.
df_mf_vs_conditional_mf \
    .rename(columns={'X': 'P1', 'Y': 'P2', 
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
                    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag', 
                    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_ratio_textHpo_labHpo_{}.csv'.format(diag_dir, disease))
df_mf_vs_conditional_mf.head(n=10)

Unnamed: 0,P1,P2,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0002098,HP:0002901,0.00274,0.008821,0.000348,0.008473,0.0,0.0,0.0,0.0,...,111,26054,58976,0.000842,0.0,0.001551,0.0,Respiratory distress,Hypocalcemia,1.041025
1,HP:0002098,HP:0004363,0.002792,0.008924,0.000345,0.008579,0.0,0.0,0.0,0.0,...,111,25130,58976,0.000842,0.0,0.001605,0.0,Respiratory distress,Abnormal circulating calcium concentration,1.040237
2,HP:0002098,HP:0031851,0.002549,0.008998,0.000337,0.008662,0.0,0.0,0.0,0.0,...,97,19205,58976,0.000842,0.0,0.001371,0.0,Respiratory distress,Reduced hematocrit,1.038855
3,HP:0002098,HP:0010927,0.002618,0.009112,0.000334,0.008778,0.0,0.0,0.0,0.0,...,106,23166,58976,0.000842,0.0,0.001442,0.0,Respiratory distress,Abnormal blood inorganic cation concentration,1.038095
4,HP:0002098,HP:0020062,0.002493,0.00959,0.000326,0.009264,0.0,0.0,0.0,0.0,...,106,23712,58976,0.000842,0.0,0.001325,0.0,Respiratory distress,Decreased hemoglobin concentration,1.035216
5,HP:0002098,HP:0010929,0.002477,0.01222,0.000412,0.011807,0.0,0.0,0.0,0.0,...,89,17415,58976,0.000842,0.0,0.001223,0.0,Respiratory distress,Abnormal blood cation concentration,1.034901
6,HP:0002098,HP:0020058,0.001546,0.008256,0.000276,0.00798,0.0,0.0,0.0,0.0,...,57,13982,58976,0.000842,0.0,0.000428,0.0,Respiratory distress,Abnormal red blood cell count,1.034603
7,HP:0002098,HP:0020060,0.002319,0.010699,0.000334,0.010365,0.0,0.0,0.0,0.0,...,106,25875,58976,0.000842,0.0,0.001144,0.0,Respiratory distress,Decreased red blood cell count,1.03221
8,HP:0002098,HP:0031850,0.001787,0.008437,0.000259,0.008178,0.0,0.0,0.0,0.0,...,68,13745,58976,0.000842,0.0,0.000686,0.0,Respiratory distress,Abnormal hematocrit,1.031725
9,HP:0002098,HP:0011873,0.002157,0.007831,0.000234,0.007597,0.0,0.0,0.0,0.0,...,119,32883,58976,0.000842,0.0,0.001081,0.0,Respiratory distress,Abnormal platelet count,1.030844


In [14]:
# Keep the top `percentile` fraction of pairs (by synergy) as Cytoscape edges.
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

# Term ids are prefixed with 'Rad_' (P1) and 'Lab_' (P2) so that the same HPO id
# coming from the two sources becomes two distinct node ids.
df_4_cytoscape = df_mf_XY_z_filtered \
    .rename(columns={'X':'P1', 'Y': 'P2'}) \
    .assign(P1_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])) \
    .assign(P2_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])) \
    .sort_values(by='synergy', ascending=False) \
    .assign(P1 = lambda x: 'Rad_' + x['P1']) \
    .assign(P2 = lambda x: 'Lab_' + x['P2']) \
    .head(n = n)


# edges (to_csv does not create the cytoscape/ directory; it must already exist)
df_4_cytoscape \
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/edges_textHpo_labHpo_{}.csv'.format(diag_dir, disease))

In [15]:
# nodes: one row per distinct (term_id, term_label, type) among the selected edges
nodes = pd.DataFrame(data={
    'term_id': np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2]),
    'term_label': np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label]),
    # first half of the concatenation holds the P1 ('Rad') ids, second half the P2 ('Lab') ids
    'type': np.repeat(['Rad', 'Lab'], len(df_4_cytoscape)),
}).drop_duplicates()
# `nodes` is already de-duplicated, so only the index is reset before export
nodes.reset_index(drop=True).to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/nodes_textHpo_labHpo_{}.csv'.format(diag_dir, disease))

### labHpo-labHpo pairs

In [16]:
# Build the per-phenotype and pairwise tables for one diagnosis code
# (labHpo as both X and Y). `disease` must be a key of the summaries dict.
# NOTE: this overwrites the tables produced for the previous pair type.
disease = '493'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_labHpo_labHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/{}/{}/p_value_labHpo_labHpo_{}_{}.obj'.format(diag_dir, disease, disease, diag_dir))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
# Flatten the summary counts to one row per pair with 8 count columns.
# NOTE(review): the +/- column names presumably encode presence/absence of
# (X, Y, diagnosis), and the rows are assumed to be in the same order as
# df_mf_XY_z -- confirm against the summary class.
s = summaries_diagnosis_labHpo_labHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = np.sum(s, axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data = s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# add an extra column for the sum of the 8 counts
s['sum'] = s_sum
# column-wise concat aligns on the (default integer) row index of both frames
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [17]:
# Add human-readable labels to the filtered table in place (later cells read them).
df_mf_XY_z_filtered['P1_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])
#df_mf_XY_z_filtered.sort_values(by='mf_XY_z', ascending=False).head(n=20)
# Rank pairs by their joint mutual information with the diagnosis.
df_mf_joint_vs_individual = df_mf_XY_z_filtered \
    .sort_values(by='mf_XY_z', ascending=False) \
    .reset_index(drop=True)
    #.rename(columns={'mf_joint': 'mf_P1P2_diag'}) \
    #.loc[:, ['P1', 'P2', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'P1_label', 'P2_label']]

# Export a subset of columns with P1/P2 naming.
# NOTE(review): the p-value columns are exported as 'p_mf_Xz'/'p_mf_Yz' (not
# renamed to 'p_mf_P1_diag'/'p_mf_P2_diag') and 'p_mf_XY_z' is not exported --
# confirm whether this naming is intended.
df_mf_joint_vs_individual.loc[:, ['X', 'Y', 'mf_Xz', 'mf_Yz', 'mf_XY_z', 'p_mf_Xz', 'p_mf_Yz', 'P1_label', 'P2_label']] \
    .rename(columns={'X': 'P1', 'Y': 'P2', 'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag'}) \
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_joint_vs_individual_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0001881,HP:0004363,0.002544,0.124616,0.000749,0.123867,0.0,0.0,0.0,0.0,...,7012,59,18260,58976,0.00019,6.2e-05,0.001605,0.0,Abnormal leukocyte morphology,Abnormal circulating calcium concentration
1,HP:0004363,HP:0010987,0.002544,0.124616,0.000749,0.123867,0.0,0.0,0.0,0.0,...,10331,59,18260,58976,0.001605,0.0,0.00019,0.0,Abnormal circulating calcium concentration,Abnormal cellular immune system morphology
2,HP:0004363,HP:0032251,0.002544,0.124616,0.000749,0.123867,0.0,0.0,0.0,0.0,...,10331,59,18260,58976,0.001605,0.0,0.00019,0.0,Abnormal circulating calcium concentration,Abnormal immune system morphology
3,HP:0004363,HP:0011893,0.002544,0.124616,0.000749,0.123867,0.0,0.0,0.0,0.0,...,10331,59,18260,58976,0.001605,0.0,0.00019,0.0,Abnormal circulating calcium concentration,Abnormal leukocyte count
4,HP:0002715,HP:0004363,0.002535,0.124723,0.000744,0.123978,0.0,0.0,0.0,0.0,...,6970,59,18220,58976,0.000186,0.0,0.001605,0.0,Abnormality of the immune system,Abnormal circulating calcium concentration


In [18]:
# entropy (in bits) of the case/control split for the current diagnosis
entropy(mf_diagnosis_phenotypes.case_N, mf_diagnosis_phenotypes.control_N) 

0.03464018958189605

In [19]:
# Export the lab-lab pairs ranked by synergy (highest first), then display the top 20.
# NOTE(review): the 'sum' column is not included in this export, whereas the
# textHpo-labHpo synergy export includes it -- confirm whether intentional.
df_mf_XY_z_filtered \
    .sort_values(by='synergy', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'X': 'P1', 'Y': 'P2','mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag'}) \
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy', 'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---']] \
    .to_csv('../../../data/mf_regarding_diseases/{}/df_synergy_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False).head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
247,HP:0010929,HP:0012337,0.002373,0.283079,0.001025,0.282054,0.0,0.0,0.0,0.0,...,7986,27,11294,58976,0.001223,0.0,0.000125,0.000625,Abnormal blood cation concentration,Abnormal homeostasis
358,HP:0001881,HP:0010929,0.002355,0.138945,0.000942,0.138003,0.0,0.0,0.0,0.0,...,11038,51,14234,58976,0.00019,6.2e-05,0.001223,0.0,Abnormal leukocyte morphology,Abnormal blood cation concentration
256,HP:0010929,HP:0032251,0.002355,0.138945,0.000942,0.138003,0.0,0.0,0.0,0.0,...,5046,51,14234,58976,0.001223,0.0,0.00019,0.0,Abnormal blood cation concentration,Abnormal immune system morphology
254,HP:0010929,HP:0011893,0.002355,0.138945,0.000942,0.138003,0.0,0.0,0.0,0.0,...,5046,51,14234,58976,0.001223,0.0,0.00019,0.0,Abnormal blood cation concentration,Abnormal leukocyte count
255,HP:0010929,HP:0010987,0.002355,0.138945,0.000942,0.138003,0.0,0.0,0.0,0.0,...,5046,51,14234,58976,0.001223,0.0,0.00019,0.0,Abnormal blood cation concentration,Abnormal cellular immune system morphology
293,HP:0002715,HP:0010929,0.002348,0.139407,0.000938,0.138469,0.0,0.0,0.0,0.0,...,10974,51,14216,58976,0.000186,0.0,0.001223,0.0,Abnormality of the immune system,Abnormal blood cation concentration
141,HP:0003111,HP:0012337,0.001446,0.31285,0.000929,0.311922,0.0,0.0,0.0,0.0,...,4033,22,10274,58976,0.000393,0.0,0.000125,0.000625,Abnormal blood ion concentration,Abnormal homeostasis
109,HP:0012337,HP:0031850,0.001687,0.256094,0.000876,0.255218,0.0,0.0,0.0,0.0,...,2772,24,9908,58976,0.000125,0.000812,0.000686,0.0,Abnormal homeostasis,Abnormal hematocrit
107,HP:0012337,HP:0032180,0.001175,0.264718,0.000832,0.263886,0.0,0.0,0.0,0.0,...,4851,20,7829,58976,0.000125,0.000812,0.000217,0.0,Abnormal homeostasis,Abnormal circulating metabolite concentration
839,HP:0001974,HP:0010929,0.002136,0.121237,0.000825,0.120412,0.0,0.0,0.0,0.0,...,17621,80,16555,58976,8.7e-05,0.002313,0.001223,0.0,Leukocytosis,Abnormal blood cation concentration


In [20]:
# preview the filtered pairwise table (unsorted)
df_mf_XY_z_filtered.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
0,HP:0001871,HP:0001939,0.000312,0.078676,0.0001395103,0.078537,0.0,0.0,0.000313,0.0,...,575,6,1869,58976,3.2e-05,0.03775,0.00014,0.00025,Abnormality of blood and blood-forming tissues,Abnormality of metabolism/homeostasis
1,HP:0001871,HP:0032180,0.000218,0.071832,-3.087502e-05,0.071863,0.0,0.0,0.001375,0.0,...,409,13,2035,58976,3.2e-05,0.03775,0.000217,0.0,Abnormality of blood and blood-forming tissues,Abnormal circulating metabolite concentration
2,HP:0001871,HP:0012337,0.000288,0.072915,0.0001317287,0.072783,0.0,0.0,0.000625,0.0,...,197,9,2247,58976,3.2e-05,0.03775,0.000125,0.000625,Abnormality of blood and blood-forming tissues,Abnormal homeostasis
3,HP:0001871,HP:0003111,0.000405,0.072442,-1.981089e-05,0.072462,0.0,0.0,0.001437,0.0,...,128,13,2316,58976,3.2e-05,0.03775,0.000393,0.0,Abnormality of blood and blood-forming tissues,Abnormal blood ion concentration
4,HP:0001871,HP:0011014,4e-05,0.064398,9.838793e-08,0.064398,0.231,0.0,0.913688,0.0,...,80,13,2364,58976,3.2e-05,0.03775,8e-06,0.400062,Abnormality of blood and blood-forming tissues,Abnormal glucose homeostasis


In [21]:
# Ratio of conditional mutual information (given the diagnosis) to the mutual
# information that ignores the diagnosis, ranked from highest to lowest.
df_mf_vs_conditional_mf = (
    df_mf_XY_z_filtered
    .assign(mf_ratio=df_mf_XY_z_filtered.mf_XY_given_z / df_mf_XY_z_filtered.mf_XY_omit_z)
    .sort_values(by='mf_ratio', ascending=False)
    .reset_index(drop=True)
)

# Export a renamed subset of the columns; the displayed frame keeps the original names.
(
    df_mf_vs_conditional_mf
    .rename(columns={
        'X': 'P1', 'Y': 'P2',
        'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag',
        'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
        'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag',
        'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag',
        'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag',
    })
    .loc[:, ['P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag', 'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label', '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy']]
    .to_csv('../../../data/mf_regarding_diseases/{}/df_mf_ratio_labHpo_labHpo_{}.csv'.format(diag_dir, disease))
)
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0020059,HP:0020060,0.001182,0.000154,3.7e-05,0.000117,0.0,0.000188,0.000188,0.001313,...,105,17221,58976,1e-06,0.048625,0.001144,0.0,Increased red blood cell count,Decreased red blood cell count,1.319077
1,HP:0001877,HP:0001974,0.001296,0.032879,0.000592,0.032287,0.0,0.0,0.0,0.0,...,26,3374,58976,0.000616,0.0,8.7e-05,0.002563,Abnormal erythrocyte morphology,Leukocytosis,1.018345
2,HP:0001877,HP:0011893,0.001471,0.042521,0.000665,0.041856,0.0,0.0,0.0,0.0,...,18,3153,58976,0.000616,0.0,0.00019,0.0,Abnormal erythrocyte morphology,Abnormal leukocyte count,1.015887
3,HP:0001877,HP:0001881,0.001471,0.042521,0.000665,0.041856,0.0,0.0,0.0,0.0,...,18,3153,58976,0.000616,0.0,0.00019,0.0,Abnormal erythrocyte morphology,Abnormal leukocyte morphology,1.015887
4,HP:0001877,HP:0010987,0.001471,0.042521,0.000665,0.041856,0.0,0.0,0.0,0.0,...,18,3153,58976,0.000616,0.0,0.00019,0.0,Abnormal erythrocyte morphology,Abnormal cellular immune system morphology,1.015887


In [22]:
# Keep the top `percentile` fraction of pairs (by synergy) as Cytoscape edges.
percentile = 0.01
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

# Both term ids are prefixed with 'Lab_' to form the node ids.
df_4_cytoscape = df_mf_XY_z_filtered \
    .rename(columns={'X':'P1', 'Y': 'P2'}) \
    .assign(P1_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.X])) \
    .assign(P2_label = np.array([hpo.term_id2name_map().get(termid) for termid in df_mf_XY_z_filtered.Y])) \
    .sort_values(by='synergy', ascending=False) \
    .assign(P1 = lambda x: 'Lab_' + x['P1']) \
    .assign(P2 = lambda x: 'Lab_' + x['P2']) \
    .head(n = n)

# edges (to_csv does not create the cytoscape/ directory; it must already exist)
df_4_cytoscape \
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']] \
    .to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/edges_labHpo_labHpo_{}.csv'.format(diag_dir, disease))

In [23]:
# nodes: distinct (term_id, term_label) pairs among the selected edges, all typed 'Lab'
nodes = (
    pd.DataFrame(data={
        'term_id': np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2]),
        'term_label': np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label]),
    })
    .drop_duplicates()
    .assign(type='Lab')
)
nodes.to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/nodes_labHpo_labHpo_{}.csv'.format(diag_dir, disease))

### textHpo-textHpo pairs

In [24]:
# Build the per-phenotype and pairwise tables for one diagnosis code
# (textHpo as both X and Y). `disease` must be a key of the summaries dict.
# NOTE: this overwrites the tables produced for the previous pair type.
disease = '493'
mf_diagnosis_phenotypes = mf.MutualInfoXYz(summaries_diagnosis_textHpo_textHpo[disease])
p_values = load_p_values('../../../data/mf_regarding_diseases/{}/{}/p_value_textHpo_textHpo_{}_{}.obj'.format(diag_dir, disease, disease, diag_dir))

df_mf_Xz, df_mf_Yz, df_mf_XY_z = mf_dataframes(mf_diagnosis_phenotypes, p_values['mf_Xz'], p_values['mf_Yz'], \
                                               p_values['mf_XY_z'], p_values['mf_XY_given_z'], p_values['synergy'], \
                                              p_values['mf_XY_omit_z'])
# Flatten the summary counts to one row per pair with 8 count columns.
# NOTE(review): the +/- column names presumably encode presence/absence of
# (X, Y, diagnosis), and the rows are assumed to be in the same order as
# df_mf_XY_z -- confirm against the summary class.
s = summaries_diagnosis_textHpo_textHpo[disease].m2.reshape([-1, 8]).astype(int)
s_sum = np.sum(s, axis=-1)
#s = np.core.defchararray.add(s.astype(str), convert_to_percent(s / np.sum(s, axis=-1).reshape([-1, 1])))
s = pd.DataFrame(data = s, columns=['+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---'])
# add an extra column for the sum of the 8 counts
s['sum'] = s_sum
# column-wise concat aligns on the (default integer) row index of both frames
df_mf_XY_z = pd.concat([df_mf_XY_z, s], axis=1)
df_mf_XY_z_filtered = filter_df(df_mf_Xz, df_mf_Yz, df_mf_XY_z)

In [25]:
# Attach human-readable HPO names to both phenotypes of every pair.
# The id -> name map is fetched once rather than once per row; unknown ids map to None.
term_id2name = hpo.term_id2name_map()
df_mf_XY_z_filtered['P1_label'] = np.array([term_id2name.get(termid) for termid in df_mf_XY_z_filtered.X])
df_mf_XY_z_filtered['P2_label'] = np.array([term_id2name.get(termid) for termid in df_mf_XY_z_filtered.Y])

# Compare the joint mutual information of each pair with the diagnosis against
# the mutual information of each phenotype on its own, highest joint value first.
df_mf_joint_vs_individual = (
    df_mf_XY_z_filtered
    .sort_values(by='mf_XY_z', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'X': 'P1', 'Y': 'P2',
                     'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag',
                     'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag'})
    .loc[:, ['P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'mf_P1P2_diag',
             'p_mf_P1_diag', 'p_mf_P2_diag', 'p_mf_P1P2_diag', 'P1_label', 'P2_label']]
)

df_mf_joint_vs_individual.to_csv('../../../data/mf_regarding_diseases/{}/df_mf_joint_vs_individual_textHpo_textHpo_{}.csv'.format(diag_dir, disease))
df_mf_joint_vs_individual.head()

Unnamed: 0,P1,P2,mf_P1_diag,mf_P2_diag,mf_P1P2_diag,p_mf_P1_diag,p_mf_P2_diag,p_mf_P1P2_diag,P1_label,P2_label
0,HP:0002098,HP:0002202,0.000842,0.000951,0.002601,0.0,0.0,0.0,Respiratory distress,Pleural effusion
1,HP:0002098,HP:0100598,0.000842,0.000388,0.001739,0.0,0.0,0.0,Respiratory distress,Pulmonary edema
2,HP:0001640,HP:0002098,0.000266,0.000842,0.001451,0.0,0.0,0.0,Cardiomegaly,Respiratory distress
3,HP:0002098,HP:0100750,0.000842,0.000212,0.001416,0.0,0.0,0.0,Respiratory distress,Atelectasis
4,HP:0000969,HP:0002098,9.8e-05,0.000842,0.001279,0.001,0.0,0.0,Edema,Respiratory distress


In [26]:
# Rank the phenotype pairs by synergy; the ranking feeds both the CSV export and the preview below.
synergy_ranked = df_mf_XY_z_filtered.sort_values(by='synergy', ascending=False)

# X/Y/z column names are rewritten to P1/P2/diag names for the exported table
synergy_column_names = {
    'X': 'P1', 'Y': 'P2',
    'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag', 'mf_XY_z': 'mf_P1P2_diag',
    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag', 'p_mf_XY_z': 'p_mf_P1P2_diag',
}
synergy_export_columns = [
    'P1', 'P2', 'mf_P1_diag', 'mf_P2_diag', 'synergy',
    'p_mf_P1_diag', 'p_mf_P2_diag', 'p_synergy', 'P1_label', 'P2_label',
    '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum',
]
synergy_csv_path = '../../../data/mf_regarding_diseases/{}/df_synergy_textHpo_textHpo_{}.csv'.format(diag_dir, disease)

(
    synergy_ranked
    .reset_index(drop=True)
    .rename(columns=synergy_column_names)
    .loc[:, synergy_export_columns]
    .to_csv(synergy_csv_path)
)

# preview keeps the original row index and column names
synergy_ranked.head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,-+-,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label
2201,HP:0002098,HP:0002202,0.002601,0.054547,0.000809,0.053738,0.0,0.0,0.0,0.0,...,16387,118,32793,58976,0.000842,0.0,0.000951,0.0,Respiratory distress,Pleural effusion
2222,HP:0002098,HP:0100598,0.001739,0.061144,0.000509,0.060635,0.0,0.0,0.0,0.0,...,7198,125,41982,58976,0.000842,0.0,0.000388,0.0,Respiratory distress,Pulmonary edema
2197,HP:0002098,HP:0100750,0.001416,0.042051,0.000362,0.041689,0.0,0.0,0.0,0.0,...,20170,98,29010,58976,0.000842,0.0,0.000212,0.0,Respiratory distress,Atelectasis
1784,HP:0001640,HP:0002098,0.001451,0.038767,0.000344,0.038423,0.0,0.0,0.0,0.0,...,4817,117,38935,58976,0.000266,0.0,0.000842,0.0,Cardiomegaly,Respiratory distress
294,HP:0000969,HP:0002098,0.001279,0.030166,0.00034,0.029826,0.0,0.0,0.0,0.0,...,1325,61,18978,58976,9.8e-05,0.001,0.000842,0.0,Edema,Respiratory distress
582,HP:0000924,HP:0002098,0.001212,0.019054,0.000267,0.018788,0.0,0.0,0.0,0.0,...,3205,79,27214,58976,0.000104,0.001313,0.000842,0.0,Abnormality of the skeletal system,Respiratory distress
2188,HP:0002098,HP:0011032,0.001111,0.019163,0.00025,0.018913,0.0,0.0,0.0,0.0,...,30523,49,18657,58976,0.000842,0.0,1.9e-05,0.20475,Respiratory distress,Abnormality of fluid regulation
2200,HP:0002098,HP:0011842,0.001219,0.016998,0.000236,0.016761,0.0,0.0,0.0,0.0,...,17351,93,31829,58976,0.000842,0.0,0.000141,0.0,Respiratory distress,Abnormality of skeletal morphology
403,HP:0002597,HP:0012252,0.0003,0.068212,0.000227,0.067985,0.0,0.0,0.0,0.0,...,18216,22,9471,58976,6e-05,0.005313,1.3e-05,0.963,Abnormality of the vasculature,Abnormal respiratory system morphology
2194,HP:0002098,HP:0010978,0.001111,0.019828,0.000217,0.019611,0.0,0.0,0.0,0.0,...,22103,76,27077,58976,0.000842,0.0,5.1e-05,0.013125,Respiratory distress,Abnormality of immune system physiology


In [27]:
# NOTE(review): df_mf_vs_conditional_mf is only rebuilt for the textHpo-textHpo pairs in the
# following cell, so this preview shows whatever the variable held from an earlier cell --
# presumably the labHpo-labHpo frame (the rows shown are lab phenotypes). On a fresh kernel
# this cell depends on that earlier cell having run; confirm whether it is still wanted here.
df_mf_vs_conditional_mf.head()

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0020059,HP:0020060,0.001182,0.000154,3.7e-05,0.000117,0.0,0.000188,0.000188,0.001313,...,105,17221,58976,1e-06,0.048625,0.001144,0.0,Increased red blood cell count,Decreased red blood cell count,1.319077
1,HP:0001877,HP:0001974,0.001296,0.032879,0.000592,0.032287,0.0,0.0,0.0,0.0,...,26,3374,58976,0.000616,0.0,8.7e-05,0.002563,Abnormal erythrocyte morphology,Leukocytosis,1.018345
2,HP:0001877,HP:0011893,0.001471,0.042521,0.000665,0.041856,0.0,0.0,0.0,0.0,...,18,3153,58976,0.000616,0.0,0.00019,0.0,Abnormal erythrocyte morphology,Abnormal leukocyte count,1.015887
3,HP:0001877,HP:0001881,0.001471,0.042521,0.000665,0.041856,0.0,0.0,0.0,0.0,...,18,3153,58976,0.000616,0.0,0.00019,0.0,Abnormal erythrocyte morphology,Abnormal leukocyte morphology,1.015887
4,HP:0001877,HP:0010987,0.001471,0.042521,0.000665,0.041856,0.0,0.0,0.0,0.0,...,18,3153,58976,0.000616,0.0,0.00019,0.0,Abnormal erythrocyte morphology,Abnormal cellular immune system morphology,1.015887


In [28]:
# mf_ratio = mf_XY_given_z / mf_XY_omit_z for every filtered pair, largest ratio first.
df_mf_vs_conditional_mf = (
    df_mf_XY_z_filtered
    .assign(mf_ratio=lambda pairs: pairs.mf_XY_given_z / pairs.mf_XY_omit_z)
    .sort_values(by='mf_ratio', ascending=False)
    .reset_index(drop=True)
)

# X/Y/z column names are rewritten to P1/P2/diag names for the exported table
ratio_column_names = {
    'X': 'P1', 'Y': 'P2',
    'mf_Xz': 'mf_P1_diag', 'mf_Yz': 'mf_P2_diag',
    'mf_XY_z': 'mf_P1P2_diag', 'mf_XY_given_z': 'mf_P1P2_given_diag',
    'p_mf_Xz': 'p_mf_P1_diag', 'p_mf_Yz': 'p_mf_P2_diag',
    'p_mf_XY_z': 'p_mf_P1P2_diag', 'p_mf_XY_given_z': 'p_mf_P1P2_given_diag',
    'mf_XY_omit_z': 'mf_P1P2_omit_diag', 'p_mf_XY_omit_z': 'p_mf_P1P2_omit_diag',
}
ratio_export_columns = [
    'P1', 'P2', 'mf_P1P2_omit_diag', 'mf_P1P2_given_diag',
    'p_mf_P1P2_omit_diag', 'p_mf_P1P2_given_diag', 'mf_ratio', 'P1_label', 'P2_label',
    '+++', '++-', '+-+', '+--', '-++', '-+-', '--+', '---', 'sum', 'synergy', 'p_synergy',
]
ratio_csv_path = '../../../data/mf_regarding_diseases/{}/df_mf_ratio_textHpo_textHpo_{}.csv'.format(diag_dir, disease)

(
    df_mf_vs_conditional_mf
    .rename(columns=ratio_column_names)
    .loc[:, ratio_export_columns]
    .to_csv(ratio_csv_path)
)

df_mf_vs_conditional_mf.head(n=20)

Unnamed: 0,X,Y,mf_XY_z,mf_XY_given_z,synergy,mf_XY_omit_z,p_mf_XY_z,p_mf_XY_given_z,p_synergy,p_mf_XY_omit_z,...,--+,---,sum,mf_Xz,p_mf_Xz,mf_Yz,p_mf_Yz,P1_label,P2_label,mf_ratio
0,HP:0001945,HP:0002107,0.000536,0.00403,0.000115,0.003915,0.0,0.0,0.0,0.0,...,191,41867,58976,0.000213,6.2e-05,0.000209,0.0,Fever,Pneumothorax,1.02943
1,HP:0002098,HP:0002202,0.002601,0.054547,0.000809,0.053738,0.0,0.0,0.0,0.0,...,118,32793,58976,0.000842,0.0,0.000951,0.0,Respiratory distress,Pleural effusion,1.015048
2,HP:0002098,HP:0002107,0.00115,0.006906,0.0001,0.006806,0.0,0.0,0.0,0.0,...,120,41593,58976,0.000842,0.0,0.000209,0.0,Respiratory distress,Pneumothorax,1.014712
3,HP:0000924,HP:0002098,0.001212,0.019054,0.000267,0.018788,0.0,0.0,0.0,0.0,...,79,27214,58976,0.000104,0.001313,0.000842,0.0,Abnormality of the skeletal system,Respiratory distress,1.014192
4,HP:0002098,HP:0011842,0.001219,0.016998,0.000236,0.016761,0.0,0.0,0.0,0.0,...,93,31829,58976,0.000842,0.0,0.000141,0.0,Respiratory distress,Abnormality of skeletal morphology,1.014109
5,HP:0002098,HP:0002617,0.001184,0.008634,0.000116,0.008518,0.0,0.0,0.0,0.0,...,121,41769,58976,0.000842,0.0,0.000227,6.2e-05,Respiratory distress,Dilatation,1.01364
6,HP:0001392,HP:0002098,0.001101,0.013012,0.000172,0.012839,0.0,0.0,0.0,0.0,...,103,37247,58976,8.7e-05,0.002812,0.000842,0.0,Abnormality of the liver,Respiratory distress,1.013417
7,HP:0000765,HP:0002098,0.001106,0.010726,0.00014,0.010586,0.0,0.0,0.0,0.0,...,117,42454,58976,0.000124,0.000375,0.000842,0.0,Abnormality of the thorax,Respiratory distress,1.013269
8,HP:0002098,HP:0011032,0.001111,0.019163,0.00025,0.018913,0.0,0.0,0.0,0.0,...,49,18657,58976,0.000842,0.0,1.9e-05,0.20475,Respiratory distress,Abnormality of fluid regulation,1.013206
9,HP:0001438,HP:0002098,0.001047,0.009842,0.000118,0.009724,0.0,0.0,0.0,0.0,...,115,41953,58976,8.8e-05,0.00325,0.000842,0.0,Abnormality of abdomen morphology,Respiratory distress,1.0121


In [29]:
# save data for cytoscape: the strongest textHpo-textHpo synergy pairs as an edge list
percentile = 0.01  # fraction of filtered pairs to keep, ranked by synergy
# NOTE: with fewer than 100 filtered pairs n is 0 and the exported edge list is empty.
n = math.floor(len(df_mf_XY_z_filtered) * percentile)

# Look the id -> name map up once; it was previously requested again for every row.
term_id2name = hpo.term_id2name_map()

df_4_cytoscape = (
    df_mf_XY_z_filtered
    .rename(columns={'X': 'P1', 'Y': 'P2'})
    .sort_values(by='synergy', ascending=False)
    .head(n=n)
    # label only the rows that are kept; unknown term ids map to None
    .assign(P1_label=lambda x: [term_id2name.get(termid) for termid in x['P1']])
    .assign(P2_label=lambda x: [term_id2name.get(termid) for termid in x['P2']])
    # prefix the node ids (presumably to mark them as text-derived phenotypes in the network)
    .assign(P1=lambda x: 'Rad_' + x['P1'])
    .assign(P2=lambda x: 'Rad_' + x['P2'])
)

# edges
(
    df_4_cytoscape
    .loc[:, ['P1', 'P2', 'synergy', 'p_synergy']]
    .to_csv('../../../data/mf_regarding_diseases/{}/cytoscape/edges_textHpo_textHpo_{}.csv'.format(diag_dir, disease))
)

In [30]:
# nodes: one row per distinct (prefixed term id, label) that appears on either end of an edge
node_ids = np.concatenate([df_4_cytoscape.P1, df_4_cytoscape.P2])
node_labels = np.concatenate([df_4_cytoscape.P1_label, df_4_cytoscape.P2_label])
nodes = (
    pd.DataFrame(data={'term_id': node_ids, 'term_label': node_labels})
    .drop_duplicates()
    .assign(type='Rad')  # every node in this export gets the same type tag
)
nodes_csv_path = '../../../data/mf_regarding_diseases/{}/cytoscape/nodes_textHpo_textHpo_{}.csv'.format(diag_dir, disease)
nodes.to_csv(nodes_csv_path)