# Multiomics BMI Paper — ∆BMI-based Misclassification and Hierarchical Clustering

***by Kengo Watanabe***  

This Jupyter Notebook (with Python 3 kernel) visualized ∆BMI correlations across omics categories and ∆BMI-based misclassification (in the baseline Arivale cohort). Also, this notebook performed hierarchical clustering to reveal the underlying differences in ∆BMI misclassification, which was removed from the final paper but included in the original preprint.  

Input files:  
* Arivale baseline BMI predictions: 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-BothSex.tsv  
* Arivale baseline covariates and blood omics (preprocessed): 210104_Biological-BMI-paper_RF-imputation_baseline-\[metDF/protDF/combiDF\]-with-RF-imputation.tsv  
* OLS linear regressions of the retained variables on BMI: 220802_Multiomics-BMI-NatMed1stRevision_BMI-LASSO-bcoef_\[MetBMI/ProtBMI\]-BothSex-OLS.tsv  

Output figures and tables:  
* Figure 3a, 3c  
* Tables for Supplementary Data 10  
* Intermediate table for other notebooks (biological BMI summary)  
* (Figure 3d, 3e and Supplementary Figure 5b, 5c in the original preprint)  

Original notebook (memo for my future tracing):  
* dalek:\[JupyterLab HOME\]/220621_Multiomics-BMI-NatMedRevision/220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification.ipynb  

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#For Arial font
#!conda install -c conda-forge -y mscorefonts
##-> The below was also needed in matplotlib 3.4.2
#import shutil
#import matplotlib
#shutil.rmtree(matplotlib.get_cachedir())
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import time

from decimal import Decimal, ROUND_HALF_UP
import sys
from sklearn.preprocessing import StandardScaler
from scipy.cluster import hierarchy
from statsmodels.stats import multitest as multi
import matplotlib.patches as mpatches

!conda list

# packages in environment at /opt/conda/envs/arivale-py3:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
analytics                 0.1                      pypi_0    pypi
argon2-cffi               21.1.0           py39h3811e60_0    conda-forge
arivale-data-interface    0.1.0                    pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
atk-1.0                   2.36.0               h3371d22_4    conda-forge
attrs                     21.2.0             pyhd8ed1ab_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports                 1.0                        py_2    conda-forge
backports.functools_lru_cache 1.6.4              pyhd8ed1ab_0    conda-forge
biopython                 1.79             py39h3811e60_0    conda-forge
bleach 

## 1. Obesity classification

### 1-1. Clean the biological BMI dataframes

In [None]:
#Load every biological BMI prediction table and combine them with the measured BMI
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
tempL1 = ['log_BaseBMI', 'BaseBMI', 'Testing']#Columns repeated in every prediction table
tempL2 = ['MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI']
for table_i, bbmi in enumerate(tempL2):
    fileName = f'{bbmi}-BothSex.tsv'
    tempDF1 = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id':str})
               .set_index('public_client_id'))
    #The repeated columns are kept only once, taken from the first table
    if table_i==0:
        tempDF = tempDF1[tempL1]
    tempDF1 = tempDF1.drop(columns=tempL1)
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')

#Summary statistics (numeric columns only) with skewness appended as an extra row
display(tempDF)
numericDF = tempDF.select_dtypes(include=[np.number])
tempDF1 = tempDF.describe(include=[np.number])
tempDF1.loc['Skewness'] = stats.skew(numericDF)
display(tempDF1)

#Distribution of measured and predicted BMI, first on the log scale and then on the raw scale
tempD = {'BMI':'k', 'MetBMI':'b', 'ProtBMI':'r', 'ChemBMI':'g', 'CombiBMI':'m'}
scale_settings = {'log':('log_Base', r'BMI [kg/m$^2$] (log-scale)'),
                  'raw':('Base', r'BMI [kg/m$^2$]')}
for scale, (col_prefix, x_label) in scale_settings.items():
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    for bbmi, color in tempD.items():
        sns.distplot(tempDF[col_prefix+bbmi], label=bbmi, color=color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(x_label)
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()

#Keep the merged table for the following cells
bmiDF = tempDF

### 1-2. Obesity classification

In [None]:
#Assign a BMI class to the measured BMI and to every predicted BMI
def _bmi_class(value):
    #Upper bounds are exclusive: <18.5 Underweight, <25 Normal, <30 Overweight, otherwise Obese
    if np.isnan(value):
        return 'NotCalculated'
    for upper_bound, class_name in ((18.5, 'Underweight'), (25, 'Normal'), (30, 'Overweight')):
        if value < upper_bound:
            return class_name
    if value >= 30:
        return 'Obese'
    return 'Error?'#Just in case

tempD = {'BMI':'k', 'MetBMI':'b', 'ProtBMI':'r', 'ChemBMI':'g', 'CombiBMI':'m'}
for bmi in tempD:
    class_col = 'Base'+bmi+'_class'
    bmiDF[class_col] = [_bmi_class(value) for value in bmiDF['Base'+bmi].tolist()]
    #Confirmation: count and percentage of participants per class
    print(class_col+':')
    tempS1 = bmiDF[class_col].value_counts()
    tempDF = pd.DataFrame({'Count':tempS1, 'Percentage':tempS1/len(bmiDF)*100})
    display(tempDF)
    print('')

#Append the covariates (Race is not used in this study) to build the summary dataframe
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_RF-imputation_'
fileName = 'baseline-combiDF-with-RF-imputation.tsv'
tempDF = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
          .set_index('public_client_id'))
tempL = ['Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
bmiDF = pd.merge(bmiDF, tempDF[tempL], left_index=True, right_index=True, how='inner')
display(bmiDF)

#Export the summary table for the other notebooks
fileDir = './ExportData/'
ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
fileName = 'biologicalBMI-baseline-summary-BothSex.tsv'
bmiDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

### 1-3. Misclassification

In [None]:
#Calculate misclassification rate based on each biological BMI class
#Rows of the result: count and percentage of mismatches overall and within each measured-BMI class
tempL = ['MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI']
class_names = ['Underweight', 'Normal', 'Overweight', 'Obese']
row_labels = [label for group in ['Overall']+class_names
              for label in (group+'_count', group+' [%]')]
misclassDF = pd.DataFrame(index=row_labels)
for bbmi in tempL:
    tempDF = bmiDF.loc[bmiDF['Base'+bbmi+'_class']!='NotCalculated']#just in case
    #A participant is misclassified when the measured and predicted BMI classes differ
    mismatched = tempDF['BaseBMI_class']!=tempDF['Base'+bbmi+'_class']
    #Mismatches whose measured class is none of the four expected ones are only reported
    unexpected = mismatched & ~tempDF['BaseBMI_class'].isin(class_names)
    for _ in range(int(unexpected.sum())):
        print('Error?')
    values = []
    for group in ['Overall']+class_names:
        if group=='Overall':
            in_group = pd.Series(True, index=tempDF.index)
        else:
            in_group = tempDF['BaseBMI_class']==group
        n_group = int(in_group.sum())
        n_mismatched = int((mismatched & in_group).sum())
        #An empty class yields NaN instead of raising ZeroDivisionError
        rate = n_mismatched/n_group*100 if n_group>0 else np.nan
        values += [n_mismatched, rate]
    tempS = pd.Series(values, index=row_labels, name='vs. '+bbmi)
    misclassDF = pd.concat([misclassDF, tempS], axis=1)
display(misclassDF)

#Plot (without underweight)
tempD1 = {'vs. MetBMI':'vs. MetBMI class', 'vs. ProtBMI':'vs. ProtBMI class',
          'vs. ChemBMI':'vs. ChemBMI class', 'vs. CombiBMI':'vs. CombiBMI class'}
tempD2 = {'vs. MetBMI class':'b', 'vs. ProtBMI class':'r',
          'vs. ChemBMI class':'g', 'vs. CombiBMI class':'m'}
tempDF = misclassDF.loc[['Overall [%]', 'Normal [%]', 'Overweight [%]', 'Obese [%]']]
#Literal (non-regex) replacement so that the suffix is stripped regardless of the pandas default
tempDF.index = tempDF.index.str.replace(' [%]', '', regex=False)
tempDF.columns = tempDF.columns.map(tempD1)
tempDF = tempDF.reset_index().melt(var_name='Category', value_name='Misclassification', id_vars='index')
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5.5, 3))
sns.barplot(data=tempDF, x='index', y='Misclassification',
            hue='Category', dodge=True, palette=tempD2, edgecolor='black')
sns.despine()
plt.yticks(np.arange(0, 51, 10))
#Vertical separator between the 'Overall' group and the per-class groups
plt.axvline(x=(0+1)/2, **{'linestyle':'--', 'color':'k'})
##Add reference range (28-48 %)
plt.axhspan(ymin=28, ymax=48, facecolor='orange', alpha=0.2, zorder=0)
plt.axhline(y=28, **{'linestyle':'-', 'color':'orange', 'zorder':0})
plt.axhline(y=48, **{'linestyle':'-', 'color':'orange', 'zorder':0})
plt.ylabel('Misclassification [%]')
plt.xlabel('BMI class')
#plt.legend(bbox_to_anchor=(0.5, 1), loc='lower center', borderaxespad=0.5,
#           ncol=2, columnspacing=1, handletextpad=0.5)
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=0.5)
##Save
fileDir = './ExportFigures/'
ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
fileName = 'misclassification-rate.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 1-4. Calculate ∆BMI

> In contrast to biological age, the difference between the measured and predicted values depends on the absolute BMI value. Therefore, ∆BMI is defined as the relative difference: (bBMI - BMI) / BMI.  

In [None]:
#∆BMI: deviation of each predicted BMI from the measured BMI, as a percentage of the measured BMI
tempL = ['MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI']
measured_bmi = bmiDF['BaseBMI']
for bbmi in tempL:
    deviation = bmiDF['Base'+bbmi] - measured_bmi
    bmiDF[bbmi+'–BMI'] = deviation / measured_bmi * 100

#Summary statistics with skewness appended as an extra row
tempD = {'MetBMI–BMI':'b', 'ProtBMI–BMI':'r', 'ChemBMI–BMI':'g', 'CombiBMI–BMI':'m'}
tempDF = bmiDF[list(tempD)]
tempDF1 = tempDF.describe()
tempDF1.loc['Skewness'] = stats.skew(tempDF)
display(tempDF1)

#Overlaid distributions of ∆BMI for each omics-based BMI
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(4, 3))
for col_n, color in tempD.items():
    sns.distplot(tempDF[col_n], label=col_n, color=color)
sns.despine()
plt.ylabel('Density')
plt.xlabel(r'$\Delta$BMI [% BMI]')
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
plt.show()

### 1-5. Difference in ∆BMI between omics

> In this version, P-values from Pearson's correlation tests are adjusted across all comparisons.  

In [None]:
#Prepare DF: ∆BMI columns (renamed to the bare omics name) plus the measured BMI class
tempL = ['MetBMI–BMI', 'ProtBMI–BMI', 'ChemBMI–BMI', 'CombiBMI–BMI', 'BaseBMI_class']
tempDF = bmiDF[tempL]
tempDF.columns = tempDF.columns.str.replace('–BMI', '', regex=False)

#Check correlation matrix and extract lower triangle matrix
tempDF1 = tempDF.select_dtypes(include=[np.number])
tempDF1 = tempDF1.corr(method='pearson')
print('Pearson\'s r:')
display(tempDF1)
#Builtin bool is used because the np.bool alias was removed in NumPy 1.24
tempDF1 = tempDF1.where(np.tril(np.ones(tempDF1.shape), k=-1).astype(bool), other=np.nan)
tempDF1.index.rename('Variable1', inplace=True)
tempDF1 = tempDF1.reset_index().melt(var_name='Variable2', value_name='Pearson_r', id_vars=['Variable1'])
tempDF1 = tempDF1.dropna()

#Statistical tests: one Pearson's correlation test per variable pair of the lower triangle
tempDF2 = pd.DataFrame(columns=['Xvar', 'Yvar', 'N', 'DoF', 'Pearson_r', 'Pval'])
for xvar, yvar in zip(tempDF1['Variable2'], tempDF1['Variable1']):
    #Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF[xvar], tempDF[yvar])
    size = len(tempDF)
    dof = size - 2
    tempDF2.loc[xvar+'-vs-'+yvar] = [xvar, yvar, size, dof, pearson_r, pval]
##P-value adjustment by using Benjamini–Hochberg method
##(rows were appended one by one, so the column is object dtype; cast to float for the correction)
tempDF2['AdjPval'] = multi.multipletests(tempDF2['Pval'].astype(float), alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
tempDF2.index.rename('ComparisonLabel', inplace=True)
display(tempDF2)
##Save
fileDir = './ExportData/'
ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
fileName = 'bBMI-BMI-difference.tsv'
tempDF2.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Visualization: scatter plots in the lower triangle, distributions on the diagonal
tempD = {'Underweight':'blue', 'Normal':'green', 'Overweight':'orange', 'Obese':'red'}
sns.set(style='ticks', font='Arial', context='talk')
p = sns.PairGrid(tempDF, hue='BaseBMI_class', hue_order=list(tempD.keys()), palette=tempD,
                 height=2, aspect=1, layout_pad=0.0)
p.map_lower(sns.scatterplot, edgecolor='0.3', alpha=0.5, s=25)
p.map_diag(sns.distplot, axlabel=False, kde_kws={'alpha':0.8}, hist_kws={'edgecolor':'white', 'alpha':0.5})
##Hide the upper triangle
for i, j in zip(*np.triu_indices_from(p.axes, 1)):
    p.axes[i, j].set_visible(False)
##Common axis range for the diagonal and lower-triangle panels
for i, j in zip(*np.tril_indices_from(p.axes, 0)):
    p.axes[i, j].set(xlim=(-40, 85), xticks=np.arange(-25, 75.1, 25),
                     ylim=(-40, 85), yticks=np.arange(-25, 75.1, 25))
col_names = tempDF.columns.tolist()
for i, j in zip(*np.tril_indices_from(p.axes, -1)):
    p.axes[i, j].grid(axis='both', linestyle='--', color='gray', alpha=0.3)
    #Annotate Pearson's correlation
    xvar = col_names[j]
    yvar = col_names[i]
    pearson_r = tempDF2['Pearson_r'].loc[xvar+'-vs-'+yvar]
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval = tempDF2['AdjPval'].loc[xvar+'-vs-'+yvar]
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        significand, exponent = pval_text.split(sep='E-')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        #Half-up rounding can push the significand to 10.0; renormalize to 1.0 with a shifted exponent
        if significand=='10.0':
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:
            #Scientific notation for values below 0.01
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:
            #Plain decimal notation for values between 0.01 and 1
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    p.axes[i, j].annotate(text, xy=(0.025, 1.0), xycoords='axes fraction',
                          horizontalalignment='left', verticalalignment='top',
                          multialignment='left', fontsize='x-small', color='k')
pl = plt.legend(bbox_to_anchor=(0.8, 4.1), loc='upper right', title='BMI class')
##Add sample size in legend
for legend_text in pl.get_texts():
    bmi_class = legend_text.get_text()
    count = len(tempDF.loc[tempDF['BaseBMI_class']==bmi_class])
    legend_text.set_text(bmi_class+' ('+r'$n$'+' = '+f'{count:,}'+')')
##Add xy label annotation
label = r'$\Delta$'+'BMI (predicted '+r'$-$'+' measured) [% BMI]'
p.fig.text(x=0.545, y=0.0,#Manual adjustment
           s=label, fontsize='large',
           verticalalignment='top', horizontalalignment='center')
p.fig.text(x=-0.01, y=0.545,#Manual adjustment
           s=label, fontsize='large',
           verticalalignment='center', horizontalalignment='right', rotation='vertical')
##Save
fileDir = './ExportFigures/'
ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
fileName = 'bBMI-BMI-difference.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

## 2. Metabolomics space

### 2-1. Standardization

In [None]:
#Load the preprocessed baseline metabolomics table and keep only the analyte columns
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_RF-imputation_'
fileName = 'baseline-metDF-with-RF-imputation.tsv'
tempL = ['log_BaseBMI', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Race']#BMI and covariates
tempDF = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
          .set_index('public_client_id')
          .drop(columns=tempL))

#Z-score transformation (each column is centered and scaled independently)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
tempA = scaler.fit_transform(tempDF)
zscoreDF = pd.DataFrame(data=tempA, index=tempDF.index, columns=tempDF.columns)
tempDF = zscoreDF

display(zscoreDF.describe(include='all'))

### 2-2. Misclassification label

In [None]:
#Misclassification label dataframe
#An explicit copy is taken so that the new column is not written to a slice of bmiDF
labelDF = bmiDF.loc[bmiDF['BaseMetBMI_class']!='NotCalculated',
                    ['BaseBMI_class', 'BaseMetBMI_class']].copy()
print('Sample size of Z-score DF:', len(zscoreDF))
print('Sample size of label DF:', len(labelDF))

#Misclassification: 'Matched' when the measured and MetBMI-based classes agree
is_matched = labelDF['BaseBMI_class']==labelDF['BaseMetBMI_class']
labelDF['Misclassification'] = is_matched.map({True:'Matched', False:'Mismatched'})

print('\nSummary')
tempL = ['Underweight', 'Normal', 'Overweight', 'Obese']
for bmi_class in tempL:
    print(' • BMI class: '+bmi_class)
    tempDF = labelDF.loc[labelDF['BaseBMI_class']==bmi_class]
    nSamples = len(tempDF)
    print('    - Sample size:', nSamples)
    nMismatched = len(tempDF.loc[tempDF['Misclassification']=='Mismatched'])
    #An empty class reports nan instead of raising ZeroDivisionError
    pctMismatched = nMismatched/nSamples*100 if nSamples>0 else float('nan')
    print('    - Mismatched:', nMismatched, '(', pctMismatched, '%)')
    tempS = tempDF['BaseMetBMI_class'].value_counts()
    tempDF = pd.DataFrame({'Count': tempS, 'Percentage':tempS/len(tempDF)*100})
    print('    - Detail of MetBMI class:')
    display(tempDF)
    print('')

### 2-3. Color label for variables

In [None]:
#Column colour labels: sign of each variable's coefficient in the MetBMI LASSO model
def _lasso_contribution(variable, bcoef):
    #The model summary row is not a variable; it is tagged here and dropped below
    if variable=='MetBMI model':
        return 'NaN'
    if bcoef > 0:
        return 'Positive'
    if bcoef < 0:
        return 'Negative'
    return 'Error'

fileDir = './ExportData/'
ipynbName = '220802_Multiomics-BMI-NatMed1stRevision_BMI-LASSO-bcoef_'
fileName = 'MetBMI-BothSex-OLS.tsv'
tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('Variable')
tempL = [_lasso_contribution(variable, bcoef)
         for variable, bcoef in zip(tempDF.index.tolist(), tempDF['LASSObcoef'].tolist())]
tempDF['Contribution'] = tempL
tempDF = tempDF.loc[tempDF.index!='MetBMI model']
#Red for positive and blue for negative coefficients
tempD = {'Positive':'tab:red', 'Negative':'tab:blue'}
contribution_colors = tempDF['Contribution'].map(tempD)
colorDF = pd.DataFrame({'Contribution\nin LASSO':contribution_colors})
display(colorDF)

### 2-4. Hierarchical clustering: variables retained in ≥6 models and exhibiting ≥14% explained variance in BMI (top 15)

In [None]:
#Load the OLS regression results of the LASSO variables and apply the selection criteria
fileDir = './ExportData/'
ipynbName = '220802_Multiomics-BMI-NatMed1stRevision_BMI-LASSO-bcoef_'
fileName = 'MetBMI-BothSex-OLS.tsv'
tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('Variable')
#Keep rows with LASSOnZeros of at most 4 and R2 of at least 14
is_selected = (tempDF['LASSOnZeros']<=4) & (tempDF['R2']>=14)
tempDF = tempDF.loc[is_selected]
#tempDF = tempDF.iloc[0:15]
n_significant = len(tempDF.loc[tempDF['AdjPval']<0.05])
print('nrows:', len(tempDF))
print('Adjusted p-value < 0.05:', n_significant)
display(tempDF)

#Restrict the Z-score table to the selected variables
is_selected_col = zscoreDF.columns.isin(tempDF.index)
zscoreDF_select = zscoreDF.loc[:, is_selected_col]
print('Confirm Z-score DF:', zscoreDF_select.shape)

#Restrict the colour labels to the same variables
var_colorDF = colorDF.loc[colorDF.index.isin(zscoreDF_select.columns)]

In [None]:
def _row_color_frame(labels):
    """Return the row colour bars (BMI class, MetBMI class, misclassification) for a label table.

    labels: dataframe with 'BaseBMI_class', 'BaseMetBMI_class' and 'Misclassification' columns.
    """
    class_palette = {'Underweight':'tab:blue', 'Normal':'tab:green',
                     'Overweight':'tab:orange', 'Obese':'tab:red'}
    match_palette = {'Matched':'white', 'Mismatched':'black'}
    return pd.DataFrame({'BMI class':labels['BaseBMI_class'].map(class_palette),
                         'MetBMI class':labels['BaseMetBMI_class'].map(class_palette),
                         'Misclassification':labels['Misclassification'].map(match_palette)})


def _draw_clustermap(data, row_colors, cbar_pos, figsize):
    """Draw the Ward/Euclidean clustermap of data, print the elapsed time and return the grid.

    data: Z-score table (rows: participants, columns: variables).
    row_colors: frame from _row_color_frame; column colours come from var_colorDF.
    cbar_pos, figsize: layout arguments forwarded to sns.clustermap.
    """
    t_start = time.time()
    sns.set(style='ticks', font='Arial', context='notebook')
    cm = sns.clustermap(data, method='ward', metric='euclidean',
                        row_cluster=True, col_cluster=True, row_linkage=None, col_linkage=None,
                        row_colors=row_colors, col_colors=var_colorDF,
                        cbar_pos=cbar_pos, figsize=figsize,
                        center=0, vmin=-2.58, vmax=2.58)
    cm.cax.set_title(r'$Z$'+'-score')
    hm = cm.ax_heatmap.get_position()
    rd = cm.ax_row_dendrogram.get_position()
    cd = cm.ax_col_dendrogram.get_position()
    cm.ax_heatmap.set_position([hm.x0, hm.y0, hm.width, hm.height])
    #Shrink the dendrograms: right half of the row one, bottom quarter of the column one
    cm.ax_row_dendrogram.set_position([rd.x0+rd.width*0.5, rd.y0, rd.width*0.5, rd.height])
    cm.ax_col_dendrogram.set_position([cd.x0, cd.y0, cd.width, cd.height*0.25])
    plt.show()
    t_elapsed = time.time() - t_start
    print('• Elapsed time:', round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')
    return cm


def _test_cluster_enrichment(data, labels, cm):
    """Cut the row dendrogram into 2-5 clusters and test each cluster for misclassification.

    data: Z-score table that was clustered (rows: participants).
    labels: label table indexed like data, with a 'Misclassification' column.
    cm: clustermap grid of data, used only to relate cluster labels to the plotted rows.
    For every cut, each cluster is compared with all other clusters by a two-sided
    Fisher's exact test, and the P-values are Bonferroni-corrected within the cut.
    """
    #The linkage does not depend on the number of clusters, so it is computed once
    row_linkage = hierarchy.linkage(data, method='ward', metric='euclidean')
    for nclusters in range(2, 6):
        print('• nCluster: '+str(nclusters))
        ##Extract and merge cluster label
        cluster_labels = pd.Series(hierarchy.fcluster(row_linkage, nclusters, criterion='maxclust'),
                                   index=data.index, name='Cluster', dtype=str)
        cluster_labels = 'Cluster_'+cluster_labels
        merged = pd.merge(labels, cluster_labels, left_index=True, right_index=True)
        ##To check which cluster corresponds to sns.clustermap
        shown_ids = [tick.get_text() for tick in cm.ax_heatmap.get_yticklabels()]
        shown = merged.loc[shown_ids]
        print('  - For checking cluster label')
        for cluster in merged['Cluster'].unique().tolist():
            print('     - '+cluster+' includes', shown.loc[shown['Cluster']==cluster].index.tolist()[0:5])
        ##Summary
        table = pd.crosstab(merged['Misclassification'], merged['Cluster'], margins=True)
        print('  - Misclassification vs. Cluster')
        display(table)
        ##Fisher's exact test for each cluster
        table = table.drop(index=['All'])
        #Explicit float dtype: an empty Series without dtype is object in recent pandas
        pvals = pd.Series(index=table.drop(columns=['All']).columns, name='Pval', dtype=float)
        for cluster in pvals.index.tolist():
            #Prepare 2x2 matrix
            two_by_two = pd.DataFrame({cluster:table[cluster],
                                       'Others':table['All']-table[cluster]})
            if two_by_two.shape == (2, 2):
                #Fisher's exact test
                pvals[cluster] = stats.fisher_exact(two_by_two, alternative='two-sided')[1]
            else:
                #Not testable, e.g., when only one misclassification category is present
                pvals[cluster] = np.nan
        ##Multiple tests correction
        result = pd.DataFrame({'Pval':pvals,
                               'AdjPval':multi.multipletests(pvals, alpha=0.05, method='bonferroni',
                                                             is_sorted=False, returnsorted=False)[1]})
        print('  - Fisher\'s exact test (two-sided, Bonferroni correction): cluster vs. the other clusters')
        display(result)
        print('')


#All participants
print('All:')
cm = _draw_clustermap(zscoreDF_select, _row_color_frame(labelDF),
                      cbar_pos=(0.12, 0.01, 0.02, 0.1), figsize=(10, 30))
_test_cluster_enrichment(zscoreDF_select, labelDF, cm)
print('')

#BMI class
tempL = ['Normal', 'Overweight', 'Obese']
for bmi_class in tempL:
    print('BMI class: '+bmi_class)
    labelDF_temp = labelDF.loc[labelDF['BaseBMI_class']==bmi_class]
    tempDF = zscoreDF_select.loc[zscoreDF_select.index.isin(labelDF_temp.index)]
    cm = _draw_clustermap(tempDF, _row_color_frame(labelDF_temp),
                          cbar_pos=(0.12, 0.02, 0.02, 0.2), figsize=(10, 15))
    _test_cluster_enrichment(tempDF, labelDF_temp, cm)
    print('')

In [None]:
#Plot for paper
#One clustermap per measured BMI class (Normal and Obese), each saved as an LZW-compressed TIFF
tempL = ['Normal', 'Obese']
for bmi_class in tempL:
    print('BMI class: '+bmi_class)
    #Restrict the labels and the Z-scores to the participants of this BMI class
    labelDF_temp = labelDF.loc[labelDF['BaseBMI_class']==bmi_class]
    tempDF = zscoreDF_select.loc[zscoreDF_select.index.isin(labelDF_temp.index)]
    #Visualization
    ##Row colour bars: measured BMI class, MetBMI class, and matched/mismatched flag
    tempD1 = {'Underweight':'tab:blue', 'Normal':'tab:green', 'Overweight':'tab:orange', 'Obese':'tab:red'}
    tempD2 = {'Matched':'white', 'Mismatched':'black'}
    tempDF1 = pd.DataFrame({'BMI class':labelDF_temp['BaseBMI_class'].map(tempD1),
                            'MetBMI class':labelDF_temp['BaseMetBMI_class'].map(tempD1),
                            'Misclassification':labelDF_temp['Misclassification'].map(tempD2)})
    t_start = time.time()
    sns.set(style='ticks', font='Arial', context='talk')
    ##Ward linkage on Euclidean distance for both rows and columns; colour scale clipped at ±2.58
    cm = sns.clustermap(tempDF, method='ward', metric='euclidean',
                        row_cluster=True, col_cluster=True, row_linkage=None, col_linkage=None,
                        row_colors=tempDF1, col_colors=var_colorDF, xticklabels=True, yticklabels=False,
                        dendrogram_ratio=(0.2, 0.1), colors_ratio=(0.045, 0.03),
                        cbar_pos=(0.85, 0.43, 0.3, 0.02), cbar_kws={"orientation": "horizontal"},
                        figsize=(8, 13.45), **{'center':0, 'vmin':-2.58, 'vmax':2.58})
    cm.cax.set_title(r'$Z$'+'-score', size='medium', verticalalignment='bottom')
    cm.cax.tick_params(labelsize='small')
    bottom, top = cm.ax_heatmap.get_ylim()
    cm.ax_heatmap.set_ylim(bottom + 0.5, top - 0.5)##To avoid half cut of first and last rows
    ##Layout adjustment: only the column dendrogram is changed (height halved)
    hm = cm.ax_heatmap.get_position()
    rd = cm.ax_row_dendrogram.get_position()
    cd = cm.ax_col_dendrogram.get_position()
    cm.ax_heatmap.set_position([hm.x0, hm.y0, hm.width, hm.height])
    cm.ax_row_dendrogram.set_position([rd.x0+rd.width*0, rd.y0, rd.width, rd.height])
    cm.ax_col_dendrogram.set_position([cd.x0, cd.y0, cd.width, cd.height*0.5])
    cm.ax_heatmap.set_xlabel('')
    #cm.ax_heatmap.set_ylabel('Participant ('+r'$n$ = '+f'{len(tempDF):,}'+')')
    cm.ax_heatmap.set_ylabel('')
    #cm.ax_col_colors.yaxis.set_ticks_position('left')
    ##row/column color bar legend (axis is same with cm.cax!)
    ##The three legends share one anchor and are stacked vertically by increasing borderaxespad;
    ##the first two are re-added as artists because each plt.legend call replaces the previous legend
    row_legend1 = mpatches.Patch(color='tab:green', label='Normal')
    row_legend2 = mpatches.Patch(color='tab:orange', label='Overweight')
    row_legend3 = mpatches.Patch(color='tab:red', label='Obese')
    legend1 = plt.legend(handles=[row_legend1, row_legend2, row_legend3], fontsize='medium',
                         title='BMI/MetBMI class', title_fontsize='medium',
                         bbox_to_anchor=(0.5, 0), loc='upper center', borderaxespad=3.5, frameon=False)
    plt.gca().add_artist(legend1)
    row_legend = mpatches.Patch(color='black', label='Mismatched')
    legend2 = plt.legend(handles=[row_legend], fontsize='medium',
                         title='Misclassification', title_fontsize='medium',
                         bbox_to_anchor=(0.5, 0), loc='upper center', borderaxespad=9.5, frameon=False)
    plt.gca().add_artist(legend2)
    col_legend1 = mpatches.Patch(color='tab:red', label='Positive')
    col_legend2 = mpatches.Patch(color='tab:blue', label='Negative')
    plt.legend(handles=[col_legend1, col_legend2], fontsize='medium',
               title='Contribution in LASSO', title_fontsize='medium',
               bbox_to_anchor=(0.5, 0), loc='upper center', borderaxespad=15, frameon=False)
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
    fileName = 'clusters-15metabolites-'+bmi_class+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    ##Elapsed time covers drawing, saving and showing the figure
    t_elapsed = time.time() - t_start
    print('• Elapsed time:', round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')
    print('')

## 3. Proteomics space

### 3-1. Standardization

In [None]:
#Load the preprocessed (RF-imputed) baseline proteomics table, indexed by participant ID
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_RF-imputation_'
fileName = 'baseline-protDF-with-RF-imputation.tsv'
protDF = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
          .set_index('public_client_id'))

#Keep only the omics columns: remove BMI and the covariates
covariateL = ['log_BaseBMI', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Race']
protDF = protDF.drop(columns=covariateL)

#Z-score transformation (column direction), keeping the original index and column labels
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
zscoreDF = pd.DataFrame(data=scaler.fit_transform(protDF),
                        index=protDF.index, columns=protDF.columns)

#Quick sanity check of the standardized values
display(zscoreDF.describe(include='all'))

### 3-2. Misclassification label

In [None]:
#Misclassification label dataframe
##Restricted to participants whose ProtBMI class was calculated.
##-> .copy() makes labelDF an independent table; otherwise adding the 'Misclassification'
##   column below would be a write into a slice of bmiDF (SettingWithCopy).
labelDF = bmiDF.loc[bmiDF['BaseProtBMI_class']!='NotCalculated',
                    ['BaseBMI_class', 'BaseProtBMI_class']].copy()
print('Sample size of Z-score DF:', len(zscoreDF))
print('Sample size of label DF:', len(labelDF))

#Misclassification
##'Matched' when the measured BMI class equals the ProtBMI class, otherwise 'Mismatched'
##(vectorized comparison instead of a per-row .loc lookup)
is_matched = labelDF['BaseBMI_class'] == labelDF['BaseProtBMI_class']
labelDF['Misclassification'] = np.where(is_matched, 'Matched', 'Mismatched')

print('\nSummary')
for bmi_class in ['Underweight', 'Normal', 'Overweight', 'Obese']:
    print(' • BMI class: '+bmi_class)
    classDF = labelDF.loc[labelDF['BaseBMI_class']==bmi_class]
    nTotal = len(classDF)
    print('    - Sample size:', nTotal)
    nMismatched = int((classDF['Misclassification']=='Mismatched').sum())
    ##Report NaN instead of raising ZeroDivisionError when the class has no participant
    pctMismatched = nMismatched/nTotal*100 if nTotal > 0 else float('nan')
    print('    - Mismatched:', nMismatched, '(', pctMismatched, '%)')
    ##Distribution of the ProtBMI class within this measured BMI class
    countS = classDF['BaseProtBMI_class'].value_counts()
    summaryDF = pd.DataFrame({'Count': countS, 'Percentage':countS/nTotal*100})
    print('    - Detail of ProtBMI class:')
    display(summaryDF)
    print('')

### 3-3. Color label for variables

In [None]:
#Column color label: sign of each variable's LASSO beta-coefficient in the ProtBMI model
fileDir = './ExportData/'
ipynbName = '220802_Multiomics-BMI-NatMed1stRevision_BMI-LASSO-bcoef_'
fileName = 'ProtBMI-BothSex-OLS.tsv'
bcoefDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('Variable')

##Label every row; the 'ProtBMI model' row is not a variable and gets a placeholder
contributionL = []
for var_name, bcoef in bcoefDF['LASSObcoef'].items():
    if var_name == 'ProtBMI model':
        contribution = 'NaN'
    elif bcoef > 0:
        contribution = 'Positive'
    elif bcoef < 0:
        contribution = 'Negative'
    else:
        ##Neither positive nor negative coefficient: flagged and left without a color below
        contribution = 'Error'
    contributionL.append(contribution)
bcoefDF['Contribution'] = contributionL

##Drop the placeholder row, then translate the labels to colors
bcoefDF = bcoefDF.loc[bcoefDF.index!='ProtBMI model']
contribution_colorD = {'Positive':'tab:red', 'Negative':'tab:blue'}
colorDF = pd.DataFrame({'Contribution\nin LASSO':bcoefDF['Contribution'].map(contribution_colorD)})
display(colorDF)

### 3-4. Hierarchical clustering: variables retained in ≥8 models and exhibiting ≥10% explained variance in BMI (top 15)

In [None]:
#Load the OLS regression summary of the LASSO-retained variables
fileDir = './ExportData/'
ipynbName = '220802_Multiomics-BMI-NatMed1stRevision_BMI-LASSO-bcoef_'
fileName = 'ProtBMI-BothSex-OLS.tsv'
olsDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('Variable')

#Keep variables with LASSOnZeros <= 2 and R2 >= 10 (the criteria named in the section title)
is_retained = (olsDF['LASSOnZeros']<=2) & (olsDF['R2']>=10)
olsDF = olsDF.loc[is_retained]
#olsDF = olsDF.iloc[0:15]
print('nrows:', len(olsDF))
print('Adjusted p-value < 0.05:', int((olsDF['AdjPval']<0.05).sum()))
display(olsDF)

#Restrict the Z-score table to the selected variables (column order of zscoreDF is kept)
is_selected = zscoreDF.columns.isin(olsDF.index)
zscoreDF_select = zscoreDF.loc[:, is_selected]
print('Confirm Z-score DF:', zscoreDF_select.shape)

#Restrict the column color label to the same variables
var_colorDF = colorDF.loc[colorDF.index.isin(zscoreDF_select.columns)]

In [None]:
def _plot_zscore_clustermap(dataDF, class_labelDF, col_colorDF, cbar_pos, figsize):
    """Draw the exploratory Z-score clustermap and return the seaborn ClusterGrid.

    dataDF: Z-score table (rows: participants, columns: variables) to be clustered.
    class_labelDF: table with 'BaseBMI_class', 'BaseProtBMI_class' and 'Misclassification'
                   columns, used for the row color bars.
    col_colorDF: column color table passed to sns.clustermap as col_colors.
    cbar_pos, figsize: passed through to sns.clustermap.
    The figure is shown and the elapsed time is printed.
    """
    class_colorD = {'Underweight':'tab:blue', 'Normal':'tab:green', 'Overweight':'tab:orange', 'Obese':'tab:red'}
    match_colorD = {'Matched':'white', 'Mismatched':'black'}
    row_colorDF = pd.DataFrame({'BMI class':class_labelDF['BaseBMI_class'].map(class_colorD),
                                'ProtBMI class':class_labelDF['BaseProtBMI_class'].map(class_colorD),
                                'Misclassification':class_labelDF['Misclassification'].map(match_colorD)})
    t_start = time.time()
    sns.set(style='ticks', font='Arial', context='notebook')
    cm = sns.clustermap(dataDF, method='ward', metric='euclidean',
                        row_cluster=True, col_cluster=True, row_linkage=None, col_linkage=None,
                        row_colors=row_colorDF, col_colors=col_colorDF,
                        cbar_pos=cbar_pos, cbar_kws={"orientation": "horizontal"},
                        figsize=figsize, **{'center':0, 'vmin':-2.58, 'vmax':2.58})
    cm.cax.set_title(r'$Z$'+'-score')
    ##Shrink the dendrograms: row dendrogram to half width, column dendrogram to quarter height
    hm = cm.ax_heatmap.get_position()
    rd = cm.ax_row_dendrogram.get_position()
    cd = cm.ax_col_dendrogram.get_position()
    cm.ax_heatmap.set_position([hm.x0, hm.y0, hm.width, hm.height])
    cm.ax_row_dendrogram.set_position([rd.x0+rd.width*0.5, rd.y0, rd.width*0.5, rd.height])
    cm.ax_col_dendrogram.set_position([cd.x0, cd.y0, cd.width, cd.height*0.25])
    plt.show()
    t_elapsed = time.time() - t_start
    print('• Elapsed time:', round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')
    return cm

def _test_cluster_misclassification(dataDF, class_labelDF, cm, ncluster_range=range(2, 6)):
    """Test the association between participant clusters and misclassification.

    For each number of clusters in ncluster_range, the row dendrogram (Ward linkage,
    Euclidean distance) is cut into that many clusters, and each cluster is compared
    against all the other clusters with Fisher's exact test (two-sided), followed by
    Bonferroni correction across the clusters. Results are printed/displayed.

    dataDF: Z-score table that was clustered (rows: participants).
    class_labelDF: table with a 'Misclassification' column, indexed like dataDF.
    cm: ClusterGrid of the same data; only its drawn y tick labels are used, to help
        matching the cluster labels to the plotted dendrogram.
    """
    ##Perform hierarchical clustering for rows: public_client_id
    ##-> The linkage does not depend on the number of clusters, hence computed only once
    row_linkage = hierarchy.linkage(dataDF, method='ward', metric='euclidean')
    ##Participants whose tick labels are drawn on the heatmap (in the plotted order)
    shown_idL = [tick.get_text() for tick in cm.ax_heatmap.get_yticklabels()]
    for nclusters in ncluster_range:
        print('• nCluster: '+str(nclusters))
        ##Extract and merge cluster label
        clusterS = pd.Series(hierarchy.fcluster(row_linkage, nclusters, criterion='maxclust'),
                             index=dataDF.index, name='Cluster', dtype=str)
        clusterS = 'Cluster_'+clusterS
        mergedDF = pd.merge(class_labelDF, clusterS, left_index=True, right_index=True)
        ##To check which cluster corresponds to sns.clustermap
        shownDF = mergedDF.loc[shown_idL]
        print('  - For checking cluster label')
        for cluster in mergedDF['Cluster'].unique().tolist():
            print('     - '+cluster+' includes', shownDF.loc[shownDF['Cluster']==cluster].index.tolist()[0:5])
        ##Summary
        countDF = pd.crosstab(mergedDF['Misclassification'], mergedDF['Cluster'], margins=True)
        print('  - Misclassification vs. Cluster')
        display(countDF)
        ##Fisher's exact test for each cluster
        countDF = countDF.drop(index=['All'])
        ##-> Explicit float dtype: p-values stay numeric regardless of the pandas default;
        ##   NaN is kept when a 2x2 matrix cannot be prepared
        pvalS = pd.Series(np.nan, index=countDF.drop(columns=['All']).columns, name='Pval', dtype=float)
        for cluster in pvalS.index.tolist():
            #Prepare 2x2 matrix
            tableDF = pd.DataFrame({cluster:countDF[cluster],
                                    'Others':countDF['All']-countDF[cluster]})
            if tableDF.shape == (2, 2):
                #Fisher's exact test
                pvalS[cluster] = stats.fisher_exact(tableDF, alternative='two-sided')[1]
        ##Multiple tests correction
        resultDF = pd.DataFrame({'Pval':pvalS,
                                 'AdjPval':multi.multipletests(pvalS, alpha=0.05, method='bonferroni',
                                                               is_sorted=False, returnsorted=False)[1]})
        print('  - Fisher\'s exact test (two-sided, Bonferroni correction): cluster vs. the other clusters')
        display(resultDF)
        print('')

print('All:')
#Visualization
cm = _plot_zscore_clustermap(zscoreDF_select, labelDF, var_colorDF,
                             cbar_pos=(0.12, 0.825, 0.15, 0.01), figsize=(10, 24))
#Statistical test
_test_cluster_misclassification(zscoreDF_select, labelDF, cm)
print('')

#BMI class
tempL = ['Normal', 'Overweight', 'Obese']
for bmi_class in tempL:
    print('BMI class: '+bmi_class)
    labelDF_temp = labelDF.loc[labelDF['BaseBMI_class']==bmi_class]
    tempDF = zscoreDF_select.loc[zscoreDF_select.index.isin(labelDF_temp.index)]
    #Visualization
    cm = _plot_zscore_clustermap(tempDF, labelDF_temp, var_colorDF,
                                 cbar_pos=(0.12, 0.825, 0.15, 0.02), figsize=(10, 12))
    #Statistical test
    _test_cluster_misclassification(tempDF, labelDF_temp, cm)
    print('')

In [None]:
#Plot for paper
##Color codes for the row color bars
class_colorD = {'Underweight':'tab:blue', 'Normal':'tab:green', 'Overweight':'tab:orange', 'Obese':'tab:red'}
match_colorD = {'Matched':'white', 'Mismatched':'black'}
##Legends stacked below the color bar: (title, borderaxespad, [(color, label), ...])
legend_specL = [
    ('BMI/ProtBMI class', 3.5, [('tab:green', 'Normal'), ('tab:orange', 'Overweight'), ('tab:red', 'Obese')]),
    ('Misclassification', 9.5, [('black', 'Mismatched')]),
    ('Contribution in LASSO', 15, [('tab:red', 'Positive'), ('tab:blue', 'Negative')]),
]
for bmi_class in ['Normal', 'Obese']:
    print('BMI class: '+bmi_class)
    ##Participants of this measured BMI class and their Z-scores
    labelDF_temp = labelDF.loc[labelDF['BaseBMI_class']==bmi_class]
    plotDF = zscoreDF_select.loc[zscoreDF_select.index.isin(labelDF_temp.index)]
    #Visualization
    row_colorDF = pd.DataFrame({'BMI class':labelDF_temp['BaseBMI_class'].map(class_colorD),
                                'ProtBMI class':labelDF_temp['BaseProtBMI_class'].map(class_colorD),
                                'Misclassification':labelDF_temp['Misclassification'].map(match_colorD)})
    t_start = time.time()
    sns.set(style='ticks', font='Arial', context='talk')
    cm = sns.clustermap(plotDF, method='ward', metric='euclidean',
                        row_cluster=True, col_cluster=True, row_linkage=None, col_linkage=None,
                        row_colors=row_colorDF, col_colors=var_colorDF, xticklabels=True, yticklabels=False,
                        dendrogram_ratio=(0.2, 0.1), colors_ratio=(0.045, 0.03),
                        cbar_pos=(0.85, 0.575, 0.3, 0.02), cbar_kws={"orientation": "horizontal"},
                        figsize=(8, 10), center=0, vmin=-2.58, vmax=2.58)
    cm.cax.set_title(r'$Z$'+'-score', size='medium', verticalalignment='bottom')
    cm.cax.tick_params(labelsize='small')
    ##To avoid half cut of first and last rows
    y_bottom, y_top = cm.ax_heatmap.get_ylim()
    cm.ax_heatmap.set_ylim(y_bottom + 0.5, y_top - 0.5)
    ##Re-position the panels: row dendrogram unchanged, column dendrogram at half height
    hm_box = cm.ax_heatmap.get_position()
    rd_box = cm.ax_row_dendrogram.get_position()
    cd_box = cm.ax_col_dendrogram.get_position()
    cm.ax_heatmap.set_position([hm_box.x0, hm_box.y0, hm_box.width, hm_box.height])
    cm.ax_row_dendrogram.set_position([rd_box.x0, rd_box.y0, rd_box.width, rd_box.height])
    cm.ax_col_dendrogram.set_position([cd_box.x0, cd_box.y0, cd_box.width, cd_box.height*0.5])
    ##No axis titles on the heatmap
    cm.ax_heatmap.set_xlabel('')
    cm.ax_heatmap.set_ylabel('')
    ##row/column color bar legend (axis is same with cm.cax!)
    ##-> plt.legend keeps only the latest legend on an axis, so every legend except
    ##   the last one is re-attached as an extra artist
    for spec_i, (legend_title, legend_pad, entryL) in enumerate(legend_specL):
        handleL = [mpatches.Patch(color=patch_color, label=patch_label)
                   for patch_color, patch_label in entryL]
        legend = plt.legend(handles=handleL, fontsize='medium',
                            title=legend_title, title_fontsize='medium',
                            bbox_to_anchor=(0.5, 0), loc='upper center',
                            borderaxespad=legend_pad, frameon=False)
        if spec_i < len(legend_specL) - 1:
            plt.gca().add_artist(legend)
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
    fileName = 'clusters-15proteins-'+bmi_class+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    t_elapsed = time.time() - t_start
    print('• Elapsed time:', round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')
    print('')

## — End of this notebook —