# Multiomics BMI Paper — Comparison of Biological BMI Models between LASSO and the Other Methods

***by Kengo Watanabe***  

This Jupyter Notebook (with Python 3 kernel) compared the blood omics-based BMI models (model performance, BMI predictions, and predictor variables) between LASSO and the other methods (elastic net, ridge, and random forest).  

Input files:  
* Arivale baseline BMI predictions with LASSO models: 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-\[Female/Male/BothSex\].tsv  
* Arivale baseline BMI predictions with elastic net models: 220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-ElasticNet_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-\[Female/Male/BothSex\].tsv  
* Arivale baseline BMI predictions with ridge models: 220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-Ridge_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-\[Female/Male/BothSex\].tsv  
* Arivale baseline BMI predictions with random forest models: 220828_Multiomics-BMI-NatMed1stRevision_BMI-baseline-RF-wenceslaus_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-\[Female/Male/BothSex\].tsv  
* LASSO models of biological BMI: 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-BothSex-LASSObcoefs.tsv  
* Elastic net models of biological BMI: 220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-ElasticNet_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-BothSex-ElasticNetbcoefs.tsv  
* Ridge models of biological BMI: 220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-Ridge_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-BothSex-Ridgebcoefs.tsv  
* Random forest models of biological BMI: 220828_Multiomics-BMI-NatMed1stRevision_BMI-baseline-RF-wenceslaus_\[MetBMI/ProtBMI/ChemBMI/CombiBMI\]-BothSex-feature-importance.tsv  

Output figures and tables:  
* Supplementary Figure 4  
* Tables for Supplementary Data 10  

Original notebook (memo for my future tracing):  
* dalek:\[JupyterLab HOME\]/220621_Multiomics-BMI-NatMedRevision/220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others.ipynb  

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#For Arial font
#!conda install -c conda-forge -y mscorefonts
##-> The below was also needed in matplotlib 3.4.2
#import shutil
#import matplotlib
#shutil.rmtree(matplotlib.get_cachedir())
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import time

from sklearn.metrics import r2_score
from statsmodels.stats import weightstats
from statsmodels.stats import multitest as multi
from decimal import Decimal, ROUND_HALF_UP
import sys
#!pip install venn
from venn import venn
import matplotlib.patches as mpatches

!conda list

# packages in environment at /opt/conda/envs/arivale-py3:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
analytics                 0.1                      pypi_0    pypi
argon2-cffi               21.1.0           py39h3811e60_0    conda-forge
arivale-data-interface    0.1.0                    pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
atk-1.0                   2.36.0               h3371d22_4    conda-forge
attrs                     21.2.0             pyhd8ed1ab_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports                 1.0                        py_2    conda-forge
backports.functools_lru_cache 1.6.4              pyhd8ed1ab_0    conda-forge
biopython                 1.79             py39h3811e60_0    conda-forge
bleach 

## 1. Prepare predictions and reproduce out-of-sample R2

### 1-1. LASSO

In [None]:
#LASSO: gather the baseline BMI predictions per sex and recompute the out-of-sample R2
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
yvar = 'BMI'
model_names = [prefix+yvar for prefix in ('Met', 'Prot', 'Chem', 'Combi')]
shared_cols = ['Testing', 'log_Base'+yvar, 'Base'+yvar]
predictions_by_sex = {}
r2_by_sex = {}
for sex in ['Female', 'Male', 'BothSex']:
    #Collect all measured and predicted values into a single dataframe
    merged_df = None
    for model_name in model_names:
        #Import the predictions dataframe of this model
        file_path = fileDir + ipynbName + model_name + '-' + sex + '.tsv'
        model_df = pd.read_csv(file_path, sep='\t', dtype={'public_client_id': str})
        model_df = model_df.set_index('public_client_id')
        #The first file initializes the part that is common to all files
        if merged_df is None:
            merged_df = model_df[shared_cols]
        #Append the model-specific columns (participants are matched on the index)
        merged_df = pd.merge(merged_df, model_df.drop(columns=shared_cols),
                             left_index=True, right_index=True, how='inner')
    ##Drop 'Base' from the column names
    merged_df.columns = merged_df.columns.str.replace('Base', '')

    #Reproduce out-of-sample R2 (one value per label in the 'Testing' column)
    fold_labels = merged_df['Testing'].unique().tolist()
    r2_df = pd.DataFrame(index=pd.Index(fold_labels, name='Model'))
    fold_masks = [merged_df['Testing']==fold_label for fold_label in fold_labels]
    for model_name in model_names:
        r2_df[model_name] = [r2_score(merged_df.loc[fold_mask, 'log_'+yvar],
                                      merged_df.loc[fold_mask, 'log_'+model_name],
                                      sample_weight=None)
                             for fold_mask in fold_masks]

    predictions_by_sex[sex] = merged_df
    r2_by_sex[sex] = r2_df

    #Check
    print(sex)
    ##Print the unrounded statistics (R2 values are rounded in the summary tables displayed below)
    for model_name in model_names:
        r2_values = r2_df[model_name].tolist()
        print(' - '+model_name)
        r2_mean = np.mean(r2_values)
        r2_sd = np.std(r2_values, ddof=1)#Sample standard deviation
        print('   - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
        print('   - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sd/np.sqrt(len(r2_values)))
        print('   - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df['log_'+yvar], merged_df['log_'+model_name]))
        print('   - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df[yvar], merged_df[model_name]))
    print(' - Summary of out-of-sample R2 values')
    display(r2_df.describe(include='all'))
    print(' - Summary of all measured and predicted values')
    display(merged_df.describe(include='all'))
    print('')

#Keep the per-sex results under method-specific names for the downstream comparisons
predictDF_F_LASSO, predictDF_M_LASSO, predictDF_B_LASSO = (
    predictions_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))
r2DF_F_LASSO, r2DF_M_LASSO, r2DF_B_LASSO = (
    r2_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))

> –> Successfully reproduced, up to 15 decimal places. (The slight difference in the last decimal place was probably due to floating-point rounding.)  

### 1-2. Elastic net

In [None]:
#Elastic net: gather the baseline BMI predictions per sex and recompute the out-of-sample R2
fileDir = './ExportData/'
ipynbName = '220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-ElasticNet_'
yvar = 'BMI'
model_names = [prefix+yvar for prefix in ('Met', 'Prot', 'Chem', 'Combi')]
shared_cols = ['Testing', 'log_Base'+yvar, 'Base'+yvar]
predictions_by_sex = {}
r2_by_sex = {}
for sex in ['Female', 'Male', 'BothSex']:
    #Collect all measured and predicted values into a single dataframe
    merged_df = None
    for model_name in model_names:
        #Import the predictions dataframe of this model
        file_path = fileDir + ipynbName + model_name + '-' + sex + '.tsv'
        model_df = pd.read_csv(file_path, sep='\t', dtype={'public_client_id': str})
        model_df = model_df.set_index('public_client_id')
        #The first file initializes the part that is common to all files
        if merged_df is None:
            merged_df = model_df[shared_cols]
        #Append the model-specific columns (participants are matched on the index)
        merged_df = pd.merge(merged_df, model_df.drop(columns=shared_cols),
                             left_index=True, right_index=True, how='inner')
    ##Drop 'Base' from the column names
    merged_df.columns = merged_df.columns.str.replace('Base', '')

    #Reproduce out-of-sample R2 (one value per label in the 'Testing' column)
    fold_labels = merged_df['Testing'].unique().tolist()
    r2_df = pd.DataFrame(index=pd.Index(fold_labels, name='Model'))
    fold_masks = [merged_df['Testing']==fold_label for fold_label in fold_labels]
    for model_name in model_names:
        r2_df[model_name] = [r2_score(merged_df.loc[fold_mask, 'log_'+yvar],
                                      merged_df.loc[fold_mask, 'log_'+model_name],
                                      sample_weight=None)
                             for fold_mask in fold_masks]

    predictions_by_sex[sex] = merged_df
    r2_by_sex[sex] = r2_df

    #Check
    print(sex)
    ##Print the unrounded statistics (R2 values are rounded in the summary tables displayed below)
    for model_name in model_names:
        r2_values = r2_df[model_name].tolist()
        print(' - '+model_name)
        r2_mean = np.mean(r2_values)
        r2_sd = np.std(r2_values, ddof=1)#Sample standard deviation
        print('   - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
        print('   - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sd/np.sqrt(len(r2_values)))
        print('   - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df['log_'+yvar], merged_df['log_'+model_name]))
        print('   - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df[yvar], merged_df[model_name]))
    print(' - Summary of out-of-sample R2 values')
    display(r2_df.describe(include='all'))
    print(' - Summary of all measured and predicted values')
    display(merged_df.describe(include='all'))
    print('')

#Keep the per-sex results under method-specific names for the downstream comparisons
predictDF_F_EN, predictDF_M_EN, predictDF_B_EN = (
    predictions_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))
r2DF_F_EN, r2DF_M_EN, r2DF_B_EN = (
    r2_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))

> –> Successfully reproduced, up to 15 decimal places. (The slight difference in the last decimal place was probably due to floating-point rounding.)  

### 1-3. Ridge

In [None]:
#Ridge: gather the baseline BMI predictions per sex and recompute the out-of-sample R2
fileDir = './ExportData/'
ipynbName = '220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-Ridge_'
yvar = 'BMI'
model_names = [prefix+yvar for prefix in ('Met', 'Prot', 'Chem', 'Combi')]
shared_cols = ['Testing', 'log_Base'+yvar, 'Base'+yvar]
predictions_by_sex = {}
r2_by_sex = {}
for sex in ['Female', 'Male', 'BothSex']:
    #Collect all measured and predicted values into a single dataframe
    merged_df = None
    for model_name in model_names:
        #Import the predictions dataframe of this model
        file_path = fileDir + ipynbName + model_name + '-' + sex + '.tsv'
        model_df = pd.read_csv(file_path, sep='\t', dtype={'public_client_id': str})
        model_df = model_df.set_index('public_client_id')
        #The first file initializes the part that is common to all files
        if merged_df is None:
            merged_df = model_df[shared_cols]
        #Append the model-specific columns (participants are matched on the index)
        merged_df = pd.merge(merged_df, model_df.drop(columns=shared_cols),
                             left_index=True, right_index=True, how='inner')
    ##Drop 'Base' from the column names
    merged_df.columns = merged_df.columns.str.replace('Base', '')

    #Reproduce out-of-sample R2 (one value per label in the 'Testing' column)
    fold_labels = merged_df['Testing'].unique().tolist()
    r2_df = pd.DataFrame(index=pd.Index(fold_labels, name='Model'))
    fold_masks = [merged_df['Testing']==fold_label for fold_label in fold_labels]
    for model_name in model_names:
        r2_df[model_name] = [r2_score(merged_df.loc[fold_mask, 'log_'+yvar],
                                      merged_df.loc[fold_mask, 'log_'+model_name],
                                      sample_weight=None)
                             for fold_mask in fold_masks]

    predictions_by_sex[sex] = merged_df
    r2_by_sex[sex] = r2_df

    #Check
    print(sex)
    ##Print the unrounded statistics (R2 values are rounded in the summary tables displayed below)
    for model_name in model_names:
        r2_values = r2_df[model_name].tolist()
        print(' - '+model_name)
        r2_mean = np.mean(r2_values)
        r2_sd = np.std(r2_values, ddof=1)#Sample standard deviation
        print('   - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
        print('   - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sd/np.sqrt(len(r2_values)))
        print('   - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df['log_'+yvar], merged_df['log_'+model_name]))
        print('   - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df[yvar], merged_df[model_name]))
    print(' - Summary of out-of-sample R2 values')
    display(r2_df.describe(include='all'))
    print(' - Summary of all measured and predicted values')
    display(merged_df.describe(include='all'))
    print('')

#Keep the per-sex results under method-specific names for the downstream comparisons
predictDF_F_ridge, predictDF_M_ridge, predictDF_B_ridge = (
    predictions_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))
r2DF_F_ridge, r2DF_M_ridge, r2DF_B_ridge = (
    r2_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))

> –> Successfully reproduced, up to 15 decimal places. (The slight difference in the last decimal place was probably due to floating-point rounding.)  

### 1-4. Random forest

In [None]:
#Random forest: gather the baseline BMI predictions per sex and recompute the out-of-sample R2
fileDir = './ImportData/'
ipynbName = '220828_Multiomics-BMI-NatMed1stRevision_BMI-baseline-RF-wenceslaus_'
yvar = 'BMI'
model_names = [prefix+yvar for prefix in ('Met', 'Prot', 'Chem', 'Combi')]
shared_cols = ['Testing', 'log_Base'+yvar, 'Base'+yvar]
predictions_by_sex = {}
r2_by_sex = {}
for sex in ['Female', 'Male', 'BothSex']:
    #Collect all measured and predicted values into a single dataframe
    merged_df = None
    for model_name in model_names:
        #Import the predictions dataframe of this model
        file_path = fileDir + ipynbName + model_name + '-' + sex + '.tsv'
        model_df = pd.read_csv(file_path, sep='\t', dtype={'public_client_id': str})
        model_df = model_df.set_index('public_client_id')
        #The first file initializes the part that is common to all files
        if merged_df is None:
            merged_df = model_df[shared_cols]
        #Append the model-specific columns (participants are matched on the index)
        merged_df = pd.merge(merged_df, model_df.drop(columns=shared_cols),
                             left_index=True, right_index=True, how='inner')
    ##Drop 'Base' from the column names
    merged_df.columns = merged_df.columns.str.replace('Base', '')

    #Reproduce out-of-sample R2 (one value per label in the 'Testing' column)
    fold_labels = merged_df['Testing'].unique().tolist()
    r2_df = pd.DataFrame(index=pd.Index(fold_labels, name='Model'))
    fold_masks = [merged_df['Testing']==fold_label for fold_label in fold_labels]
    for model_name in model_names:
        r2_df[model_name] = [r2_score(merged_df.loc[fold_mask, 'log_'+yvar],
                                      merged_df.loc[fold_mask, 'log_'+model_name],
                                      sample_weight=None)
                             for fold_mask in fold_masks]

    predictions_by_sex[sex] = merged_df
    r2_by_sex[sex] = r2_df

    #Check
    print(sex)
    ##Print the unrounded statistics (R2 values are rounded in the summary tables displayed below)
    for model_name in model_names:
        r2_values = r2_df[model_name].tolist()
        print(' - '+model_name)
        r2_mean = np.mean(r2_values)
        r2_sd = np.std(r2_values, ddof=1)#Sample standard deviation
        print('   - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
        print('   - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sd/np.sqrt(len(r2_values)))
        print('   - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df['log_'+yvar], merged_df['log_'+model_name]))
        print('   - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(merged_df[yvar], merged_df[model_name]))
    print(' - Summary of out-of-sample R2 values')
    display(r2_df.describe(include='all'))
    print(' - Summary of all measured and predicted values')
    display(merged_df.describe(include='all'))
    print('')

#Keep the per-sex results under method-specific names for the downstream comparisons
predictDF_F_RF, predictDF_M_RF, predictDF_B_RF = (
    predictions_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))
r2DF_F_RF, r2DF_M_RF, r2DF_B_RF = (
    r2_by_sex[sex_key] for sex_key in ('Female', 'Male', 'BothSex'))

> –> Successfully reproduced, up to 15 decimal places. (The slight difference in the last decimal place was probably due to floating-point rounding.)  

## 2. Out-of-sample R2

In [None]:
#Compare the out-of-sample R2 of the BothSex models between LASSO (control) and the other methods:
#  1) summary table, 2) Welch's t-tests with Benjamini–Hochberg adjustment (saved as TSV),
#  3) bar plots with P-value annotations (saved as TIFF)
#Method name -> dataframe of out-of-sample R2 (index: 'Model', columns: MetBMI/ProtBMI/ChemBMI/CombiBMI)
tempD1 = {'LASSO':r2DF_B_LASSO, 'Elastic net':r2DF_B_EN,
          'Ridge':r2DF_B_ridge, 'Random forest':r2DF_B_RF}
#Method name -> short label used as the column-name prefix (the first entry is the control)
tempD2 = {'LASSO':'LASSO', 'Elastic net':'EN',
          'Ridge':'Ridge', 'Random forest':'RF'}
#Model name -> omics category shown as the facet title
tempD3 = {'MetBMI':'Metabolomics', 'ProtBMI':'Proteomics',
          'ChemBMI':'Clinical labs', 'CombiBMI':'Combined omics'}
#Omics category -> bar color
tempD4 = {'Metabolomics':'b', 'Proteomics':'r',
          'Clinical labs':'g', 'Combined omics':'m'}

#Prepare DF (columns are renamed to '<method label>:<model name>')
tempD = {}
for method in tempD1.keys():
    tempDF = tempD1[method].copy()
    method_label = tempD2[method]
    tempDF.columns = method_label+':'+tempDF.columns
    tempD[method] = tempDF
tempDF1 = pd.concat(list(tempD.values()), axis=1)
tempL = [method_label+':'+yvar_model for yvar_model in tempD3.keys() for method_label in tempD2.values()]
tempDF1 = tempDF1[tempL]#Sort just for easily checking the summary
display(tempDF1.describe())

#Statistical tests
##The control and the contrasts are the same for every omics category
control = list(tempD2.values())[0]
tempL = list(tempD2.values())[1:]
tempD = {}
for yvar_model in tempD3.keys():
    ##NOTE(review): 'contrast_N' is lower-case unlike 'Control_N'; kept as is because it is written to the exported TSV
    tempDF = pd.DataFrame(columns=['Control', 'Contrast', 'Control_N', 'contrast_N', 'DoF', 'tStat', 'Pval'])
    for contrast in tempL:
        tempS1 = tempDF1[control+':'+yvar_model]
        tempS2 = tempDF1[contrast+':'+yvar_model]
        #Two-sided Welch's t-test
        tstat, pval, dof = weightstats.ttest_ind(tempS1, tempS2,
                                                 alternative='two-sided', usevar='unequal')
        size1 = len(tempS1)
        size2 = len(tempS2)
        tempDF.loc[contrast+'-vs-'+control] = [control, contrast, size1, size2, dof, tstat, pval]
    ##P-value adjustment (within omics) by using Benjamini–Hochberg method
    tempDF['AdjPval_omics'] = multi.multipletests(tempDF['Pval'], alpha=0.05, method='fdr_bh',
                                                  is_sorted=False, returnsorted=False)[1]
    tempDF['Category'] = tempD3[yvar_model]
    tempD[yvar_model] = tempDF
tempDF2 = pd.concat(list(tempD.values()), axis=0)
##P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF2['AdjPval_all'] = multi.multipletests(tempDF2['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
tempDF2.index.rename('ComparisonLabel', inplace=True)
tempDF2 = tempDF2.reset_index().set_index(['Category', 'ComparisonLabel'])
display(tempDF2)
##Save
fileDir = './ExportData/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'R2-comparison-BothSex.tsv'
tempDF2.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Prepare a dictionary for inverse mapping (short label -> method name)
tempD = {value: key for key, value in tempD2.items()}

#Visualization
axis_ymin = 0.0
axis_ymax = 1.15
ymin = 0.0
ymax = 0.8
yinter = 0.2
aline_ymin = 0.825#Height of the lowest annotation line
aline_yinter = 0.1#Vertical spacing between the annotation lines
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD3),
                         figsize=(1+2*len(tempD3), 3), sharex=True, sharey=True)
for ax_i, ax in enumerate(axes.flat):
    yvar_model = list(tempD3.keys())[ax_i]
    category = tempD3[yvar_model]
    #Prepare DF while cleaning label text
    tempDF = tempDF1.loc[:, tempDF1.columns.str.contains(yvar_model)]
    tempDF.columns = tempDF.columns.str.replace(':'+yvar_model, '')
    tempDF = tempDF.melt(var_name='Method', value_name='R2', value_vars=tempDF.columns.tolist())
    tempDF['Method'] = tempDF['Method'].map(tempD)
    #Plot (bars with error bars, overlaid with the individual R2 values)
    sns.barplot(data=tempDF, y='R2', x='Method', order=tempD2.keys(),
                color=tempD4[category], dodge=False,
                ci=95, capsize=0.4, errwidth=1.5, errcolor='black', edgecolor='black', ax=ax)
    sns.stripplot(data=tempDF, y='R2', x='Method', order=tempD2.keys(),
                  dodge=False, jitter=0.3,
                  size=5, edgecolor='black', color='gray', linewidth=1, alpha=0.4, ax=ax)
    #P-value annotation (one bracket per comparison, stacked upward)
    tempDF = tempDF2.loc[category]#MultiIndex
    for row_i in range(len(tempDF)):
        #Control
        group_0 = tempDF['Control'].iloc[row_i]
        xcoord_0 = list(tempD2.keys()).index(tempD[group_0])
        #Contrast
        group_1 = tempDF['Contrast'].iloc[row_i]
        xcoord_1 = list(tempD2.keys()).index(tempD[group_1])
        #Standard point of marker
        xcoord = (xcoord_0+xcoord_1)/2
        ycoord = aline_ymin + aline_yinter*row_i
        #Add annotation lines
        aline_offset = yinter/10
        aline_length = yinter/10 + aline_offset
        ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                lw=1.5, c='k')
        #Retrieve P-value
        pval = tempDF['AdjPval_all'].iloc[row_i]
        if pval<0.001:
            label = '***'
        elif pval<0.01:
            label = '**'
        elif pval<0.05:
            label = '*'
        else:
            #Build the Decimal from str(pval): Decimal(float) takes the exact binary expansion
            #(e.g., 0.145 -> 0.14499...), which would defeat ROUND_HALF_UP
            pval_text = str(Decimal(str(pval)).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
            label = r'$P$'+' = '+pval_text
        #Add annotation text (asterisks sit closer to the bracket than the 'P = ...' text)
        if label in ['***', '**', '*']:
            text_offset = yinter/25
            text_size = 'medium'
        else:
            text_offset = yinter/5
            text_size = 'x-small'
        ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                    horizontalalignment='center', verticalalignment='bottom',
                    fontsize=text_size, color='k')
    #Facet label
    ax.set_title(category, {'fontsize':'medium'})
    #Axis setting
    plt.setp(ax.get_xticklabels(), rotation=70,
             horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
    if ax_i==0:
        plt.setp(ax, xlabel='', ylabel='Out-of-sample '+r'$R^2$')
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
sns.despine()
##The axis range extends above the top tick (ymax) to leave room for the annotations
plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'R2-comparison-BothSex.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

In [None]:
#Compare the out-of-sample R2 of the sex-stratified models between LASSO (control) and the other methods:
#  1) summary tables per sex, 2) Welch's t-tests with Benjamini–Hochberg adjustment (saved as TSV),
#  3) bar plots with P-value annotations (one TIFF per sex)
#Sex -> method name -> dataframe of out-of-sample R2 (index: 'Model', columns: MetBMI/ProtBMI/ChemBMI/CombiBMI)
tempD1 = {'LASSO':r2DF_F_LASSO, 'Elastic net':r2DF_F_EN,
          'Ridge':r2DF_F_ridge, 'Random forest':r2DF_F_RF}
tempD2 = {'LASSO':r2DF_M_LASSO, 'Elastic net':r2DF_M_EN,
          'Ridge':r2DF_M_ridge, 'Random forest':r2DF_M_RF}
tempD1 = {'Female':tempD1, 'Male':tempD2}
#Method name -> short label used as the column-name prefix (the first entry is the control)
tempD2 = {'LASSO':'LASSO', 'Elastic net':'EN',
          'Ridge':'Ridge', 'Random forest':'RF'}
#Model name -> omics category shown as the facet title
tempD3 = {'MetBMI':'Metabolomics', 'ProtBMI':'Proteomics',
          'ChemBMI':'Clinical labs', 'CombiBMI':'Combined omics'}
#Omics category -> bar color
tempD4 = {'Metabolomics':'b', 'Proteomics':'r',
          'Clinical labs':'g', 'Combined omics':'m'}

#Prepare DF (columns are renamed to '<method label>:<model name>'; rows are indexed by sex and model)
tempD5 = {}
for sex in tempD1.keys():
    tempD6 = tempD1[sex]
    tempD = {}
    for method in tempD6.keys():
        tempDF = tempD6[method].copy()
        method_label = tempD2[method]
        tempDF.columns = method_label+':'+tempDF.columns
        tempD[method] = tempDF
    tempDF1 = pd.concat(list(tempD.values()), axis=1)
    tempDF1['Sex'] = sex
    tempD5[sex] = tempDF1
tempDF1 = pd.concat(list(tempD5.values()), axis=0)
tempDF1 = tempDF1.reset_index().set_index(['Sex', 'Model'])
tempL = [method_label+':'+yvar_model for yvar_model in tempD3.keys() for method_label in tempD2.values()]
tempDF1 = tempDF1[tempL]#Sort just for easily checking the summary
for sex in tempD1.keys():#To display all summary in the Jupyter Notebook
    print(sex)
    display(tempDF1.loc[sex].describe())

#Statistical tests
##The control and the contrasts are the same for every sex and omics category
control = list(tempD2.values())[0]
tempL = list(tempD2.values())[1:]
tempD5 = {}
for sex in tempD1.keys():
    tempDF2 = tempDF1.loc[sex]#MultiIndex
    tempD = {}
    for yvar_model in tempD3.keys():
        ##NOTE(review): 'contrast_N' is lower-case unlike 'Control_N'; kept as is because it is written to the exported TSV
        tempDF = pd.DataFrame(columns=['Control', 'Contrast', 'Control_N', 'contrast_N', 'DoF', 'tStat', 'Pval'])
        for contrast in tempL:
            tempS1 = tempDF2[control+':'+yvar_model]
            tempS2 = tempDF2[contrast+':'+yvar_model]
            #Two-sided Welch's t-test
            tstat, pval, dof = weightstats.ttest_ind(tempS1, tempS2,
                                                     alternative='two-sided', usevar='unequal')
            size1 = len(tempS1)
            size2 = len(tempS2)
            tempDF.loc[contrast+'-vs-'+control] = [control, contrast, size1, size2, dof, tstat, pval]
        ##P-value adjustment (within sex and omics) by using Benjamini–Hochberg method
        tempDF['AdjPval_sex-omics'] = multi.multipletests(tempDF['Pval'], alpha=0.05, method='fdr_bh',
                                                          is_sorted=False, returnsorted=False)[1]
        tempDF['Category'] = tempD3[yvar_model]
        tempD[yvar_model] = tempDF
    tempDF = pd.concat(list(tempD.values()), axis=0)
    ##P-value adjustment (within sex) by using Benjamini–Hochberg method
    tempDF['AdjPval_sex'] = multi.multipletests(tempDF['Pval'], alpha=0.05, method='fdr_bh',
                                                is_sorted=False, returnsorted=False)[1]
    tempDF['Sex'] = sex
    tempD5[sex] = tempDF
tempDF2 = pd.concat(list(tempD5.values()), axis=0)
##P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF2['AdjPval_all'] = multi.multipletests(tempDF2['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
tempDF2.index.rename('ComparisonLabel', inplace=True)
tempDF2 = tempDF2.reset_index().set_index(['Sex', 'Category', 'ComparisonLabel'])
display(tempDF2)
##Save
fileDir = './ExportData/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'R2-comparison-FemaleMale.tsv'
tempDF2.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Prepare a dictionary for inverse mapping (short label -> method name)
tempD = {value: key for key, value in tempD2.items()}

#Visualization (per sex, for now)
for sex in tempD1.keys():
    axis_ymin = 0.0
    axis_ymax = 1.15
    ymin = 0.0
    ymax = 0.8
    yinter = 0.2
    aline_ymin = 0.825#Height of the lowest annotation line
    aline_yinter = 0.1#Vertical spacing between the annotation lines
    sns.set(style='ticks', font='Arial', context='talk')
    fig, axes = plt.subplots(nrows=1, ncols=len(tempD3),
                             figsize=(1+2*len(tempD3), 3), sharex=True, sharey=True)
    for ax_i, ax in enumerate(axes.flat):
        yvar_model = list(tempD3.keys())[ax_i]
        category = tempD3[yvar_model]
        #Prepare DF while cleaning label text
        tempDF = tempDF1.loc[sex]#MultiIndex
        tempDF = tempDF.loc[:, tempDF.columns.str.contains(yvar_model)]
        tempDF.columns = tempDF.columns.str.replace(':'+yvar_model, '')
        tempDF = tempDF.melt(var_name='Method', value_name='R2', value_vars=tempDF.columns.tolist())
        tempDF['Method'] = tempDF['Method'].map(tempD)
        #Plot (bars with error bars, overlaid with the individual R2 values)
        sns.barplot(data=tempDF, y='R2', x='Method', order=tempD2.keys(),
                    color=tempD4[category], dodge=False,
                    ci=95, capsize=0.4, errwidth=1.5, errcolor='black', edgecolor='black', ax=ax)
        sns.stripplot(data=tempDF, y='R2', x='Method', order=tempD2.keys(),
                      dodge=False, jitter=0.3,
                      size=5, edgecolor='black', color='gray', linewidth=1, alpha=0.4, ax=ax)
        #P-value annotation (one bracket per comparison, stacked upward)
        tempDF = tempDF2.loc[sex]#MultiIndex
        tempDF = tempDF.loc[category]#MultiIndex
        for row_i in range(len(tempDF)):
            #Control
            group_0 = tempDF['Control'].iloc[row_i]
            xcoord_0 = list(tempD2.keys()).index(tempD[group_0])
            #Contrast
            group_1 = tempDF['Contrast'].iloc[row_i]
            xcoord_1 = list(tempD2.keys()).index(tempD[group_1])
            #Standard point of marker
            xcoord = (xcoord_0+xcoord_1)/2
            ycoord = aline_ymin + aline_yinter*row_i
            #Add annotation lines
            aline_offset = yinter/10
            aline_length = yinter/10 + aline_offset
            ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                    [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                    lw=1.5, c='k')
            #Retrieve P-value
            pval = tempDF['AdjPval_all'].iloc[row_i]
            if pval<0.001:
                label = '***'
            elif pval<0.01:
                label = '**'
            elif pval<0.05:
                label = '*'
            else:
                #Build the Decimal from str(pval): Decimal(float) takes the exact binary expansion
                #(e.g., 0.145 -> 0.14499...), which would defeat ROUND_HALF_UP
                pval_text = str(Decimal(str(pval)).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
                label = r'$P$'+' = '+pval_text
            #Add annotation text (asterisks sit closer to the bracket than the 'P = ...' text)
            if label in ['***', '**', '*']:
                text_offset = yinter/25
                text_size = 'medium'
            else:
                text_offset = yinter/5
                text_size = 'x-small'
            ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                        horizontalalignment='center', verticalalignment='bottom',
                        fontsize=text_size, color='k')
        #Facet label
        ax.set_title(category, {'fontsize':'medium'})
        #Axis setting
        plt.setp(ax.get_xticklabels(), rotation=70,
                 horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
        if ax_i==0:
            plt.setp(ax, xlabel='', ylabel='Out-of-sample '+r'$R^2$')
        else:
            plt.setp(ax, xlabel='', ylabel='')
            plt.setp(ax.get_yticklabels(), visible=False)
    sns.despine()
    ##The axis range extends above the top tick (ymax) to leave room for the annotations
    plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
    fig.suptitle(sex, size='medium',
                 verticalalignment='bottom', horizontalalignment='center', y=0.965)
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
    fileName = 'R2-comparison-'+sex+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()

## 3. Predictions

In [None]:
tempD1 = {'LASSO':predictDF_B_LASSO, 'Elastic net':predictDF_B_EN,
          'Ridge':predictDF_B_ridge, 'Random forest':predictDF_B_RF}
tempD2 = {'Metabolomics':'MetBMI', 'Proteomics':'ProtBMI',
          'Clinical labs':'ChemBMI', 'Combined omics':'CombiBMI'}
tempD3 = {'Metabolomics':'b', 'Proteomics':'r',
          'Clinical labs':'g', 'Combined omics':'m'}
yvar = 'LASSO'
unit_label = ' [kg m'+r'$^{-2}$'+']'

#Prepare DFs per category
tempD4 = {}
for category in tempD2.keys():
    bbmi = tempD2[category]
    tempD = {}
    for method in tempD1.keys():
        tempDF = tempD1[method]
        tempS = tempDF[bbmi].copy()
        tempS.name = method
        tempD[method] = tempS
    tempDF = pd.concat(list(tempD.values()), axis=1)
    print(category)
    display(tempDF.describe())
    tempD4[category] = tempDF

#Statistical tests
tempD = {}
for category in tempD4.keys():
    tempDF = tempD4[category]
    tempDF1 = pd.DataFrame(columns=['Yvar', 'N', 'DoF', 'Pearson_r', 'Pval'])
    for method in tempD1.keys():
        if method!=yvar:
            xvar = method
            #Pearson's correlation
            pearson_r, pval = stats.pearsonr(tempDF[xvar], tempDF[yvar])
            size = len(tempDF)
            dof = size - 2
            tempDF1.loc[method] = [yvar, size, dof, pearson_r, pval]
    ##P-value adjustment (within omics) by using Benjamini–Hochberg method
    tempDF1['AdjPval_omics'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                                   is_sorted=False, returnsorted=False)[1]
    tempDF1.index.rename('Xvar', inplace=True)
    tempDF1['Category'] = category
    tempD[category] = tempDF1
tempDF1 = pd.concat(list(tempD.values()), axis=0)
##P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF1['AdjPval_all'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
tempDF1['N'] = tempDF1['N'].astype('int64')#Otherwise, float64!
tempDF1['DoF'] = tempDF1['DoF'].astype('int64')#Otherwise, float64!
tempDF1 = tempDF1.reset_index().set_index(['Category', 'Xvar'])
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'bBMI-comparison-BothSex.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Visualization (per category, for now)
#One figure per category: each panel is a scatterplot (with regression line) of the yvar
#predictions against those of another method, annotated with Pearson's r and the
#across-test adjusted P-value taken from tempDF1, and saved as a LZW-compressed TIFF.
for category in tempD4.keys():
    sns.set(style='ticks', font='Arial', context='talk')
    fig, axes = plt.subplots(nrows=1, ncols=len(tempD1)-1,
                             figsize=(2.8*(len(tempD1)-1), 2.8+0.8), sharex=True, sharey=True)
    axis_xymin = 12.5
    axis_xymax = 57.5
    xymin = 20
    xymax = 50
    xyinter = 10
    #Set axis range first; otherwise, regression line can be truncated differently
    plt.setp(axes, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
    plt.setp(axes, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
    for ax_i, ax in enumerate(axes.flat):
        #The first key of tempD1 is skipped (NOTE(review): presumably it is yvar; confirm where tempD1 is defined)
        xvar = list(tempD1.keys())[ax_i+1]
        tempDF = tempD4[category]
        #Scatterplot with regression line
        sns.regplot(data=tempDF, x=xvar, y=yvar, color=tempD3[category],
                    scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                    scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':25}, ax=ax)
        #Draw Y=X as reference
        ax.plot([axis_xymin, axis_xymax], [axis_xymin, axis_xymax],
                color='black', linestyle=(0, (1, 2)), zorder=0)
        #Annotate Pearson's correlation
        tempDF = tempDF1.loc[category]#MultiIndex
        pearson_r = tempDF['Pearson_r'].loc[xvar]
        r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
        pval = tempDF['AdjPval_all'].loc[xvar]
        below_limit = 0#Initialize
        if pval==1.0:
            pval_text = '1.0'
        else:
            if pval==0.0:#Due to smaller than the float minimum
                pval = sys.float_info.min
                print('P-value was smaller than the float minimum:', pval)
                below_limit = 1
            pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
            #Split at 'E' (not 'E-'): a P-value that rounds up to 1.000 is rendered with 'E+0',
            #which contains no 'E-' and would make the two-name unpacking fail
            significand, exponent = pval_text.split(sep='E')
            exponent = str(-int(exponent))#Magnitude of the exponent ('5' for E-5, '0' for E+0)
            significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
            if significand=='10.0':#Rounding carried over to the next power of ten
                significand = '1.0'
                exponent = str(int(exponent)-1)
            if int(exponent)>2:#Scientific notation for P < 0.01
                pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
            elif int(exponent)>0:#Plain decimal notation for 0.01 <= P < 1
                pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
            else:
                pval_text = significand
        if below_limit==1:
            text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
        else:
            text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
        ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                    horizontalalignment='left', verticalalignment='top',
                    multialignment='left', fontsize='small', color='k')
        #Facet label (Fig title)
        #Only the middle panel gets the title; the median is an integer only for an odd number of panels
        if ax_i==np.median(range(len(tempD1)-1)):
            title = category+': '+tempD2[category]+unit_label
            ax.set_title(title, {'fontsize':'large'})
        #Axis setting
        if ax_i%(len(tempD1)-1)==0:#Leftmost panel keeps the y-axis label and tick labels
            plt.setp(ax, xlabel=xvar, ylabel=yvar)
        else:
            plt.setp(ax, xlabel=xvar, ylabel='')
            plt.setp(ax.get_yticklabels(), visible=False)
    sns.despine()
    #Reset and generate common axis title
    #plt.setp(axes, xlabel='', ylabel='')
    fig.tight_layout(pad=0.75)
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
    fileName = tempD2[category]+'-comparison-BothSex.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()

## 4. Variables between LASSO and EN models

### 4-1. Import beta-coefficients

In [None]:
#LASSO
print('LASSO models')
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
tempD = {}
for bbmi in ('MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI'):
    #Import the beta-coefficients (indexed by variable, intercept row removed)
    fileName = f'{bbmi}-BothSex-LASSObcoefs.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t')
    tempDF = tempDF.set_index('Variable').drop(index=['Intercept'])
    tempD[bbmi] = tempDF

    #Check
    n_total = len(tempDF)
    print(f'{bbmi}:')
    print(' - Variables:', n_total)
    #Variables with non-zero beta-coefficient: not zero in all 10 models
    n_nonzero = len(tempDF.loc[tempDF['nZeros']!=10])
    print(' - Variables with non-zero beta-coefficient:', n_nonzero,
          '(', n_nonzero/n_total*100, '%)')
    #Robust beta-coefficient: no zeros in all 10 models
    tempDF1 = tempDF.loc[tempDF['nZeros']==0].sort_values(by='Mean', ascending=False)
    n_robust = len(tempDF1)
    print(' - Variables with non-zero beta-coefficient in all 10 models:', n_robust,
          '(', n_robust/n_total*100, '%)')
    print('')

#Keep one named table per model for the following sections
bcoefDF_met_LASSO, bcoefDF_prot_LASSO, bcoefDF_chem_LASSO, bcoefDF_combi_LASSO = (
    tempD[bbmi] for bbmi in ('MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI'))

In [None]:
#Elastic net
print('Elastic net models')
fileDir = './ExportData/'
ipynbName = '220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-ElasticNet_'
tempD = {}
for bbmi in ('MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI'):
    #Import the beta-coefficients (indexed by variable, intercept row removed)
    fileName = f'{bbmi}-BothSex-ElasticNetbcoefs.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t')
    tempDF = tempDF.set_index('Variable').drop(index=['Intercept'])
    tempD[bbmi] = tempDF

    #Check
    n_total = len(tempDF)
    print(f'{bbmi}:')
    print(' - Variables:', n_total)
    #Variables with non-zero beta-coefficient: not zero in all 10 models
    n_nonzero = len(tempDF.loc[tempDF['nZeros']!=10])
    print(' - Variables with non-zero beta-coefficient:', n_nonzero,
          '(', n_nonzero/n_total*100, '%)')
    #Robust beta-coefficient: no zeros in all 10 models
    tempDF1 = tempDF.loc[tempDF['nZeros']==0].sort_values(by='Mean', ascending=False)
    n_robust = len(tempDF1)
    print(' - Variables with non-zero beta-coefficient in all 10 models:', n_robust,
          '(', n_robust/n_total*100, '%)')
    print('')

#Keep one named table per model for the following sections
bcoefDF_met_EN, bcoefDF_prot_EN, bcoefDF_chem_EN, bcoefDF_combi_EN = (
    tempD[bbmi] for bbmi in ('MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI'))

### 4-2. Metabolites

In [None]:
#Prepare target analytes
analyte = 'metabolites'
title = 'Robustly retained metabolites'
##Rows of the CombiBMI tables that correspond to the variables of the MetBMI tables
tempDF1 = bcoefDF_combi_LASSO.loc[list(bcoefDF_met_LASSO.index)]
tempDF2 = bcoefDF_combi_EN.loc[list(bcoefDF_met_EN.index)]
tempD1 = {'LASSO CombiBMI':tempDF1, 'LASSO MetBMI':bcoefDF_met_LASSO,
          'Elastic net MetBMI':bcoefDF_met_EN, 'Elastic net CombiBMI':tempDF2}
##One colour per model, in the same order as tempD1
tempD2 = dict(zip(tempD1, ['tab:blue', 'tab:orange', 'tab:green', 'tab:red']))

#Prepare module sets
print('Variables with non-zero beta-coefficient in all 10 models')
tempD = {}
for measure, tempDF in tempD1.items():
    #Extract robust beta-coefficient: no zeros in all 10 models
    tempDF1 = tempDF[tempDF['nZeros']==0]
    print(measure+':', len(tempDF1), 'per', len(tempDF), analyte+' (',
          len(tempDF1)/len(tempDF)*100, '%)')
    tempD[measure] = set(tempDF1.index)

#Check common region (1,1,1,1)
tempS = next(iter(tempD.values()))#Initialize
for robust_set in tempD.values():
    tempS = tempS.intersection(robust_set)
print(' -> Common (1,1,1,1):', len(tempS))
display(tempS)

#Venn diagram
sns.set(style='ticks', font='Arial', context='talk')
fig, ax = plt.subplots(figsize=(4, 4))
venn(tempD, fmt='{size:,}', cmap=list(tempD2.values()), legend_loc=None, ax=ax)
plt.setp(ax, ylim=(0.1, 0.875))#Otherwise, weird space...
##Add legend annotation (one coloured text box per set, placed near the four corners)
x_coord = [0.1, 0.1, 0.9, 0.9]
y_coord = [0.25, 0.7, 0.7, 0.25]
h_align = ['right', 'right', 'left', 'left']
v_align = ['top', 'bottom', 'bottom', 'top']
for key, x, y, ha, va in zip(tempD2, x_coord, y_coord, h_align, v_align):
    label = f'{key}\n({len(tempD[key]):,} {analyte})'
    ax.text(x, y, label, fontsize='small', multialignment='center',
            horizontalalignment=ha, verticalalignment=va,
            bbox={'boxstyle':'round', 'facecolor':tempD2[key], 'pad':0.2, 'alpha':0.5})
ax.set_title(title, fontsize='medium')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = f'LASSO-vs-ElasticNet-{analyte}.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 4-3. Proteins

In [None]:
#Prepare target analytes
analyte = 'proteins'
title = 'Robustly retained proteins'
##Rows of the CombiBMI tables that correspond to the variables of the ProtBMI tables
tempDF1 = bcoefDF_combi_LASSO.loc[list(bcoefDF_prot_LASSO.index)]
tempDF2 = bcoefDF_combi_EN.loc[list(bcoefDF_prot_EN.index)]
tempD1 = {'LASSO CombiBMI':tempDF1, 'LASSO ProtBMI':bcoefDF_prot_LASSO,
          'Elastic net ProtBMI':bcoefDF_prot_EN, 'Elastic net CombiBMI':tempDF2}
##One colour per model, in the same order as tempD1
tempD2 = dict(zip(tempD1, ['tab:blue', 'tab:orange', 'tab:green', 'tab:red']))

#Prepare module sets
print('Variables with non-zero beta-coefficient in all 10 models')
tempD = {}
for measure, tempDF in tempD1.items():
    #Extract robust beta-coefficient: no zeros in all 10 models
    tempDF1 = tempDF[tempDF['nZeros']==0]
    print(measure+':', len(tempDF1), 'per', len(tempDF), analyte+' (',
          len(tempDF1)/len(tempDF)*100, '%)')
    tempD[measure] = set(tempDF1.index)

#Check common region (1,1,1,1)
tempS = next(iter(tempD.values()))#Initialize
for robust_set in tempD.values():
    tempS = tempS.intersection(robust_set)
print(' -> Common (1,1,1,1):', len(tempS))
display(tempS)

#Venn diagram
sns.set(style='ticks', font='Arial', context='talk')
fig, ax = plt.subplots(figsize=(4, 4))
venn(tempD, fmt='{size:,}', cmap=list(tempD2.values()), legend_loc=None, ax=ax)
plt.setp(ax, ylim=(0.1, 0.875))#Otherwise, weird space...
##Add legend annotation (one coloured text box per set, placed near the four corners)
x_coord = [0.1, 0.1, 0.9, 0.9]
y_coord = [0.25, 0.7, 0.7, 0.25]
h_align = ['right', 'right', 'left', 'left']
v_align = ['top', 'bottom', 'bottom', 'top']
for key, x, y, ha, va in zip(tempD2, x_coord, y_coord, h_align, v_align):
    label = f'{key}\n({len(tempD[key]):,} {analyte})'
    ax.text(x, y, label, fontsize='small', multialignment='center',
            horizontalalignment=ha, verticalalignment=va,
            bbox={'boxstyle':'round', 'facecolor':tempD2[key], 'pad':0.2, 'alpha':0.5})
ax.set_title(title, fontsize='medium')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = f'LASSO-vs-ElasticNet-{analyte}.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 4-4. Clinical lab tests

In [None]:
#Prepare target analytes
analyte = 'tests'
title = 'Robustly retained clinical lab tests'
##Rows of the CombiBMI tables that correspond to the variables of the ChemBMI tables
tempDF1 = bcoefDF_combi_LASSO.loc[list(bcoefDF_chem_LASSO.index)]
tempDF2 = bcoefDF_combi_EN.loc[list(bcoefDF_chem_EN.index)]
tempD1 = {'LASSO CombiBMI':tempDF1, 'LASSO ChemBMI':bcoefDF_chem_LASSO,
          'Elastic net ChemBMI':bcoefDF_chem_EN, 'Elastic net CombiBMI':tempDF2}
##One colour per model, in the same order as tempD1
tempD2 = dict(zip(tempD1, ['tab:blue', 'tab:orange', 'tab:green', 'tab:red']))

#Prepare module sets
print('Variables with non-zero beta-coefficient in all 10 models')
tempD = {}
for measure, tempDF in tempD1.items():
    #Extract robust beta-coefficient: no zeros in all 10 models
    tempDF1 = tempDF[tempDF['nZeros']==0]
    print(measure+':', len(tempDF1), 'per', len(tempDF), analyte+' (',
          len(tempDF1)/len(tempDF)*100, '%)')
    tempD[measure] = set(tempDF1.index)

#Check common region (1,1,1,1)
tempS = next(iter(tempD.values()))#Initialize
for robust_set in tempD.values():
    tempS = tempS.intersection(robust_set)
print(' -> Common (1,1,1,1):', len(tempS))
display(tempS)

#Venn diagram
sns.set(style='ticks', font='Arial', context='talk')
fig, ax = plt.subplots(figsize=(4, 4))
venn(tempD, fmt='{size:,}', cmap=list(tempD2.values()), legend_loc=None, ax=ax)
plt.setp(ax, ylim=(0.1, 0.875))#Otherwise, weird space...
##Add legend annotation (one coloured text box per set, placed near the four corners)
x_coord = [0.1, 0.1, 0.9, 0.9]
y_coord = [0.25, 0.7, 0.7, 0.25]
h_align = ['right', 'right', 'left', 'left']
v_align = ['top', 'bottom', 'bottom', 'top']
for key, x, y, ha, va in zip(tempD2, x_coord, y_coord, h_align, v_align):
    label = f'{key}\n({len(tempD[key]):,} {analyte})'
    ax.text(x, y, label, fontsize='small', multialignment='center',
            horizontalalignment=ha, verticalalignment=va,
            bbox={'boxstyle':'round', 'facecolor':tempD2[key], 'pad':0.2, 'alpha':0.5})
ax.set_title(title, fontsize='medium')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = f'LASSO-vs-ElasticNet-{analyte}.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 4-5. Correlation of beta-coefficients

In [None]:
#All variables
##Coefficient tables of both methods, grouped by model
tempD1 = {'MetBMI':{'LASSO':bcoefDF_met_LASSO, 'elastic net':bcoefDF_met_EN},
          'ProtBMI':{'LASSO':bcoefDF_prot_LASSO, 'elastic net':bcoefDF_prot_EN},
          'ChemBMI':{'LASSO':bcoefDF_chem_LASSO, 'elastic net':bcoefDF_chem_EN},
          'CombiBMI':{'LASSO':bcoefDF_combi_LASSO, 'elastic net':bcoefDF_combi_EN}}
##Model -> category label, and category label -> plot colour
tempD2 = {'MetBMI':'Metabolomics', 'ProtBMI':'Proteomics',
          'ChemBMI':'Clinical labs', 'CombiBMI':'Combined omics'}
tempD3 = {'Metabolomics':'b', 'Proteomics':'r', 'Clinical labs':'g', 'Combined omics':'m'}
yvar = 'LASSO'
xvar = 'elastic net'
xyvar_unit = r'[log-scaled BMI (kg m$^{-2}$) per s.d.]'

#Prepare DF
##Per model: mean beta-coefficients of both methods for the variables present in both tables
tempD = {}
for category, tempD4 in tempD1.items():
    label = tempD2[category]
    tempS1 = tempD4[xvar]['Mean'].rename(xvar+'_Bcoef')
    tempS2 = tempD4[yvar]['Mean'].rename(yvar+'_Bcoef')
    tempDF = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='inner')
    tempDF['Category'] = label
    tempD[category] = tempDF

    print(label)
    display(tempDF.describe(include='all'))
##Stack the per-model tables into one long table
tempDF = pd.concat(list(tempD.values()))

#Statistical tests
##One Pearson's correlation per category between the two methods' coefficient columns
x_col = xvar+'_Bcoef'
y_col = yvar+'_Bcoef'
tempDF1 = pd.DataFrame(columns=['N', 'DoF', 'Pearson_r', 'Pval'])
for category in tempD3:
    tempDF2 = tempDF[tempDF['Category']==category]
    size = len(tempDF2)
    #Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF2[x_col], tempDF2[y_col])
    tempDF1.loc[category] = [size, size - 2, pearson_r, pval]
##P-value adjustment by using Benjamini–Hochberg method
tempDF1['AdjPval'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
tempDF1.index.name = 'Category'
##The counts are float64 after the row-wise filling, so cast them explicitly
tempDF1 = tempDF1.astype({'N':'int64', 'DoF':'int64'})
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'LASSO-vs-ElasticNet-bcoefs_all.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot
#2x2 grid (one panel per category in tempD3): scatterplot with regression line of the mean
#beta-coefficients of yvar models against xvar models, annotated with Pearson's r and the
#adjusted P-value from tempDF1, with shared axis titles, saved as a LZW-compressed TIFF.
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=2, ncols=2,
                         figsize=(3.5*2, 3.5*2-0.2), sharex=True, sharey=True)
axis_xymin = -0.0425
axis_xymax = 0.050
xymin = -0.04
xymax = 0.04
xyinter = 0.02
#Set axis range first; otherwise, regression line can be truncated differently
plt.setp(axes, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
plt.setp(axes, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
for ax_i, ax in enumerate(axes.flat):
    category = list(tempD3.keys())[ax_i]
    #Prepare DF
    tempDF2 = tempDF.loc[tempDF['Category']==category]
    #Scatterplot with regression line
    sns.regplot(data=tempDF2, x=xvar+'_Bcoef', y=yvar+'_Bcoef', color=tempD3[category],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':25}, ax=ax)
    #Draw Y=X as reference
    #ax.plot([axis_xymin, axis_xymax], [axis_xymin, axis_xymax],
    #        color='black', linestyle=(0, (1, 2)), zorder=0)
    #Draw Y=X=0 as reference
    ax.axvline(x=0, color='black', linestyle=(0, (1, 2)), zorder=0)
    ax.axhline(y=0, color='black', linestyle=(0, (1, 2)), zorder=0)
    #Annotate Pearson's correlation
    pearson_r = tempDF1['Pearson_r'].loc[category]
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval = tempDF1['AdjPval'].loc[category]
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        #Split at 'E' (not 'E-'): a P-value that rounds up to 1.000 is rendered with 'E+0',
        #which contains no 'E-' and would make the two-name unpacking fail
        significand, exponent = pval_text.split(sep='E')
        exponent = str(-int(exponent))#Magnitude of the exponent ('5' for E-5, '0' for E+0)
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over to the next power of ten
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:#Scientific notation for P < 0.01
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:#Plain decimal notation for 0.01 <= P < 1
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='small', color='k')
    #Facet label
    ax.set_title(category, {'fontsize':'large'})
sns.despine()
#Reset and generate common axis title
plt.setp(axes, xlabel='', ylabel='')
fig.tight_layout(pad=0.75)
fig.text(x=0.54, y=0.02,#Manual adjustment
         s='Mean of '+r'$\beta$'+'-coefficients in '+xvar+' models\n'+xyvar_unit,
         fontsize='medium', verticalalignment='top', horizontalalignment='center')
fig.text(x=0.0225, y=0.515,#Manual adjustment
         s='Mean of '+r'$\beta$'+'-coefficients in '+yvar+' models\n'+xyvar_unit,
         fontsize='medium', multialignment='center',
         verticalalignment='center', horizontalalignment='right', rotation='vertical')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'LASSO-vs-ElasticNet-bcoefs_all.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

In [None]:
#Variables with non-zero beta-coefficients
##Coefficient tables of both methods per model
tempD1 = {'LASSO':bcoefDF_met_LASSO, 'elastic net':bcoefDF_met_EN}
tempD2 = {'LASSO':bcoefDF_prot_LASSO, 'elastic net':bcoefDF_prot_EN}
tempD3 = {'LASSO':bcoefDF_chem_LASSO, 'elastic net':bcoefDF_chem_EN}
tempD4 = {'LASSO':bcoefDF_combi_LASSO, 'elastic net':bcoefDF_combi_EN}
tempD1 = {'MetBMI':tempD1, 'ProtBMI':tempD2, 'ChemBMI':tempD3, 'CombiBMI':tempD4}
##Model -> category label, and category label -> plot colour
tempD2 = {'MetBMI':'Metabolomics', 'ProtBMI':'Proteomics',
          'ChemBMI':'Clinical labs', 'CombiBMI':'Combined omics'}
tempD3 = {'Metabolomics':'b', 'Proteomics':'r', 'Clinical labs':'g', 'Combined omics':'m'}
yvar = 'LASSO'
xvar = 'elastic net'
xyvar_unit = '[log-scaled BMI (kg m'+r'$^{-2}$'+') per s.d.]'

#Prepare DF
##Per model: mean beta-coefficients of both methods, restricted to the variables
##that are non-zero in at least one of the 10 models of either method
tempD = {}
for category in tempD1.keys():
    tempD4 = tempD1[category]
    tempDF1 = tempD4[xvar]
    tempS1 = tempDF1['Mean'].copy()
    tempS1.name = xvar+'_Bcoef'
    tempDF2 = tempD4[yvar]
    tempS2 = tempDF2['Mean'].copy()
    tempS2.name = yvar+'_Bcoef'
    #Variables with non-zero beta-coefficients in either method
    tempDF1 = tempDF1.loc[tempDF1['nZeros']!=10]
    tempDF2 = tempDF2.loc[tempDF2['nZeros']!=10]
    tempS = set(tempDF1.index.tolist()) | set(tempDF2.index.tolist())
    #Index with a sorted list: .loc rejects set indexers in pandas >= 2.0 (TypeError),
    #and the arbitrary order of a set would make the row order irreproducible
    tempL = sorted(tempS)
    tempDF = pd.merge(tempS1.loc[tempL], tempS2.loc[tempL],
                      left_index=True, right_index=True, how='inner')
    tempDF['Category'] = tempD2[category]
    
    tempD[category] = tempDF
    
    print(tempD2[category])
    display(tempDF.describe(include='all'))
##Stack the per-model tables into one long table
tempDF = pd.concat(list(tempD.values()), axis=0)

#Statistical tests
##One Pearson's correlation per category between the two methods' coefficient columns
x_col = xvar+'_Bcoef'
y_col = yvar+'_Bcoef'
tempDF1 = pd.DataFrame(columns=['N', 'DoF', 'Pearson_r', 'Pval'])
for category in tempD3:
    tempDF2 = tempDF[tempDF['Category']==category]
    size = len(tempDF2)
    #Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF2[x_col], tempDF2[y_col])
    tempDF1.loc[category] = [size, size - 2, pearson_r, pval]
##P-value adjustment by using Benjamini–Hochberg method
tempDF1['AdjPval'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
tempDF1.index.name = 'Category'
##The counts are float64 after the row-wise filling, so cast them explicitly
tempDF1 = tempDF1.astype({'N':'int64', 'DoF':'int64'})
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'LASSO-vs-ElasticNet-bcoefs_non-zero-in-any.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot
#2x2 grid (one panel per category in tempD3): scatterplot with regression line of the mean
#beta-coefficients of yvar models against xvar models, annotated with Pearson's r and the
#adjusted P-value from tempDF1, with shared axis titles, saved as a LZW-compressed TIFF.
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=2, ncols=2,
                         figsize=(3.5*2, 3.5*2-0.2), sharex=True, sharey=True)
axis_xymin = -0.0425
axis_xymax = 0.050
xymin = -0.04
xymax = 0.04
xyinter = 0.02
#Set axis range first; otherwise, regression line can be truncated differently
plt.setp(axes, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
plt.setp(axes, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
for ax_i, ax in enumerate(axes.flat):
    category = list(tempD3.keys())[ax_i]
    #Prepare DF
    tempDF2 = tempDF.loc[tempDF['Category']==category]
    #Scatterplot with regression line
    sns.regplot(data=tempDF2, x=xvar+'_Bcoef', y=yvar+'_Bcoef', color=tempD3[category],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':25}, ax=ax)
    #Draw Y=X as reference
    #ax.plot([axis_xymin, axis_xymax], [axis_xymin, axis_xymax],
    #        color='black', linestyle=(0, (1, 2)), zorder=0)
    #Draw Y=X=0 as reference
    ax.axvline(x=0, color='black', linestyle=(0, (1, 2)), zorder=0)
    ax.axhline(y=0, color='black', linestyle=(0, (1, 2)), zorder=0)
    #Annotate Pearson's correlation
    pearson_r = tempDF1['Pearson_r'].loc[category]
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval = tempDF1['AdjPval'].loc[category]
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        #Split at 'E' (not 'E-'): a P-value that rounds up to 1.000 is rendered with 'E+0',
        #which contains no 'E-' and would make the two-name unpacking fail
        significand, exponent = pval_text.split(sep='E')
        exponent = str(-int(exponent))#Magnitude of the exponent ('5' for E-5, '0' for E+0)
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over to the next power of ten
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:#Scientific notation for P < 0.01
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:#Plain decimal notation for 0.01 <= P < 1
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='small', color='k')
    #Facet label
    ax.set_title(category, {'fontsize':'large'})
sns.despine()
#Reset and generate common axis title
plt.setp(axes, xlabel='', ylabel='')
fig.tight_layout(pad=0.75)
fig.text(x=0.54, y=0.02,#Manual adjustment
         s='Mean of '+r'$\beta$'+'-coefficients in '+xvar+' models\n'+xyvar_unit,
         fontsize='medium', verticalalignment='top', horizontalalignment='center')
fig.text(x=0.0225, y=0.515,#Manual adjustment
         s='Mean of '+r'$\beta$'+'-coefficients in '+yvar+' models\n'+xyvar_unit,
         fontsize='medium', multialignment='center',
         verticalalignment='center', horizontalalignment='right', rotation='vertical')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'LASSO-vs-ElasticNet-bcoefs_non-zero-in-any.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

In [None]:
#Variables with non-zero beta-coefficient in all 10 models
##Coefficient tables of both methods per model
tempD1 = {'LASSO':bcoefDF_met_LASSO, 'elastic net':bcoefDF_met_EN}
tempD2 = {'LASSO':bcoefDF_prot_LASSO, 'elastic net':bcoefDF_prot_EN}
tempD3 = {'LASSO':bcoefDF_chem_LASSO, 'elastic net':bcoefDF_chem_EN}
tempD4 = {'LASSO':bcoefDF_combi_LASSO, 'elastic net':bcoefDF_combi_EN}
tempD1 = {'MetBMI':tempD1, 'ProtBMI':tempD2, 'ChemBMI':tempD3, 'CombiBMI':tempD4}
##Model -> category label, and category label -> plot colour
tempD2 = {'MetBMI':'Metabolomics', 'ProtBMI':'Proteomics',
          'ChemBMI':'Clinical labs', 'CombiBMI':'Combined omics'}
tempD3 = {'Metabolomics':'b', 'Proteomics':'r', 'Clinical labs':'g', 'Combined omics':'m'}
yvar = 'LASSO'
xvar = 'elastic net'
xyvar_unit = '[log-scaled BMI (kg m'+r'$^{-2}$'+') per s.d.]'

#Prepare DF
##Per model: mean beta-coefficients of both methods, restricted to the variables
##that are non-zero in all 10 models of at least one method
tempD = {}
for category in tempD1.keys():
    tempD4 = tempD1[category]
    tempDF1 = tempD4[xvar]
    tempS1 = tempDF1['Mean'].copy()
    tempS1.name = xvar+'_Bcoef'
    tempDF2 = tempD4[yvar]
    tempS2 = tempDF2['Mean'].copy()
    tempS2.name = yvar+'_Bcoef'
    #Variables with non-zero beta-coefficients in all 10 models of either method
    tempDF1 = tempDF1.loc[tempDF1['nZeros']==0]
    tempDF2 = tempDF2.loc[tempDF2['nZeros']==0]
    tempS = set(tempDF1.index.tolist()) | set(tempDF2.index.tolist())
    #Index with a sorted list: .loc rejects set indexers in pandas >= 2.0 (TypeError),
    #and the arbitrary order of a set would make the row order irreproducible
    tempL = sorted(tempS)
    tempDF = pd.merge(tempS1.loc[tempL], tempS2.loc[tempL],
                      left_index=True, right_index=True, how='inner')
    tempDF['Category'] = tempD2[category]
    
    tempD[category] = tempDF
    
    print(tempD2[category])
    display(tempDF.describe(include='all'))
##Stack the per-model tables into one long table
tempDF = pd.concat(list(tempD.values()), axis=0)

#Statistical tests
##One Pearson's correlation per category between the two methods' coefficient columns
x_col = xvar+'_Bcoef'
y_col = yvar+'_Bcoef'
tempDF1 = pd.DataFrame(columns=['N', 'DoF', 'Pearson_r', 'Pval'])
for category in tempD3:
    tempDF2 = tempDF[tempDF['Category']==category]
    size = len(tempDF2)
    #Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF2[x_col], tempDF2[y_col])
    tempDF1.loc[category] = [size, size - 2, pearson_r, pval]
##P-value adjustment by using Benjamini–Hochberg method
tempDF1['AdjPval'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
tempDF1.index.name = 'Category'
##The counts are float64 after the row-wise filling, so cast them explicitly
tempDF1 = tempDF1.astype({'N':'int64', 'DoF':'int64'})
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'LASSO-vs-ElasticNet-bcoefs_non-zero-in-all.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot
#2x2 grid (one panel per category in tempD3): scatterplot with regression line of the mean
#beta-coefficients of yvar models against xvar models, annotated with Pearson's r and the
#adjusted P-value from tempDF1, with shared axis titles, saved as a LZW-compressed TIFF.
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=2, ncols=2,
                         figsize=(3.5*2, 3.5*2-0.2), sharex=True, sharey=True)
axis_xymin = -0.0425
axis_xymax = 0.050
xymin = -0.04
xymax = 0.04
xyinter = 0.02
#Set axis range first; otherwise, regression line can be truncated differently
plt.setp(axes, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
plt.setp(axes, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
for ax_i, ax in enumerate(axes.flat):
    category = list(tempD3.keys())[ax_i]
    #Prepare DF
    tempDF2 = tempDF.loc[tempDF['Category']==category]
    #Scatterplot with regression line
    sns.regplot(data=tempDF2, x=xvar+'_Bcoef', y=yvar+'_Bcoef', color=tempD3[category],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':25}, ax=ax)
    #Draw Y=X as reference
    #ax.plot([axis_xymin, axis_xymax], [axis_xymin, axis_xymax],
    #        color='black', linestyle=(0, (1, 2)), zorder=0)
    #Draw Y=X=0 as reference
    ax.axvline(x=0, color='black', linestyle=(0, (1, 2)), zorder=0)
    ax.axhline(y=0, color='black', linestyle=(0, (1, 2)), zorder=0)
    #Annotate Pearson's correlation
    pearson_r = tempDF1['Pearson_r'].loc[category]
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval = tempDF1['AdjPval'].loc[category]
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        #Split at 'E' (not 'E-'): a P-value that rounds up to 1.000 is rendered with 'E+0',
        #which contains no 'E-' and would make the two-name unpacking fail
        significand, exponent = pval_text.split(sep='E')
        exponent = str(-int(exponent))#Magnitude of the exponent ('5' for E-5, '0' for E+0)
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over to the next power of ten
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:#Scientific notation for P < 0.01
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:#Plain decimal notation for 0.01 <= P < 1
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='small', color='k')
    #Facet label
    ax.set_title(category, {'fontsize':'large'})
sns.despine()
#Reset and generate common axis title
plt.setp(axes, xlabel='', ylabel='')
fig.tight_layout(pad=0.75)
fig.text(x=0.54, y=0.02,#Manual adjustment
         s='Mean of '+r'$\beta$'+'-coefficients in '+xvar+' models\n'+xyvar_unit,
         fontsize='medium', verticalalignment='top', horizontalalignment='center')
fig.text(x=0.0225, y=0.515,#Manual adjustment
         s='Mean of '+r'$\beta$'+'-coefficients in '+yvar+' models\n'+xyvar_unit,
         fontsize='medium', multialignment='center',
         verticalalignment='center', horizontalalignment='right', rotation='vertical')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = 'LASSO-vs-ElasticNet-bcoefs_non-zero-in-all.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

## 5. Top 30 variables in ridge models

> Because beta-coefficients can be adjusted among highly collinear variables, and thus almost all variables are retained across the 10 ridge models, it is not so simple to compare the beta-coefficients between LASSO and ridge models.  
> –> Hence, only the top 30 variables that had the highest absolute value of the mean beta-coefficient are checked.  

### 5-1. Import beta-coefficients

In [None]:
#Ridge: load the beta-coefficient table of each omics and print how many variables were retained
print('Ridge models')
fileDir = './ExportData/'
ipynbName = '220827_Multiomics-BMI-NatMed1stRevision_BMI-baseline-Ridge_'
tempD = {}
for bbmi in ('MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI'):
    #Read the table, index it by variable name, and discard the intercept row
    fileName = bbmi+'-BothSex-Ridgebcoefs.tsv'
    tempDF = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t')
              .set_index('Variable')
              .drop(index=['Intercept']))
    tempD[bbmi] = tempDF
    nVars = len(tempDF)

    #Summary printout
    print(bbmi+':')
    print(' - Variables:', nVars)
    ##Variables whose 'nZeros' count is not 10 (i.e., not zero in every model)
    tempDF1 = tempDF.loc[tempDF['nZeros'].ne(10)]
    print(' - Variables with non-zero beta-coefficient:', len(tempDF1),
          '(', len(tempDF1)/nVars*100, '%)')
    ##Robust variables: 'nZeros' is 0, sorted by the mean beta-coefficient (descending)
    tempDF1 = tempDF.loc[tempDF['nZeros'].eq(0)].sort_values(by='Mean', ascending=False)
    print(' - Variables with non-zero beta-coefficient in all 10 models:', len(tempDF1),
          '(', len(tempDF1)/nVars*100, '%)')
    print('')

#Keep one named DataFrame per omics for the following cells
bcoefDF_met_ridge, bcoefDF_prot_ridge, bcoefDF_chem_ridge, bcoefDF_combi_ridge = (
    tempD['MetBMI'], tempD['ProtBMI'], tempD['ChemBMI'], tempD['CombiBMI'])

### 5-2. Metabolomics

In [None]:
#Ridge MetBMI: box plot of the per-model beta-coefficients of the top variables
tempDF = bcoefDF_met_ridge.copy()
topX = 30
method = 'Ridge'
bbmi_color = 'b'
bbmi = 'MetBMI'

#Top X variables based on the absolute value of the mean beta-coefficient
tempDF['AbsMean'] = np.abs(tempDF['Mean'])
tempDF = tempDF.sort_values(by='AbsMean', ascending=False)
tempDF = tempDF.iloc[:topX]
nRows = len(tempDF)#Rows actually plotted; smaller than topX if the table has fewer variables
print('Visualize top', nRows, 'variables')

#Prepare DF for plot
tempDF = tempDF.sort_values(by='Mean', ascending=False)#Re-sort by the signed mean for the display order
##Drop the summary columns so that only the remaining columns are melted into long format
tempDF = tempDF.drop(columns=['Mean', 'SD', 'nZeros', 'AbsMean'])
tempDF = tempDF.reset_index().melt(var_name='Model', value_name='bcoef', id_vars=['Variable'])

#Plot
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
p = sns.boxplot(data=tempDF, y='Variable', x='bcoef', color=bbmi_color, dodge=False, saturation=1,
                showfliers=True, flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                showcaps=True, notch=False)
p.set(xlim=(-0.06, 0.09), xticks=np.arange(-0.04, 0.081, 0.04))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
##Change default dull line color of sns.boxplot (saturation parameter is for patch)
for line in p.get_lines():
    line.set_color('k')
##NOTE(review): depending on the matplotlib/seaborn versions, the boxes may be held in p.patches instead of p.artists -- confirm
for box in p.artists:
    box.set_edgecolor('k')
##Add background color to every other row (only the rows that exist, not a fixed topX)
for row_i in range(nRows):
    if row_i%2 == 0:
        plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=bbmi_color, alpha=0.2, zorder=0)
plt.ylabel('')
plt.xlabel(r'$\beta$'+'-coefficient in '+method.lower()+' model\n [log-scaled BMI (kg m'+r'$^{-2}$'+') per s.d.]')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-bcoef_top'+str(topX)+'.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 5-3. Proteomics

In [None]:
#Ridge ProtBMI: box plot of the per-model beta-coefficients of the top variables
tempDF = bcoefDF_prot_ridge.copy()
topX = 30
method = 'Ridge'
bbmi_color = 'r'
bbmi = 'ProtBMI'

#Top X variables based on the absolute value of the mean beta-coefficient
tempDF['AbsMean'] = np.abs(tempDF['Mean'])
tempDF = tempDF.sort_values(by='AbsMean', ascending=False)
tempDF = tempDF.iloc[:topX]
nRows = len(tempDF)#Rows actually plotted; smaller than topX if the table has fewer variables
print('Visualize top', nRows, 'variables')

#Prepare DF for plot
tempDF = tempDF.sort_values(by='Mean', ascending=False)#Re-sort by the signed mean for the display order
##Drop the summary columns so that only the remaining columns are melted into long format
tempDF = tempDF.drop(columns=['Mean', 'SD', 'nZeros', 'AbsMean'])
tempDF = tempDF.reset_index().melt(var_name='Model', value_name='bcoef', id_vars=['Variable'])

#Plot
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
p = sns.boxplot(data=tempDF, y='Variable', x='bcoef', color=bbmi_color, dodge=False, saturation=1,
                showfliers=True, flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                showcaps=True, notch=False)
p.set(xlim=(-0.06, 0.09), xticks=np.arange(-0.04, 0.081, 0.04))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
##Change default dull line color of sns.boxplot (saturation parameter is for patch)
for line in p.get_lines():
    line.set_color('k')
##NOTE(review): depending on the matplotlib/seaborn versions, the boxes may be held in p.patches instead of p.artists -- confirm
for box in p.artists:
    box.set_edgecolor('k')
##Add background color to every other row (only the rows that exist, not a fixed topX)
for row_i in range(nRows):
    if row_i%2 == 0:
        plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=bbmi_color, alpha=0.2, zorder=0)
plt.ylabel('')
plt.xlabel(r'$\beta$'+'-coefficient in '+method.lower()+' model\n [log-scaled BMI (kg m'+r'$^{-2}$'+') per s.d.]')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-bcoef_top'+str(topX)+'.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 5-4. Clinical labs

In [None]:
#Ridge ChemBMI: box plot of the per-model beta-coefficients of the top variables
tempDF = bcoefDF_chem_ridge.copy()
topX = 30
method = 'Ridge'
bbmi_color = 'g'
bbmi = 'ChemBMI'

#Top X variables based on the absolute value of the mean beta-coefficient
tempDF['AbsMean'] = np.abs(tempDF['Mean'])
tempDF = tempDF.sort_values(by='AbsMean', ascending=False)
tempDF = tempDF.iloc[:topX]
nRows = len(tempDF)#Rows actually plotted; smaller than topX if the table has fewer variables
print('Visualize top', nRows, 'variables')

#Prepare DF for plot
tempDF = tempDF.sort_values(by='Mean', ascending=False)#Re-sort by the signed mean for the display order
##Drop the summary columns so that only the remaining columns are melted into long format
tempDF = tempDF.drop(columns=['Mean', 'SD', 'nZeros', 'AbsMean'])
tempDF = tempDF.reset_index().melt(var_name='Model', value_name='bcoef', id_vars=['Variable'])

#Plot
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
p = sns.boxplot(data=tempDF, y='Variable', x='bcoef', color=bbmi_color, dodge=False, saturation=1,
                showfliers=True, flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                showcaps=True, notch=False)
p.set(xlim=(-0.06, 0.09), xticks=np.arange(-0.04, 0.081, 0.04))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
##Change default dull line color of sns.boxplot (saturation parameter is for patch)
for line in p.get_lines():
    line.set_color('k')
##NOTE(review): depending on the matplotlib/seaborn versions, the boxes may be held in p.patches instead of p.artists -- confirm
for box in p.artists:
    box.set_edgecolor('k')
##Add background color to every other row (only the rows that exist, not a fixed topX)
for row_i in range(nRows):
    if row_i%2 == 0:
        plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=bbmi_color, alpha=0.2, zorder=0)
plt.ylabel('')
plt.xlabel(r'$\beta$'+'-coefficient in '+method.lower()+' model\n [log-scaled BMI (kg m'+r'$^{-2}$'+') per s.d.]')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-bcoef_top'+str(topX)+'.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 5-5. Combined omics

In [None]:
#Ridge CombiBMI: box plot of the per-model beta-coefficients of the top variables,
#colored by the omics table in which each variable is found
tempDF = bcoefDF_combi_ridge.copy()
topX = 30
method = 'Ridge'
#bbmi_color = 'm'
##Variable names per omics category (taken from the single-omics ridge tables) and the category colors
tempD1 = {'Metabolomics':bcoefDF_met_ridge.index.tolist(),
          'Proteomics':bcoefDF_prot_ridge.index.tolist(),
          'Clinical labs':bcoefDF_chem_ridge.index.tolist()}
tempD2 = {'Metabolomics':'b', 'Proteomics':'r', 'Clinical labs':'g'}
bbmi = 'CombiBMI'

#Top X variables based on the absolute value of the mean beta-coefficient
tempDF['AbsMean'] = np.abs(tempDF['Mean'])
tempDF = tempDF.sort_values(by='AbsMean', ascending=False)
tempDF = tempDF.iloc[:topX]
print('Visualize top', len(tempDF), 'variables')

#Prepare DF for plot
tempDF = tempDF.sort_values(by='Mean', ascending=False)#Re-sort by the signed mean for the display order
tempDF = tempDF.drop(columns=['Mean', 'SD', 'nZeros', 'AbsMean'])
tempDF1 = tempDF.reset_index().melt(var_name='Model', value_name='bcoef', id_vars=['Variable'])

#Prepare category color
tempDF2 = pd.DataFrame(index=tempDF.index)
tempD3 = {category:set(variables) for category, variables in tempD1.items()}#Sets for fast membership tests
tempL1 = []
tempL2 = []
for row_n in tempDF2.index.tolist():
    hits = [category for category in tempD1 if row_n in tempD3[category]]
    ##Exactly one category is required per variable; otherwise the lists below would
    ##no longer line up with the rows of tempDF2 (wrong length or shifted colors)
    if len(hits)!=1:
        print('Check error!')
        raise ValueError('Variable '+repr(row_n)+' matched '+str(len(hits))
                         +' omics categories (expected exactly 1)')
    tempL1.append(hits[0])
    tempL2.append(tempD2[hits[0]])
tempDF2['Category'] = tempL1
tempDF2['Color'] = tempL2

#Plot
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
p = sns.boxplot(data=tempDF1, y='Variable', order=tempDF2.index.tolist(), x='bcoef',
                palette=tempDF2['Color'], dodge=False, saturation=1,
                showfliers=True, flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                showcaps=True, notch=False)
#p.set(xlim=(-0.06, 0.09), xticks=np.arange(-0.04, 0.081, 0.04))#Fixed across omics
p.set(xlim=(-0.021, 0.021), xticks=np.arange(-0.02, 0.021, 0.01))#Based on the LASSO figure
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
##Change default dull line color of sns.boxplot (saturation parameter is for patch)
for line in p.get_lines():
    line.set_color('k')
##NOTE(review): depending on the matplotlib/seaborn versions, the boxes may be held in p.patches instead of p.artists -- confirm
for box in p.artists:
    box.set_edgecolor('k')
##Add background color: every row gets a band in its category color (same alpha for all rows)
for row_i, cat_color in enumerate(tempDF2['Color']):
    plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=cat_color, alpha=0.4, zorder=0)
plt.ylabel('')
plt.xlabel(r'$\beta$'+'-coefficient in '+method.lower()+' model\n [log-scaled BMI (kg m'+r'$^{-2}$'+') per s.d.]')
#Add legend (one patch per omics category)
tempL = [mpatches.Patch(facecolor=tempD2[category], edgecolor='k', label=category)
         for category in tempD1]
plt.legend(handles=tempL, fontsize='large',
           title='Omics category', title_fontsize='x-large',
           bbox_to_anchor=(-0.35, 0), loc='center right', borderaxespad=0)#Manual adjustment
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-bcoef_top'+str(topX)+'.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

## 6. Top 30 variables in RF models

> Because the RF model is not a linear model, it is not so simple to compare the predictor variables between LASSO and RF models.  
> –> Hence, only the top 30 variables that had the highest feature importance are checked.  

### 6-1. Import feature importance

> The feature importance obtained from sklearn.ensemble.RandomForestRegressor is impurity-based; i.e., the importance of a feature is computed as the (normalized) total reduction of the criterion (the mean squared error in this case) brought by that feature.  
> –> Hence, the sum of feature importances in all variables within the model becomes 1.  
>
> Of note, this impurity-based feature importance is known to be potentially misleading for high-cardinality features, and alternative indices such as permutation importance and SHAP (SHapley Additive exPlanations) feature importance would be better. In this study, however, the impurity-based feature importance is used because all variables are continuous (and because of a supplementary result).  

In [None]:
#Random forest: load the feature-importance table of each omics and print a short summary
print('Random forest models')
fileDir = './ImportData/'
ipynbName = '220828_Multiomics-BMI-NatMed1stRevision_BMI-baseline-RF-wenceslaus_'
tempD = {}
for bbmi in ('MetBMI', 'ProtBMI', 'ChemBMI', 'CombiBMI'):
    #Read the table and index it by variable name
    fileName = bbmi+'-BothSex-feature-importance.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('Variable')
    tempD[bbmi] = tempDF
    nVars = len(tempDF)

    #Summary printout
    print(bbmi+':')
    print(' - Variables:', nVars)
    ##Counts of variables above two cutoffs of the mean feature importance
    tempDF1 = tempDF.loc[tempDF['Mean'].gt(0.01)]
    print(' - Variables with the mean of feature importances > 0.01:', len(tempDF1),
          '(', len(tempDF1)/nVars*100, '%)')
    tempDF1 = tempDF.loc[tempDF['Mean'].gt(0.05)]
    print(' - Variables with the mean of feature importances > 0.05:', len(tempDF1),
          '(', len(tempDF1)/nVars*100, '%)')
    ##Column-wise sum over the columns whose name contains 'Model_'
    tempDF1 = tempDF.filter(like='Model_', axis=1)
    display(tempDF1.sum(axis=0))
    print('')

#Keep one named DataFrame per omics for the following cells
bcoefDF_met_RF, bcoefDF_prot_RF, bcoefDF_chem_RF, bcoefDF_combi_RF = (
    tempD['MetBMI'], tempD['ProtBMI'], tempD['ChemBMI'], tempD['CombiBMI'])

### 6-2. Metabolomics

In [None]:
#RF MetBMI: bar plot (with per-model points) of the feature importance of the top variables
bbmi = 'MetBMI'
bbmi_color = 'b'
method = 'RF'
topX = 30

#Keep the topX variables ranked by the mean feature importance (descending)
tempDF = bcoefDF_met_RF.copy().sort_values(by='Mean', ascending=False).iloc[:topX]
print('Visualize top', len(tempDF), 'variables')

#Long format for plotting: summary columns removed, values converted to percentage
tempDF = tempDF.drop(columns=['Mean', 'SD']).mul(100)
tempDF = pd.melt(tempDF.reset_index(), id_vars=['Variable'],
                 var_name='Model', value_name='importance')

#Plot: bars with 95% CI error bars, overlaid with the individual values
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
sns.barplot(x='importance', y='Variable', data=tempDF,
            color=bbmi_color, saturation=1, dodge=False,
            edgecolor='black', errcolor='black', errwidth=1.5, capsize=0.4, ci=95)
p = sns.stripplot(x='importance', y='Variable', data=tempDF,
                  jitter=0.3, dodge=False,
                  color='gray', edgecolor='black', linewidth=1, size=5, alpha=0.4)
p.set(xlim=(0, 32.5), xticks=np.arange(0, 30.1, 10))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
p.set_ylabel('')
p.set_xlabel('Feature importance in '+method+' model [%]')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-importance_top'+str(topX)+'.tif'
p.get_figure().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                       pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 6-3. Proteomics

In [None]:
#RF ProtBMI: bar plot (with per-model points) of the feature importance of the top variables
bbmi = 'ProtBMI'
bbmi_color = 'r'
method = 'RF'
topX = 30

#Keep the topX variables ranked by the mean feature importance (descending)
tempDF = bcoefDF_prot_RF.copy().sort_values(by='Mean', ascending=False).iloc[:topX]
print('Visualize top', len(tempDF), 'variables')

#Long format for plotting: summary columns removed, values converted to percentage
tempDF = tempDF.drop(columns=['Mean', 'SD']).mul(100)
tempDF = pd.melt(tempDF.reset_index(), id_vars=['Variable'],
                 var_name='Model', value_name='importance')

#Plot: bars with 95% CI error bars, overlaid with the individual values
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
sns.barplot(x='importance', y='Variable', data=tempDF,
            color=bbmi_color, saturation=1, dodge=False,
            edgecolor='black', errcolor='black', errwidth=1.5, capsize=0.4, ci=95)
p = sns.stripplot(x='importance', y='Variable', data=tempDF,
                  jitter=0.3, dodge=False,
                  color='gray', edgecolor='black', linewidth=1, size=5, alpha=0.4)
p.set(xlim=(0, 32.5), xticks=np.arange(0, 30.1, 10))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
p.set_ylabel('')
p.set_xlabel('Feature importance in '+method+' model [%]')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-importance_top'+str(topX)+'.tif'
p.get_figure().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                       pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 6-4. Clinical labs

In [None]:
#RF ChemBMI: bar plot (with per-model points) of the feature importance of the top variables
bbmi = 'ChemBMI'
bbmi_color = 'g'
method = 'RF'
topX = 30

#Keep the topX variables ranked by the mean feature importance (descending)
tempDF = bcoefDF_chem_RF.copy().sort_values(by='Mean', ascending=False).iloc[:topX]
print('Visualize top', len(tempDF), 'variables')

#Long format for plotting: summary columns removed, values converted to percentage
tempDF = tempDF.drop(columns=['Mean', 'SD']).mul(100)
tempDF = pd.melt(tempDF.reset_index(), id_vars=['Variable'],
                 var_name='Model', value_name='importance')

#Plot: bars with 95% CI error bars, overlaid with the individual values
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
sns.barplot(x='importance', y='Variable', data=tempDF,
            color=bbmi_color, saturation=1, dodge=False,
            edgecolor='black', errcolor='black', errwidth=1.5, capsize=0.4, ci=95)
p = sns.stripplot(x='importance', y='Variable', data=tempDF,
                  jitter=0.3, dodge=False,
                  color='gray', edgecolor='black', linewidth=1, size=5, alpha=0.4)
p.set(xlim=(0, 32.5), xticks=np.arange(0, 30.1, 10))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
p.set_ylabel('')
p.set_xlabel('Feature importance in '+method+' model [%]')
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-importance_top'+str(topX)+'.tif'
p.get_figure().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                       pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 6-5. Combined omics

In [None]:
#RF CombiBMI: bar plot (with per-model points) of the feature importance of the top variables,
#colored by the omics table in which each variable is found
tempDF = bcoefDF_combi_RF.copy()
topX = 30
method = 'RF'
#bbmi_color = 'm'
##Variable names per omics category (taken from the single-omics RF tables) and the category colors
tempD1 = {'Metabolomics':bcoefDF_met_RF.index.tolist(),
          'Proteomics':bcoefDF_prot_RF.index.tolist(),
          'Clinical labs':bcoefDF_chem_RF.index.tolist()}
tempD2 = {'Metabolomics':'b', 'Proteomics':'r', 'Clinical labs':'g'}
bbmi = 'CombiBMI'

#Top X variables based on feature importance
tempDF = tempDF.sort_values(by='Mean', ascending=False)
tempDF = tempDF.iloc[:topX]
print('Visualize top', len(tempDF), 'variables')

#Prepare DF for plot
tempDF = tempDF.drop(columns=['Mean', 'SD'])
tempDF = tempDF * 100#Convert to percentage
tempDF1 = tempDF.reset_index().melt(var_name='Model', value_name='importance', id_vars=['Variable'])

#Prepare category color
tempDF2 = pd.DataFrame(index=tempDF.index)
tempD3 = {category:set(variables) for category, variables in tempD1.items()}#Sets for fast membership tests
tempL1 = []
tempL2 = []
for row_n in tempDF2.index.tolist():
    hits = [category for category in tempD1 if row_n in tempD3[category]]
    ##Exactly one category is required per variable; otherwise the lists below would
    ##no longer line up with the rows of tempDF2 (wrong length or shifted colors)
    if len(hits)!=1:
        print('Check error!')
        raise ValueError('Variable '+repr(row_n)+' matched '+str(len(hits))
                         +' omics categories (expected exactly 1)')
    tempL1.append(hits[0])
    tempL2.append(tempD2[hits[0]])
tempDF2['Category'] = tempL1
tempDF2['Color'] = tempL2

#Plot
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(5, 10))
sns.barplot(data=tempDF1, y='Variable', order=tempDF2.index.tolist(), x='importance',
            palette=tempDF2['Color'], dodge=False, saturation=1,
            ci=95, capsize=0.4, errwidth=1.5, errcolor='black', edgecolor='black')
p = sns.stripplot(data=tempDF1, y='Variable', order=tempDF2.index.tolist(), x='importance',
                  dodge=False, jitter=0.3,
                  size=5, edgecolor='black', color='gray', linewidth=1, alpha=0.4)
#p.set(xlim=(0, 32.5), xticks=np.arange(0, 30.1, 10))#Fixed across omics
p.grid(axis='x', linestyle='--', color='black')
sns.despine()
##Add background color: every row gets a band in its category color (same alpha for all rows)
for row_i, cat_color in enumerate(tempDF2['Color']):
    plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=cat_color, alpha=0.4, zorder=0)
plt.margins(y=0)
plt.ylabel('')
plt.xlabel('Feature importance in '+method+' model [%]')
#Add legend (one patch per omics category)
tempL = [mpatches.Patch(facecolor=tempD2[category], edgecolor='k', label=category)
         for category in tempD1]
plt.legend(handles=tempL, fontsize='large',
           title='Omics category', title_fontsize='x-large',
           bbox_to_anchor=(1, 0), loc='lower right', borderaxespad=0.25)#Manual adjustment
##Save
fileDir = './ExportFigures/'
ipynbName = '220901_Multiomics-BMI-NatMed1stRevision_bBMI-LASSO-vs-others_'
fileName = method+'-'+bbmi+'-importance_top'+str(topX)+'.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

# — End of this notebook —