# Multiomics BMI Paper — ∆BMI-based Misclassification in the TwinsUK Cohort

***by Kengo Watanabe***  

This Jupyter Notebook (with Python 3 kernel) validated the findings about ∆BMI-based misclassification (i.e., BMI class vs. biological BMI class) with the TwinsUK cohort.  

Input files:  
* TwinsUK covariates: 220916_Multiomics-BMI-NatMed1stRevision_TwinsUK-DataCleaning-ver3_general-data_final.tsv  
* TwinsUK MetBMI predictions: 220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_MetBMI-BothSex-TwinsUK.tsv  
* TwinsUK obesity-related features (preprocessed): 220919_Multiomics-BMI-NatMed1stRevision_TwinsUK-Preprocessing-ver2_preprocessed-phenotype-dataset.tsv  
* Arivale baseline MetBMI predictions (full panel version): 220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_biologicalBMI-baseline-summary-BothSex.tsv  
* Arivale baseline MetBMI predictions (restricted panel version): 220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_MetBMI-BothSex-Arivale.tsv  
* TwinsUK metabolic health condition: 220720_Multiomics-BMI-NatMedRevision_Misclassification_metabolic-health-summary.tsv  
* Arivale baseline metabolic health condition: 220720_Multiomics-BMI-NatMedRevision_Misclassification_metabolic-health-summary.tsv  

Output figures and tables:  
* Supplementary Figure 6  
* Tables for Supplementary Data 6, 10  

Original notebook (memo for my future tracing):  
* dalek:\[JupyterLab HOME\]/220621_Multiomics-BMI-NatMedRevision/220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2.ipynb  

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#For Arial font
#!conda install -c conda-forge -y mscorefonts
##-> The below was also needed in matplotlib 3.4.2
#import shutil
#import matplotlib
#shutil.rmtree(matplotlib.get_cachedir())
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import time

from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
from statsmodels.stats import multitest as multi
from decimal import Decimal, ROUND_HALF_UP

!conda list

# packages in environment at /opt/conda/envs/arivale-py3:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
analytics                 0.1                      pypi_0    pypi
argon2-cffi               21.1.0           py39h3811e60_0    conda-forge
arivale-data-interface    0.1.0                    pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
atk-1.0                   2.36.0               h3371d22_4    conda-forge
attrs                     21.2.0             pyhd8ed1ab_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports                 1.0                        py_2    conda-forge
backports.functools_lru_cache 1.6.4              pyhd8ed1ab_0    conda-forge
biopython                 1.79             py39h3811e60_0    conda-forge
bleach 

## 1. Prepare datasets

### 1-1. ∆BMI-derived misclassification and covariates

In [None]:
#Load the cleaned TwinsUK covariate table and index it by KeyIndex
fileDir = './ExportData/'
ipynbName = '220916_Multiomics-BMI-NatMed1stRevision_TwinsUK-DataCleaning-ver3_'
fileName = 'general-data_final.tsv'
bmiDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t',
                    parse_dates=['VisitDate']).set_index('KeyIndex')
tempDF = bmiDF

display(bmiDF)
#Sanity check: count distinct keys, and distinct values of the part before the first '_'
print(' - Unique KeyIndex:', bmiDF.index.nunique(dropna=False))
tempS = bmiDF.index.to_series().str.split(pat='_', expand=True).iloc[:, 0]
print(' - Unique participant:', tempS.nunique(dropna=False))

In [None]:
#Attach the MetBMI prediction to the covariate table, derive the MetBMI-based
#class, flag BMI class vs. MetBMI class mismatches, and save the combined table.
tempDF = bmiDF
bbmi = 'MetBMI'

#Add bBMI prediction
##Left join on KeyIndex: rows without a prediction keep NaN in the bBMI column
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = bbmi+'-BothSex-TwinsUK.tsv'
tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t')
tempDF1 = tempDF1.set_index('KeyIndex')
tempDF = pd.merge(tempDF, tempDF1[bbmi], left_index=True, right_index=True, how='left')

#Add bBMI-based class
##Vectorized equivalent of the if/elif chain: np.select uses the first condition that holds,
##so the order of the conditions matters (NaN first, then ascending cutoffs)
tempA = tempDF[bbmi].to_numpy(dtype='float64')
tempL1 = [np.isnan(tempA), tempA < 18.5, tempA < 25, tempA < 30, tempA >= 30]
tempL2 = ['NotCalculated', 'Underweight', 'Normal', 'Overweight', 'Obese']
tempDF[bbmi+'_class'] = np.select(tempL1, tempL2, default='Error?')#Default: just in case
##Check
tempL = []
for bmi in ['BMI', bbmi]:
    ##Name the Series explicitly because the default name given by value_counts()
    ##depends on the pandas version (column name before 2.0, 'count' from 2.0)
    tempS1 = tempDF[bmi+'_class'].value_counts().rename(bmi+'_class')
    tempS2 = (tempS1 / len(tempDF) * 100).rename(bmi+'_class [%]')
    tempDF1 = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='outer')
    tempL.append(tempDF1)
tempDF1 = pd.concat(tempL, axis=1)
display(tempDF1)

#Misclassification
##Row-wise comparison of the two class labels; a missing label never equals anything,
##so 'NotCalculated' or NaN rows are labeled 'Mismatched' (same as the element-wise ==)
tempL = [bbmi]
for bbmi in tempL:
    tempS = tempDF['BMI_class'].eq(tempDF[bbmi+'_class'])
    tempDF['vs_'+bbmi+'_class'] = np.where(tempS, 'Matched', 'Mismatched')

display(tempDF)
print('NaN in DF:', tempDF.isnull().to_numpy().sum(axis=None))
tempS = tempDF.groupby(by='BMI_class')['vs_'+bbmi+'_class'].value_counts()
display(tempS)

#Save
fileDir = './ExportData/'
ipynbName = '220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2_'
fileName = 'general-data-with-'+bbmi+'-BothSex-TwinsUK.tsv'
tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Update
bmiDF = tempDF

In [None]:
tempDF = bmiDF
bbmi = 'MetBMI'

#Check BMI
##Summary statistics of BMI for the matched vs. mismatched group within each BMI class
tempS = 'vs_'+bbmi+'_class'
for bmi_class in ('Normal', 'Obese'):
    print(bmi_class)
    tempDF1 = tempDF[tempDF['BMI_class'].eq(bmi_class)]
    tempDF2 = tempDF1.groupby(tempS)['BMI'].describe()
    display(tempDF2)

> –> It would be safer to adjust for the baseline BMI in the statistical tests.  

### 1-2. Obesity-related features

> Based on the data availability, the following 12 features are used.  
> * HDL-cholesterol  
> * LDL-cholesterol  
> * Triglycerides  
> * Glucose  
> * Insulin  
> * HOMA-IR  
> –> Note that the above features were also included for the model of standard clinical measures.  
> * ~~Glycohemoglobin (HbA1c)~~  
> * High-sensitivity CRP  
> * Adiponectin  
> * ~~Vitamin D (25(OH)D)~~  
> –> Note that all the above features were used as the obesity-related health markers in the Arivale analysis.  
> * Systolic blood pressure  
> * Diastolic blood pressure  
> –> Note that these two features were used as the BMI-associated physiological measures in the Arivale analysis.  
> * DEXA total fat percentage  
> * DEXA android-to-gynoid ratio  

In [None]:
#Load the preprocessed phenotype table and index it by KeyIndex
fileDir = './ExportData/'
ipynbName = '220919_Multiomics-BMI-NatMed1stRevision_TwinsUK-Preprocessing-ver2_'
fileName = 'preprocessed-phenotype-dataset.tsv'
featureDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('KeyIndex')
tempDF = featureDF

display(featureDF)
#Total number of missing cells in the table
print('NaN in DF:', featureDF.isna().to_numpy().sum(axis=None))

In [None]:
tempDF = featureDF

#Display labels for the target features (FeatureID -> FeatureLabel)
tempD = dict([
    ('HDL-cholesterol', 'HDL-cholesterol'),
    ('LDL-cholesterol', 'LDL-cholesterol'),
    ('Triglycerides', 'Triglycerides'),
    ('Glucose', 'Glucose'),
    ('Insulin', 'Insulin'),
    ('HOMA-IR', 'HOMA-IR'),
    ('Hs-CRP', 'Hs-CRP'),
    ('Adiponectin', 'Adiponectin'),
    ('SystolicBP', 'Systolic BP'),
    ('DiastolicBP', 'Diastolic BP'),
    ('DEXA-TotalFatPercentage', 'Percent total fat'),
    ('DEXA-AndroidGynoidRatio', 'Android-to-gynoid'),
])
tempS = pd.Series(tempD, name='FeatureLabel').rename_axis('FeatureID')

#Descriptive statistics per feature (not necessary but just for making DF)
tempDF1 = tempDF.describe().T.rename_axis('FeatureID')

#Left-merge on FeatureID so that the rows follow the label table
featureDF_meta = pd.merge(tempS, tempDF1, left_index=True, right_index=True, how='left')
tempDF1 = featureDF_meta

display(featureDF_meta)

## 2. Regression analysis for the obesity-related features

> Because the datasets are almost ready for use, only the simple processing steps are required for OLS linear regression:  
> * Missingness: To maximize the sample size for each regression, rows with NaN are dropped after each feature is selected.  
> * Centering: standardization is applied to both dependent and independent variables (including covariates).  
>
> Hence, the remaining processing steps are implemented during the for-loop for each regression. Of note, because the feature values were already preprocessed to reduce skewness through the data cleaning steps, OLS linear regression (i.e., GLM with Gaussian family) can be used simply.  

### 2-1. Perform OLS linear regression

> Model: Feature ~ b0 + b1\*C(Misclassification) + b2\*BMI + b3\*C(Sex) + b4\*Age  
> Main aim: Assess the difference in each feature between the matched and mismatched BMI class.  

> In this TwinsUK analysis, ancestry PCs are NOT included as the covariates due to data availability. Also, because only the MetBMI is available, the reference P-value adjustment 2 (across BMI classes and features within each bBMI) is skipped.  

In [None]:
#For every feature column and every BMI class in tempL1, fit
#  Feature ~ C(vs_<bBMI>_class) + BMI + C(Sex) + Age
#on z-scored data, collect the statistics of the Mismatched term, adjust the P-values with the
#Benjamini-Hochberg method (within each feature, and across all tests), and save the summary table.
tempDF1 = featureDF
tempDF2 = bmiDF
tempL1 = ['Normal', 'Obese']
tempL2 = ['MetBMI']
tempDF3 = featureDF_meta

t_start = time.time()
tempD1 = {}#feature -> summary DF (one row per BMI class)
for feature in tempDF1.columns.tolist():
    tempD2 = {}#BMI class -> one-row summary DF
    for bmi_class in tempL1:
        #Processing for OLS linear regression
        ##Gather all necessary variables into a single DF
        tempS = tempDF1[feature]
        tempDF = pd.merge(tempS, tempDF2, left_index=True, right_index=True, how='left')
        ##Select the target participants
        tempDF = tempDF.loc[tempDF['BMI_class']==bmi_class]
        ##Drop NaN in the feature values
        ###NOTE: dropna() is called without `subset`, so a row with NaN in ANY merged column
        ###(the feature or any column of bmiDF) is removed
        tempDF = tempDF.dropna()
        ##Z-score transformation
        ###The scaler is fitted on the rows of the current BMI class only, for every numeric column
        tempDF4 = tempDF.select_dtypes(include=[np.number])
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        tempA = scaler.fit_transform(tempDF4)#Column direction
        tempDF4 = pd.DataFrame(data=tempA, index=tempDF4.index, columns=tempDF4.columns)
        ###Recover the categorical variables
        tempDF5 = tempDF.select_dtypes(exclude=[np.number])
        tempDF = pd.merge(tempDF4, tempDF5, left_index=True, right_index=True, how='left')
        ##Rename the dependent variable
        tempDF = tempDF.rename(columns={feature:'Feature'})
        ##Add a constant for the intercept
        ###–> In statsmodels, a constant is automatically added as well as R!
        
        tempD3 = {}#bBMI -> result Series
        for bbmi in tempL2:
            #Processing specific to each bBMI
            ##Sort to make bcoef = 0 and 1 for Matched and Mismatched
            ###NOTE(review): the results below are read from the '[T.Mismatched]' term, i.e. 'Matched'
            ###is expected to be the reference level — confirm whether that depends on this sort
            tempDF = tempDF.sort_values(by='vs_'+bbmi+'_class', ascending=True)
            ##One-hot encoding for categorical covariates
            ###–> In statsmodels, categorical variables are automatically recognized!
            
            #OLS linear regression
            ##Fit univariate model
            formula = 'Feature ~ C(vs_'+bbmi+'_class)'
            fit_res1 = smf.ols(formula, data=tempDF).fit()
            ##Fit full model
            formula = 'Feature ~ C(vs_'+bbmi+'_class)'\
                '+ BMI + C(Sex) + Age'
            fit_res2 = smf.ols(formula, data=tempDF).fit()
            
            #Summarize the result
            tempS = pd.Series().astype('float64')
            ##Save the sample size for each group
            size1 = len(tempDF.loc[tempDF['vs_'+bbmi+'_class']=='Matched'])
            size2 = len(tempDF.loc[tempDF['vs_'+bbmi+'_class']=='Mismatched'])
            tempS.loc['vs'+bbmi+'class_nMatched'] = size1
            tempS.loc['vs'+bbmi+'class_nMismatched'] = size2
            ##Save R2 [%]
            tempS.loc['vs'+bbmi+'class_UnivarR2'] = fit_res1.rsquared*100
            tempS.loc['vs'+bbmi+'class_R2'] = fit_res2.rsquared*100
            ##Save beta-coefficient of the target variable
            tempS.loc['vs'+bbmi+'class_Bcoef'] = fit_res2.params['C(vs_'+bbmi+'_class)[T.Mismatched]']
            tempS.loc['vs'+bbmi+'class_BcoefSE'] = fit_res2.bse['C(vs_'+bbmi+'_class)[T.Mismatched]']
            ##Save t-statistic of the target variable
            tempS.loc['vs'+bbmi+'class_tStat'] = fit_res2.tvalues['C(vs_'+bbmi+'_class)[T.Mismatched]']
            ##Save P-value of the target variable
            tempS.loc['vs'+bbmi+'class_Pval'] = fit_res2.pvalues['C(vs_'+bbmi+'_class)[T.Mismatched]']
            ##Add dummy adjusted P-value rows for now
            ###These placeholders fix the column order; they are overwritten after the adjustments below
            tempS.loc['vs'+bbmi+'class_AdjPval_within1'] = 1.0
            #tempS.loc['vs'+bbmi+'class_AdjPval_within2'] = 1.0
            tempS.loc['vs'+bbmi+'class_AdjPval_all'] = 1.0
            
            tempD3[bbmi] = tempS
        
        #Clean the results (pd.Series) across bBMIs
        ##Prepare common summary metrics: sample size, residual degrees of freedom
        tempS1 = pd.Series().astype('float64')
        tempS1.loc['N'] = len(tempDF)
        tempS1.loc['DoF'] = int(fit_res2.df_resid)#Use the last result object but same b/w bBMIs
        ##Combine each result
        tempS2 = pd.concat(list(tempD3.values()), axis=0)
        tempS = pd.concat([tempS1, tempS2], axis=0)
        ##Convert to DF while transposing
        tempDF = pd.DataFrame(tempS.to_dict(), index=[0])
        ##Clean DF
        ###The Series was float64, so the count columns are cast back to integers
        tempDF['N'] = tempDF['N'].astype('int64')
        tempDF['DoF'] = tempDF['DoF'].astype('int64')
        tempL = tempDF.loc[:, tempDF.columns.str.contains('_nM.*ed')].columns.tolist()
        for col_n in tempL:
            tempDF[col_n] = tempDF[col_n].astype('int64')
        tempDF['BMIclass'] = bmi_class
        
        tempD2[bmi_class] = tempDF
    
    #Clean the results (pd.DataFrame) across BMI classes
    tempDF = pd.concat(list(tempD2.values()), axis=0)
    tempDF['FeatureID'] = feature
    
    #P-value adjustment (across BMI classes and bBMIs within the feature) by using Benjamini–Hochberg method
    ##'_Pval' matches only the nominal P-value columns: in the '..._AdjPval_...' columns,
    ##'Pval' is not directly preceded by an underscore
    tempL = tempDF.loc[:, tempDF.columns.str.contains('_Pval')].columns.tolist()
    tempDF4 = tempDF.reset_index().melt(var_name='bBMI', value_name='Pval', value_vars=tempL,
                                        id_vars=['BMIclass', 'FeatureID'])
    tempDF4['AdjPval'] = multi.multipletests(tempDF4['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
    tempDF4 = tempDF4.pivot(index=['BMIclass', 'FeatureID'], columns='bBMI', values='AdjPval')
    tempDF4.columns = tempDF4.columns.str.replace('_Pval', '_AdjPval_within1')
    ##Replace the dummy values with the adjusted p-values
    tempL = [(bmi_class, feature) for bmi_class in tempL1]
    tempDF4 = tempDF4.loc[tempL]#Sort just in case
    tempL = tempDF.loc[:, tempDF.columns.str.contains('_AdjPval_within1')].columns.tolist()
    for col_n in tempL:
        ###Positional assignment (.tolist()): tempDF is indexed 0, 0, ... while tempDF4 has a MultiIndex;
        ###both follow the order of tempL1
        tempDF[col_n] = tempDF4[col_n].tolist()
    
    tempD1[feature] = tempDF
t_elapsed = time.time() - t_start
print('Elapsed time for',
      len(tempDF1.columns)*len(tempL1)*len(tempL2), 'OLS linear regressions (',
      len(tempDF1.columns), 'features x',
      len(tempL1), 'BMI classes x',
      len(tempL2), 'bBMIs):',
      round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')

#Clean the results (pd.DataFrame) across features
tempDF = pd.concat(list(tempD1.values()), axis=0)
##Add the display label for visualization etc.
tempS = tempDF3['FeatureLabel']
tempDF = pd.merge(tempS, tempDF, on='FeatureID', how='right')
##Clean the column order by setting index
tempDF = tempDF.set_index(['BMIclass', 'FeatureID'])

#P-value adjustment (across BMI classes and features within each bBMI) by using Benjamini–Hochberg method
#for bbmi in tempL2:
#    tempDF['vs'+bbmi+'class_AdjPval_within2'] = multi.multipletests(tempDF['vs'+bbmi+'class_Pval'],
#                                                                    alpha=0.05, method='fdr_bh',
#                                                                    is_sorted=False, returnsorted=False)[1]

#P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempL = tempDF.loc[:, tempDF.columns.str.contains('_Pval')].columns.tolist()
tempDF4 = tempDF.reset_index().melt(var_name='bBMI', value_name='Pval', value_vars=tempL,
                                    id_vars=['BMIclass', 'FeatureID'])
tempDF4['AdjPval'] = multi.multipletests(tempDF4['Pval'], alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
tempDF4 = tempDF4.pivot(index=['BMIclass', 'FeatureID'], columns='bBMI', values='AdjPval')
tempDF4.columns = tempDF4.columns.str.replace('_Pval', '_AdjPval_all')
##Replace the dummy values with the adjusted p-values
tempL = tempDF.loc[:, tempDF.columns.str.contains('_AdjPval_all')].columns.tolist()
for col_n in tempL:
    ###Label-aligned assignment: both frames are indexed by (BMIclass, FeatureID) here
    tempDF[col_n] = tempDF4[col_n]

tempDF = tempDF.sort_index(axis=0, ascending=True, key=lambda x:x.str.lower())
display(tempDF)

#Save
fileDir = './ExportData/'
ipynbName = '220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2_'
fileName = 'regression-summary_feature.tsv'
tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

resDF = tempDF

### 2-2. Significantly different features by misclassification

In [None]:
tempDF = resDF
tempL1 = ['Normal', 'Obese']
tempL2 = ['MetBMI']

#Columns for the summary displays: beta-coefficient columns first, then all P-value columns
##Row subsets of the result table keep its columns, so the list is prepared once
tempL5 = (tempDF.columns[tempDF.columns.str.contains('Bcoef$')].tolist()
          + tempDF.columns[tempDF.columns.str.contains('Pval')].tolist())

#Significantly different features
print('Significantly different features by misclassification (FDR < 0.05)')
tempD = {}
for bmi_class in tempL1:
    print(' - '+bmi_class)
    tempDF1 = tempDF.loc[bmi_class]#MultiIndex -> rows of this BMI class, indexed by FeatureID
    tempL = []
    for bbmi in tempL2:
        #Extract significant features (adjustment across all tests)
        tempS1 = tempDF1['vs'+bbmi+'class_AdjPval_all'] < 0.05
        tempL.append(tempS1.index[tempS1.to_numpy()].tolist())
        
        #Cf. Adjustment across BMI classes and bBMIs within the feature
        tempS2 = tempDF1['vs'+bbmi+'class_AdjPval_within1'] < 0.05
        #Cf. Nominal P-value
        tempS3 = tempDF1['vs'+bbmi+'class_Pval'] < 0.05
        
        print('   - '+bbmi+':', int(tempS1.sum()),
              '(cf. within-adjustment 1:', int(tempS2.sum()),
              ', nominal P-value:', int(tempS3.sum()), ')')
    #Flatten and drop duplicates
    tempL = list(set(row_n for sublist in tempL for row_n in sublist))
    
    print('   - Union:', len(tempL))
    tempD[bmi_class] = tempL
    
    #Clean (just for the display in Jupyter notebook)
    tempDF1 = tempDF1.loc[tempL][tempL5]
    tempDF1 = tempDF1.sort_values(by='vsMetBMIclass_Pval', ascending=True)
    display(tempDF1)

#Flatten and drop duplicates
tempL = list(set(row_n for sublist in tempD.values() for row_n in sublist))
print(' - Union:', len(tempL))
#Show the union features for every BMI class
tempL = [(bmi_class, feature) for bmi_class in tempL1 for feature in tempL]
#Clean (just for the display in Jupyter notebook)
tempDF1 = tempDF.loc[tempL][tempL5]
tempDF1 = tempDF1.sort_values(by='vsMetBMIclass_Pval', ascending=True)
display(tempDF1)

### 2-3. Visualization

In [None]:
#For each labeled feature, draw one figure with 1+len(tempD3) panels:
#  - left panel: z-scored feature values per BMI class (all classes)
#  - other panels: matched vs. mismatched groups within the BMI classes of tempL1,
#    annotated with the P-value adjusted across all tests
#and save it as a TIFF file.
tempD1 = featureDF_meta['FeatureLabel'].to_dict()#FeatureID -> display label
tempD2 = {'Underweight':'blue', 'Normal':'green', 'Overweight':'orange', 'Obese':'red'}#BMI class -> color
tempD3 = {'MetBMI':'b'}#bBMI -> color of the mismatched group
tempD4 = {'HDL-cholesterol':'tab:blue',
          'LDL-cholesterol':'tab:red',
          'Triglycerides':'tab:red',
          'Glucose':'0.8',
          'Insulin':'0.8',
          'HOMA-IR':'tab:red',
          'Hs-CRP':'tab:red',
          'Adiponectin':'0.8',
          'SystolicBP':'0.8',
          'DiastolicBP':'0.8',
          'DEXA-TotalFatPercentage':'tab:red',
          'DEXA-AndroidGynoidRatio':'tab:red'}#FeatureID -> background color of the figure title
tempL1 = ['Normal', 'Obese']
tempDF1 = featureDF
tempDF2 = bmiDF
tempDF3 = resDF

for feature_i, feature in enumerate(tempD1.keys()):
    #Prepare DF
    tempS = tempDF1[feature]
    tempDF = pd.merge(tempS, tempDF2, left_index=True, right_index=True, how='left')
    ##Drop NaN in the feature values
    ###dropna() has no `subset`: rows with NaN in any merged column are removed
    tempDF = tempDF.dropna()
    ##Z-score transformation (based on the whole distribution)
    tempDF4 = tempDF.select_dtypes(include=[np.number])
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    tempA = scaler.fit_transform(tempDF4)#Column direction
    tempDF4 = pd.DataFrame(data=tempA, index=tempDF4.index, columns=tempDF4.columns)
    ###Recover the categorical variables
    tempDF5 = tempDF.select_dtypes(exclude=[np.number])
    tempDF = pd.merge(tempDF4, tempDF5, left_index=True, right_index=True, how='left')
    
    #Check sample size
    print(tempD1[feature])
    print('N (total):', len(tempDF))
    print(' - BMI class:', tempDF['BMI_class'].value_counts().sort_index(ascending=True).to_dict())
    for bmi_class in tempL1:
        print('   - '+bmi_class+' BMI class')
        for bbmi in tempD3.keys():
            tempS = tempDF['vs_'+bbmi+'_class'].loc[tempDF['BMI_class']==bmi_class]
            print('     - vs. '+bbmi+' class:',
                  tempS.value_counts().sort_index(ascending=True).to_dict())
    
    #Visualization
    sns.set(style='ticks', font='Arial', context='talk')
    ##One equally wide panel per plot (overall + each bBMI)
    fig, axes = plt.subplots(nrows=1, ncols=1+len(tempD3),
                             figsize=(3.5, 3), sharex=False, sharey=True,
                             gridspec_kw={'width_ratios':[1]*(1+len(tempD3))})
    axis_ymin = -4.4
    axis_ymax = 4.4
    ymin = -4
    ymax = 3
    yinter = 1
    margin = 0.49
    #Set shared axis range
    plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
    tempL = []#For legend handles and labels
    for ax_i, ax in enumerate(axes.flat):
        if ax_i==0:#Overall
            sns.boxplot(data=tempDF, y=feature, x='BMI_class', order=tempD2.keys(),
                        hue='BMI_class', hue_order=tempD2.keys(), dodge=False, palette=tempD2,
                        showfliers=False,#flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                        showcaps=True, notch=True, ax=ax)
            tempL.append(ax.get_legend_handles_labels())
        else:#Misclassification
            bbmi = list(tempD3.keys())[ax_i-1]
            tempDF4 = tempDF.loc[tempDF['BMI_class'].isin(tempL1)]
            tempD = {'Matched':'0.8', 'Mismatched':tempD3[bbmi]}#Hue level -> color
            sns.boxplot(data=tempDF4, y=feature, x='BMI_class', order=tempL1,
                        hue='vs_'+bbmi+'_class', hue_order=tempD.keys(), dodge=True, palette=tempD,
                        showfliers=False,#flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                        showcaps=True, notch=True, ax=ax)
            tempL.append(ax.get_legend_handles_labels())
        #Axis settings
        if ax_i==0:
            plt.setp(ax, xlim=(0-margin, len(tempD2)-1+margin))#To eliminate excess white space
            plt.setp(ax, xlabel='', ylabel='Transformed value [a.u.]\n('+r'$Z$'+'-score)')
        else:
            ##The x-axis categories of this panel are the BMI classes in tempL1 (not the hue levels)
            plt.setp(ax, xlim=(0-margin, len(tempL1)-1+margin))#To eliminate excess white space
            plt.setp(ax.get_yticklabels(), visible=False)
            plt.setp(ax, xlabel='', ylabel='')
        #ax.grid(axis='y', linestyle='--', color='black')
        sns.despine()
        plt.setp(ax.get_xticklabels(), rotation=70,
                 horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
        #Annotation
        ##The lines are captured before the annotation lines are added, so only boxplot lines are listed
        lines = ax.get_lines()#Line2D: [[Q1, Q1-1.5IQR], [Q3, Q3+1.5IQR], [Q1, Q1], [Q3, Q3], [Med, Med], [flier]]
        if ax_i!=0:
            #P-value annotation
            lines_unit = 5 + int(False)#showfliers=False
            for class_i in range(len(tempL1)):
                ##Each BMI class holds len(tempD) boxes; index +1 within a box is its upper whisker
                #Matched
                whisker_0 = lines[class_i*lines_unit*len(tempD) + lines_unit*0 + 1]
                xcoord_0 = whisker_0._x[1]#Q3+1.5IQR
                ycoord_0 = whisker_0._y[1]#Q3+1.5IQR
                #Mismatched
                whisker_1 = lines[class_i*lines_unit*len(tempD) + lines_unit*1 + 1]
                xcoord_1 = whisker_1._x[1]#Q3+1.5IQR
                ycoord_1 = whisker_1._y[1]#Q3+1.5IQR
                #Standard point for annotation
                xcoord = (xcoord_0+xcoord_1)/2
                ycoord = max(ycoord_0, ycoord_1)
                #Add annotation lines
                aline_offset = yinter/5
                aline_length = yinter/5 + aline_offset/2
                ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                        [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                        lw=1.5, c='k')
                #Retrieve P-value
                bmi_class = tempL1[class_i]
                pval = tempDF3.loc[(bmi_class, feature), 'vs'+bbmi+'class_AdjPval_all']
                if pval<0.001:
                    label = '***'
                elif pval<0.01:
                    label = '**'
                elif pval<0.05:
                    label = '*'
                else:
                    ##Show non-significant values rounded half up to two decimals
                    pval_text = str(Decimal(pval).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
                    label = r'$P$'+' = '+pval_text
                #Add annotation text
                if label in ['***', '**', '*']:
                    text_offset = yinter/12
                    text_size = 'medium'
                else:
                    text_offset = yinter/3
                    text_size = 'x-small'
                ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                            horizontalalignment='center', verticalalignment='bottom',
                            fontsize=text_size, color='k')
        #Facet settings
        if ax_i==0:
            ax.set_title('Overall', {'fontsize':'medium'})
        else:
            ax.set_title(bbmi, {'fontsize':'medium'})
            xoff = 0.025
            yoff = 0.01
            rect = plt.Rectangle((xoff, 1+yoff), 1-xoff, 0.15,#Manual adjustment
                                 transform=ax.transAxes, facecolor=tempD3[bbmi], alpha=0.3,
                                 clip_on=False, linewidth=0, zorder=0.5)
            ax.add_patch(rect)
        #Change the default boxplot settings
        for line in lines:
            line.set_color('k')
        ##NOTE(review): whether the boxes are found in ax.artists may depend on the
        ##matplotlib/seaborn version — confirm that the box edges are black in the output
        for box in ax.artists:
            box.set_edgecolor('k')
        #Remove the default legend
        ax.get_legend().remove()
        #Save ax position for figure title
        if ax_i==0:
            ax_pos_l = ax.get_position().bounds
        elif ax_i==len(tempD3):
            ax_pos_r = ax.get_position().bounds
    #Add legend manually
    for legend_i in range(len(tempL)):
        h, l = tempL[legend_i]
        if legend_i==0:
            title_text = 'BMI class (overall)'
            position = (0.95, 0.9)
        else:
            bbmi =list(tempD3.keys())[legend_i-1]
            title_text = 'vs. '+bbmi+' class'
            position = (0.95, 0.3)
        legend = fig.legend(handles=h, labels=l, fontsize='medium',
                            title=title_text, title_fontsize='medium',
                            bbox_to_anchor=position, loc='upper left',
                            labelspacing=0.25, handletextpad=0.5,
                            borderaxespad=0.0, frameon=False)
        plt.gca().add_artist(legend)
    #Add figure title
    ##Centered over the span from the left edge of the first panel to the right edge of the last panel
    xcoord_0 = ax_pos_l[0]
    xcoord_1 = ax_pos_r[0]+ax_pos_r[2]
    ycoord = ax_pos_l[1]+ax_pos_l[3]
    yoff = 0.15
    ##White text on the colored backgrounds, black text otherwise
    if tempD4[feature] in ['tab:red', 'tab:blue']:
        text_color = 'white'
    else:
        text_color = 'black'
    fig.suptitle(tempD1[feature], x=(xcoord_0+xcoord_1)/2, y=ycoord+yoff,
                 fontsize='large', fontweight='bold', color=text_color,
                 horizontalalignment='center', verticalalignment='bottom')
    yoff = yoff - 0.015#Manual adjustment
    rect = plt.Rectangle((xcoord_0, ycoord+yoff), xcoord_1-xcoord_0, 0.125,#Manual adjustment
                         transform=fig.transFigure, facecolor=tempD4[feature],
                         clip_on=False, linewidth=0, zorder=0)
    fig.patches.extend([rect])
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2_'
    fileName = 'Feature'+str(feature_i+1).zfill(2)+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    print('')

## 3. ∆BMI-derived misclassification rate

### 3-1. Prepare the misclassification in Arivale

> To directly compare between Arivale and TwinsUK, MetBMI calculated from the restricted version is prioritized for Arivale. At the same time, MetBMI calculated from the full version is also prepared as reference; in fact, the final figure has no P-value and thus doesn't require P-value adjustment.  

In [None]:
#Load the Arivale MetBMI predicted by the restricted version model, derive the BMI and
#MetBMI-based classes, and flag BMI class vs. MetBMI class mismatches.
bbmi = 'MetBMI'

print('Restricted version:')
#Import the Arivale bBMI predicted by restricted version model
##public_client_id is read as a string
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = bbmi+'-BothSex-Arivale.tsv'
tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
tempDF = tempDF.set_index('public_client_id')

#Add BMI and bBMI-based class
##Vectorized equivalent of the if/elif chain: np.select uses the first condition that holds,
##so the order of the conditions matters (NaN first, then ascending cutoffs)
tempL2 = ['NotCalculated', 'Underweight', 'Normal', 'Overweight', 'Obese']
for bmi in ['BMI', bbmi]:
    tempA = tempDF[bmi].to_numpy(dtype='float64')
    tempL1 = [np.isnan(tempA), tempA < 18.5, tempA < 25, tempA < 30, tempA >= 30]
    tempDF[bmi+'_class'] = np.select(tempL1, tempL2, default='Error?')#Default: just in case
##Check
tempL = []
for bmi in ['BMI', bbmi]:
    ##Name the Series explicitly because the default name given by value_counts()
    ##depends on the pandas version (column name before 2.0, 'count' from 2.0)
    tempS1 = tempDF[bmi+'_class'].value_counts().rename(bmi+'_class')
    tempS2 = (tempS1 / len(tempDF) * 100).rename(bmi+'_class [%]')
    tempDF1 = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='outer')
    tempL.append(tempDF1)
tempDF1 = pd.concat(tempL, axis=1)
display(tempDF1)

#Misclassification
##Row-wise comparison of the two class labels; rows whose labels differ
##(including a 'NotCalculated' label on one side only) are labeled 'Mismatched'
tempL = [bbmi]
for bbmi in tempL:
    tempS = tempDF['BMI_class'].eq(tempDF[bbmi+'_class'])
    tempDF['vs_'+bbmi+'_class'] = np.where(tempS, 'Matched', 'Mismatched')

display(tempDF)
print('NaN in DF:', tempDF.isnull().to_numpy().sum(axis=None))
tempS = tempDF.groupby(by='BMI_class')['vs_'+bbmi+'_class'].value_counts()
display(tempS)

bmiDF_a_restricted = tempDF

In [None]:
#Load the Arivale baseline summary predicted by the full version model (which already
#contains the class columns) and flag BMI class vs. MetBMI class mismatches.
bbmi = 'MetBMI'

print('Full version:')
#Import the Arivale bBMI predicted by full version model
##public_client_id is read as a string
fileDir = './ExportData/'
ipynbName = '220803_Multiomics-BMI-NatMed1stRevision_DeltaBMI-misclassification_'
fileName = 'biologicalBMI-baseline-summary-BothSex.tsv'
tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
tempDF = tempDF.set_index('public_client_id')

#Change column names
##Remove 'Base' from the column names so that the '<BMI>_class' lookups below work
tempDF.columns = tempDF.columns.str.replace('Base', '')

#Check
tempL = []
for bmi in ['BMI', bbmi]:
    ##Name the Series explicitly because the default name given by value_counts()
    ##depends on the pandas version (column name before 2.0, 'count' from 2.0)
    tempS1 = tempDF[bmi+'_class'].value_counts().rename(bmi+'_class')
    tempS2 = (tempS1 / len(tempDF) * 100).rename(bmi+'_class [%]')
    tempDF1 = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='outer')
    tempL.append(tempDF1)
tempDF1 = pd.concat(tempL, axis=1)
display(tempDF1)

#Misclassification
##Row-wise comparison of the two class labels; a missing label never equals anything,
##so such rows are labeled 'Mismatched' (same as the element-wise ==)
tempL = [bbmi]
for bbmi in tempL:
    tempS = tempDF['BMI_class'].eq(tempDF[bbmi+'_class'])
    tempDF['vs_'+bbmi+'_class'] = np.where(tempS, 'Matched', 'Mismatched')

display(tempDF)
print('NaN in DF:', tempDF.isnull().to_numpy().sum(axis=None))
tempS = tempDF.groupby(by='BMI_class')['vs_'+bbmi+'_class'].value_counts()
display(tempS)

bmiDF_a_full = tempDF

### 3-2. Misclassification rate

In [None]:
#Misclassification rate (BMI class vs. bBMI class) per cohort, followed by a bar plot
tempD1 = {'Arivale (full)':bmiDF_a_full,
          'Arivale (restricted)':bmiDF_a_restricted,
          'TwinsUK':bmiDF}
tempL1 = ['MetBMI']
tempL2 = ['Underweight', 'Normal', 'Overweight', 'Obese']
tempD2 = {'Arivale (full): vs. MetBMI class':'Arivale (full)',
          'Arivale (restricted): vs. MetBMI class':'Arivale (restricted)',
          'TwinsUK: vs. MetBMI class':'TwinsUK'}
tempD3 = {'Arivale (full)':plt.get_cmap('tab20')(1),
          'Arivale (restricted)':'tab:blue',
          'TwinsUK':'tab:orange'}
legend_title = 'vs. MetBMI class'

#Calculate misclassification rate based on each biological BMI class
tempD = {}
for cohort in tempD1.keys():
    for bbmi in tempL1:
        tempDF = tempD1[cohort]
        
        #Just in case
        ##Exclude participants whose bBMI class is 'NotCalculated'
        ##(the class column is bbmi+'_class' in all the three tables; no 'Base' prefix here)
        tempDF1 = tempDF.loc[tempDF[bbmi+'_class']=='NotCalculated']
        if len(tempDF1)>0:
            print('Check NotCalculated in '+cohort+' '+bbmi+' class.')
            tempDF = tempDF.loc[tempDF[bbmi+'_class']!='NotCalculated']
        
        #Count misclassification
        ##BMI class of every participant whose BMI class differs from the bBMI class
        tempS1 = tempDF.loc[tempDF['BMI_class']!=tempDF[bbmi+'_class'], 'BMI_class']
        counter0 = len(tempS1)#Overall
        tempL = [int((tempS1==bmi_class).sum()) for bmi_class in tempL2]#Same order as tempL2
        for _ in range(counter0-sum(tempL)):#Just in case: BMI class outside tempL2
            print('Error?')
        
        #Clean the count result
        ##Explicit dtype: the default dtype of an empty Series depends on the pandas version
        tempS = pd.Series(name=cohort+': vs. '+bbmi+' class', dtype=float)
        tempS.loc['Overall count'] = counter0
        tempS.loc['Overall [%]'] = counter0/len(tempDF)*100
        for bmi_class, counter in zip(tempL2, tempL):
            tempS.loc[bmi_class+' count'] = counter
            total = len(tempDF.loc[tempDF['BMI_class']==bmi_class])
            ##NaN (instead of ZeroDivisionError) if the cohort has nobody in this BMI class
            tempS.loc[bmi_class+' [%]'] = counter / total * 100 if total>0 else np.nan
        tempD[cohort+': vs. '+bbmi+' class'] = tempS
tempDF = pd.concat(list(tempD.values()), axis=1)
display(tempDF)

#Plot (without underweight)
##Prepare DF
tempDF = tempDF.loc[['Overall [%]', 'Normal [%]', 'Overweight [%]', 'Obese [%]']]
##Strip the literal ' [%]' suffix (regex=False: same result in any pandas version)
tempDF.index = tempDF.index.str.replace(' [%]', '', regex=False)
tempDF.columns = tempDF.columns.map(tempD2)
tempDF = tempDF.reset_index().melt(var_name='Category', value_name='Misclassification', id_vars='index')
##Visualization
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(3, 3))
sns.barplot(data=tempDF, x='index', y='Misclassification',
            hue='Category', dodge=True, palette=tempD3, edgecolor='black')
sns.despine()
plt.yticks(np.arange(0, 61, 10))
##Dashed separator between the first category ('Overall') and the per-class categories
plt.axvline(x=(0+1)/2, **{'linestyle':'--', 'color':'k'})
##Add reference range
plt.axhspan(ymin=28, ymax=48, facecolor='crimson', alpha=0.2, zorder=0)
plt.axhline(y=28, **{'linestyle':'-', 'color':'crimson', 'zorder':0})
plt.axhline(y=48, **{'linestyle':'-', 'color':'crimson', 'zorder':0})
plt.ylabel('Misclassification [%]')
#plt.xlabel('BMI class')
plt.xlabel('')
plt.xticks(rotation=70,
           horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
plt.legend(title=legend_title, bbox_to_anchor=(1, -0.6), loc='upper right',
           handletextpad=0.5, borderaxespad=0)
##Save
fileDir = './ExportFigures/'
ipynbName = '220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2_'
fileName = 'misclassification-rate.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

## 4. ∆BMI and clinical definition-based misclassification

### 4-1. Calculate ∆BMI and prepare covariates

In [None]:
#∆BMI = (bBMI - BMI) / BMI * 100 [% of BMI], added as a 'Delta'+bBMI column per cohort
tempD1 = {'Arivale':bmiDF_a_restricted,#Restricted version of MetBMI
          'TwinsUK':bmiDF}
tempL1 = ['MetBMI']

tempD2 = {}
for cohort, tempDF in tempD1.items():
    #Calculate the rate of difference (the column is written into the cohort table itself)
    for bbmi in tempL1:
        tempDF['Delta'+bbmi] = tempDF[bbmi].sub(tempDF['BMI']).div(tempDF['BMI']).mul(100)
    tempD2[cohort] = tempDF
    
    #Check skewness: descriptive statistics of the numeric columns plus a 'Skewness' row
    print(cohort)
    tempDF1 = tempDF.select_dtypes(include=[np.number])
    tempDF2 = tempDF1.describe()
    tempDF2.loc['Skewness'] = stats.skew(tempDF1)
    display(tempDF2)
#Update
bmiDF_a_restricted, bmiDF = tempD2['Arivale'], tempD2['TwinsUK']

> –> ∆BMI can safely be assumed to follow a normal distribution.  

In [None]:
#Covariate columns to copy from the full-version table into the restricted-version table
tempL = ['Sex', 'Age', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
tempDF1 = bmiDF_a_restricted#Restricted version of MetBMI
tempDF2 = bmiDF_a_full

#Add covariates
##Left join on the index: every row of the restricted-version table is kept
tempDF = tempDF1.merge(tempDF2.loc[:, tempL], left_index=True, right_index=True, how='left')
display(tempDF.describe(include='all'))
print('NaN in DF:', tempDF.isna().to_numpy().sum())

#Update
bmiDF_a_restricted = tempDF

### 4-2. Metabolic health condition

In [None]:
#File-name prefix and ID column of the metabolic health summary table, per cohort
tempD1 = {'Arivale':'220720_Multiomics-BMI-NatMedRevision_Misclassification_',
          'TwinsUK':'220919_Multiomics-BMI-NatMed1stRevision_TwinsUK-Preprocessing-ver2_'}
tempD2 = {'Arivale':'public_client_id',
          'TwinsUK':'KeyIndex'}

#Same directory and file-name suffix for both cohorts
fileDir = './ExportData/'
fileName = 'metabolic-health-summary.tsv'
tempD3 = {}
for cohort, ipynbName in tempD1.items():
    #Import cleaned table for metabolic health condition
    ##The ID column is read as string and used as the index
    id_col = tempD2[cohort]
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={id_col: str}).set_index(id_col)
    tempD3[cohort] = tempDF
    
    #Check the table, its summary, and the number of missing values
    print(cohort)
    display(tempDF)
    display(tempDF.describe(include='all'))
    print('NaN in DF:', tempDF.isnull().to_numpy().sum(axis=None))
    print('')
metabDF_a = tempD3['Arivale']
metabDF_t = tempD3['TwinsUK']

> –> 6 participants in Arivale and 1,203 participants in TwinsUK had NaN for the metabolic health condition.  

In [None]:
tempD1 = {'Arivale':bmiDF_a_restricted,#Restricted version of MetBMI
          'TwinsUK':bmiDF}
tempD2 = {'Arivale':metabDF_a,
          'TwinsUK':metabDF_t}

#Check BMI between metabolically healthy and unhealthy groups
for cohort, tempDF1 in tempD1.items():
    print(cohort)
    tempDF2 = tempD2[cohort]
    #Inner join on the index: only participants present in both tables are kept
    tempDF = tempDF1.merge(tempDF2['Metabolically'],
                           left_index=True, right_index=True, how='inner')
    #BMI summary per metabolic health group, within each target BMI class
    for bmi_class in ['Normal', 'Obese']:
        tempDF3 = tempDF.loc[tempDF['BMI_class']==bmi_class]
        print(bmi_class+': n =', len(tempDF3))
        display(tempDF3.groupby('Metabolically')['BMI'].describe())
    print('')

> –> It would be safer to adjust for the baseline BMI in the statistical tests.  

### 4-3. Perform OLS linear regression

> Because ∆BMI values are assumed to be normally distributed, OLS linear regression (i.e., GLM with Gaussian family) can simply be used.  

> Model: ∆BMI ~ b0 + b1\*C(MetabolicCondition) + b2\*BMI + b3\*C(Sex) + b4\*Age + b5\*AncestryPCs  
> Main aim: Assess the difference in each ∆BMI between the metabolically healthy and unhealthy groups.  
>
> In this TwinsUK analysis, ancestry PCs are NOT included as the covariates due to data availability. Also, because only the MetBMI is assessed, the reference P-value adjustment (across bBMI classes within BMI class and within cohort) is skipped.  

In [None]:
#OLS regression of ∆BMI on the metabolic health condition, fitted separately for each
#cohort and each BMI class; the per-test summaries are combined into one table (resDF),
#which is also exported as a TSV file.
#NOTE(review): StandardScaler, smf and multi are not defined in this cell -- presumably
#scikit-learn's StandardScaler, statsmodels.formula.api and statsmodels.stats.multitest
#imported in an earlier cell; confirm.
tempD1 = {'Arivale':bmiDF_a_restricted,#Restricted version of MetBMI
          'TwinsUK':bmiDF}
tempD2 = {'Arivale':metabDF_a,
          'TwinsUK':metabDF_t}
#Covariate part of the model formula per cohort (no ancestry PCs for TwinsUK)
tempD3 = {'Arivale':'+ BMI + C(Sex) + Age + PC1 + PC2 + PC3 + PC4 + PC5',
          'TwinsUK':'+ BMI + C(Sex) + Age'}
tempL1 = ['Normal', 'Obese']#Target BMI classes
tempL2 = ['MetBMI']#Target bBMIs

tempD4 = {}#Cohort -> result table
for cohort in tempD1.keys():
    tempDF1 = tempD1[cohort]
    tempDF2 = tempD2[cohort]
    
    #The elapsed time measured from here includes the data preparation for each BMI class
    t_start = time.time()
    tempD5 = {}#BMI class -> result table
    for bmi_class in tempL1:
        #Processing for OLS linear regression
        ##Gather all necessary variables into a single DF
        tempS = tempDF2['Metabolically']
        tempDF = pd.merge(tempDF1, tempS, left_index=True, right_index=True, how='left')
        ##Select the target participants
        tempDF = tempDF.loc[tempDF['BMI_class']==bmi_class]
        ##Drop NaN in the metabolic health condition
        ##NOTE(review): dropna() without subset drops a row having NaN in ANY column,
        ##not only in 'Metabolically' -- confirm that the other columns have no NaN
        tempDF = tempDF.dropna()
        ##Z-score transformation
        ###Applied to all numeric columns of the participants selected above, i.e.,
        ###the scaler is fitted within this cohort and this BMI class
        tempDF3 = tempDF.select_dtypes(include=[np.number])
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        tempA = scaler.fit_transform(tempDF3)#Column direction
        tempDF3 = pd.DataFrame(data=tempA, index=tempDF3.index, columns=tempDF3.columns)
        ###Recover the categorical variables
        tempDF4 = tempDF.select_dtypes(exclude=[np.number])
        tempDF = pd.merge(tempDF3, tempDF4, left_index=True, right_index=True, how='left')
        ##Add a constant for the intercept
        ###–> In statsmodels, a constant is automatically added as well as R!
        ##Sort to make bcoef = 0 and 1 for Healthy and Unhealthy
        ###NOTE(review): the formula interface presumably orders the category levels by
        ###itself, so this sort may not be what fixes the reference level -- confirm
        tempDF = tempDF.sort_values(by='Metabolically', ascending=True)
        ##One-hot encoding for categorical covariates
        ###–> In statsmodels, categorical variables are automatically recognized!
        
        tempD6 = {}#bBMI -> single-row result table
        for bbmi in tempL2:
            #OLS linear regression
            ##Fit univariate model
            formula = 'Delta'+bbmi+' ~ C(Metabolically)'
            fit_res1 = smf.ols(formula, data=tempDF).fit()
            ##Fit full model
            formula = 'Delta'+bbmi+' ~ C(Metabolically)'+tempD3[cohort]
            fit_res2 = smf.ols(formula, data=tempDF).fit()
            
            #Summarize the result
            ##tempDF3 is reused from here as the single-row summary of this test
            if cohort=='Arivale':
                tempDF3 = pd.DataFrame({'DeltaBMI':[bbmi+' (restricted)']})
            else:
                tempDF3 = pd.DataFrame({'DeltaBMI':[bbmi]})
            ##Save the sample size for each group
            tempDF3['N'] = len(tempDF)
            tempDF3['nHealthy'] = len(tempDF.loc[tempDF['Metabolically']=='Healthy'])
            tempDF3['nUnhealthy'] = len(tempDF.loc[tempDF['Metabolically']=='Unhealthy'])
            ##Save R2 [%]
            ###UnivarR2: univariate model; R2: full model
            tempDF3['UnivarR2'] = fit_res1.rsquared*100
            tempDF3['R2'] = fit_res2.rsquared*100
            ##Save beta-coefficient of the target variable
            ###All the statistics below are taken from the full model
            tempDF3['Bcoef'] = fit_res2.params['C(Metabolically)[T.Unhealthy]']
            tempDF3['BcoefSE'] = fit_res2.bse['C(Metabolically)[T.Unhealthy]']
            ##Save t-statistic of the target variable
            tempDF3['tStat'] = fit_res2.tvalues['C(Metabolically)[T.Unhealthy]']
            ##Save residual degrees of freedom
            tempDF3['DoF'] = int(fit_res2.df_resid)
            ##Save P-value of the target variable
            tempDF3['Pval'] = fit_res2.pvalues['C(Metabolically)[T.Unhealthy]']
            tempD6[bbmi] = tempDF3
        
        #Clean the results (pd.DataFrame) across bBMIs
        tempDF = pd.concat(list(tempD6.values()), axis=0)
        tempDF['BMIclass'] = bmi_class
        tempD5[bmi_class] = tempDF
    t_elapsed = time.time() - t_start
    print(cohort)
    print('Elapsed time for',
          len(tempL1)*len(tempL2), 'OLS linear regressions (',
          len(tempL1), 'BMI classes x',
          len(tempL2), 'bBMIs):',
          round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')
    
    #Clean the results (pd.DataFrame) across BMI classes
    tempDF = pd.concat(list(tempD5.values()), axis=0)
    tempDF['Cohort'] = cohort
    ##P-value adjustment (across BMI classes within cohort) by using Benjamini–Hochberg method
    ###The second element of the returned tuple is taken as the adjusted P-values
    tempDF['AdjPval_within'] = multi.multipletests(tempDF['Pval'], alpha=0.05, method='fdr_bh',
                                                   is_sorted=False, returnsorted=False)[1]
    tempD4[cohort] = tempDF
#Clean the results (pd.DataFrame) across cohorts
tempDF = pd.concat(list(tempD4.values()), axis=0)
##Clean the column order by setting index
tempDF = tempDF.set_index(['Cohort', 'BMIclass', 'DeltaBMI'])
#P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF['AdjPval_all'] = multi.multipletests(tempDF['Pval'], alpha=0.05, method='fdr_bh',
                                            is_sorted=False, returnsorted=False)[1]

display(tempDF)

#Save
fileDir = './ExportData/'
ipynbName = '220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2_'
fileName = 'regression-summary_DeltaBMI.tsv'
tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Keep the result table for the next cells
resDF = tempDF

### 4-4. Visualization

In [None]:
#Notched boxplots of ∆BMI by metabolic health condition within each target BMI class,
#one facet per cohort, annotated with the adjusted P-values of the regression summary
#NOTE(review): Decimal and ROUND_HALF_UP are not defined in this cell -- presumably
#imported from the decimal module in an earlier cell; confirm.
bbmi = 'MetBMI'
tempD1 = {'Arivale':'tab:blue', 'TwinsUK':'tab:orange'}#Cohort -> facet title color
tempD2 = {'Healthy':'0.8', 'Unhealthy':'crimson'}#Metabolic condition -> box color
tempL1 = ['Normal', 'Obese']#Target BMI classes (x-axis categories)
tempD3 = {'Arivale':bmiDF_a_restricted,#Restricted version of MetBMI
          'TwinsUK':bmiDF}
tempD4 = {'Arivale':metabDF_a,
          'TwinsUK':metabDF_t}
tempDF3 = resDF

#Visualization
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD1),
                         figsize=(3.5, 3), sharex=True, sharey=True,
                         gridspec_kw={'width_ratios':[1]*len(tempD1)})#Equal width per cohort
axis_ymin = -35
axis_ymax = 55
ymin = -30
ymax = 45
yinter = 15
margin = 0.49
#Set shared axis range
plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
##The x-axis categories are the BMI classes (tempL1), not the metabolic conditions (tempD2)
plt.setp(axes, xlim=(0-margin, len(tempL1)-1+margin))#To eliminate excess white space
for ax_i, ax in enumerate(axes.flat):
    cohort = list(tempD1.keys())[ax_i]
    tempDF1 = tempD3[cohort]
    tempDF2 = tempD4[cohort]
    #Prepare DF
    tempS = tempDF2['Metabolically']
    tempDF = pd.merge(tempDF1, tempS, left_index=True, right_index=True, how='left')
    ##Select the target participants
    tempDF = tempDF.loc[tempDF['BMI_class'].isin(tempL1)]
    ##Drop NaN in the metabolic health condition
    ##NOTE(review): dropna() without subset drops a row having NaN in ANY column -- confirm
    tempDF = tempDF.dropna()
    #Check sample size
    print('N (total):', len(tempDF))
    print(' - BMI class:', tempDF['BMI_class'].value_counts().sort_index(ascending=True).to_dict())
    for bmi_class in tempL1:
        ##tempDF1 is reused here; it is reset from tempD3 at the top of each iteration
        tempDF1 = tempDF.loc[tempDF['BMI_class']==bmi_class]
        print('   - '+bmi_class+' BMI class - Metabolic condition:',
              tempDF1['Metabolically'].value_counts().sort_index(ascending=True).to_dict())
    #Plot
    sns.boxplot(data=tempDF, y='Delta'+bbmi, x='BMI_class', order=tempL1,
                hue='Metabolically', hue_order=tempD2.keys(), dodge=True, palette=tempD2,
                showfliers=False,#flierprops={'marker':'o', 'markerfacecolor':'gray', 'alpha':0.4},
                showcaps=True, notch=True, ax=ax)
    #Axis setting
    if ax_i==0:
        plt.setp(ax, xlabel='', ylabel=r'$\Delta$'+bbmi+' [% BMI]')
    else:
        plt.setp(ax.get_yticklabels(), visible=False)
        plt.setp(ax, xlabel='', ylabel='')
    sns.despine()
    plt.setp(ax.get_xticklabels(), rotation=70,
             horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
    #P-value annotation
    ##NOTE(review): the indexing below assumes that the lines are ordered box by box
    ##(BMI class, then metabolic condition) with the layout given in the next comment;
    ##this depends on the seaborn/matplotlib versions -- confirm after any upgrade
    lines = ax.get_lines()#Line2D: [[Q1, Q1-1.5IQR], [Q3, Q3+1.5IQR], [Q1, Q1], [Q3, Q3], [Med, Med], [flier]]
    lines_unit = 5 + int(False)#showfliers=False
    for class_i in range(len(tempL1)):
        ##Upper-whisker ends are read through the public Line2D getters
        ##(orig=False returns the processed numeric data)
        #Healthy
        whisker_0 = lines[class_i*lines_unit*len(tempD2) + lines_unit*0 + 1]
        xcoord_0 = whisker_0.get_xdata(orig=False)[1]#Q3+1.5IQR
        ycoord_0 = whisker_0.get_ydata(orig=False)[1]#Q3+1.5IQR
        #Unhealthy
        whisker_1 = lines[class_i*lines_unit*len(tempD2) + lines_unit*1 + 1]
        xcoord_1 = whisker_1.get_xdata(orig=False)[1]#Q3+1.5IQR
        ycoord_1 = whisker_1.get_ydata(orig=False)[1]#Q3+1.5IQR
        #Standard point for annotation
        xcoord = (xcoord_0+xcoord_1)/2
        ycoord = max(ycoord_0, ycoord_1)
        #Add annotation lines
        aline_offset = yinter/5
        aline_length = yinter/5 + aline_offset/2
        ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                lw=1.5, c='k')
        #Retrieve P-value
        bmi_class = tempL1[class_i]
        pval = tempDF3.loc[(cohort, bmi_class), 'AdjPval_all'].iloc[0]#3-level MultiIndex
        ##Asterisks below 0.05; otherwise the value rounded half up to two decimals
        if pval<0.001:
            label = '***'
        elif pval<0.01:
            label = '**'
        elif pval<0.05:
            label = '*'
        else:
            pval_text = str(Decimal(pval).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
            label = r'$P$'+' = '+pval_text
        #Add annotation text
        if label in ['***', '**', '*']:
            text_offset = yinter/12
            text_size = 'medium'
        else:
            text_offset = yinter/3
            text_size = 'x-small'
        ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                    horizontalalignment='center', verticalalignment='bottom',
                    fontsize=text_size, color='k')
    #Facet settings
    ##Colored band behind the facet title, drawn above the axes in axes coordinates
    ax.set_title(cohort, {'fontsize':'medium'})
    xoff = 0.025
    yoff = 0.01
    rect = plt.Rectangle((xoff, 1+yoff), 1-xoff, 0.15,#Manual adjustment
                         transform=ax.transAxes, facecolor=tempD1[cohort], alpha=0.3,
                         clip_on=False, linewidth=0, zorder=0.5)
    ax.add_patch(rect)
    #Change the default boxplot settings
    ##'lines' was retrieved before the annotation lines were added, so only the boxplot lines are recolored
    for line in lines:
        line.set_color('k')
    ##NOTE(review): assumes that the boxes are stored in ax.artists; newer matplotlib
    ##versions may store them in ax.patches instead -- confirm after any upgrade
    for box in ax.artists:
        box.set_edgecolor('k')
    #Legend
    ##Kept only on the last facet
    if ax_i==len(tempD1)-1:
        ax.legend(title='Metabolic condition', title_fontsize='medium', fontsize='medium',
                  bbox_to_anchor=(1, -0.45), loc='upper right', borderaxespad=0, ncol=2,
                  handlelength=1.5, handletextpad=0.5, columnspacing=1.0)
    else:
        ax.get_legend().remove()
##Save
fileDir = './ExportFigures/'
ipynbName = '220920_Multiomics-BMI-NatMed1stRevision_TwinsUK-Misclassification-DeltaBMI-ver2_'
fileName = 'DeltaBMI-all.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

# — End of this notebook —