# Multiomics BMI Paper — BMI Predictions for the TwinsUK Cohort Using the Arivale-fitted LASSO Models

***by Kengo Watanabe***  

This Jupyter Notebook (with Python 3 kernel) generated the LASSO linear regression models for predicting BMI (biological BMI) from the restricted Arivale metabolomics panel, and calculated BMI predictions for the TwinsUK cohort.  

Input files:  
* Arivale baseline BMI and blood omics (preprocessed): 210104_Biological-BMI-paper_RF-imputation_baseline-\[metDF/chemDF/combiDF\]-with-RF-imputation.tsv  
* TwinsUK BMI and clinical labs (preprocessed): 220916_Multiomics-BMI-NatMed1stRevision_TwinsUK-DataCleaning-ver3_\[general-data/standard-measures\]_final.tsv  
* TwinsUK metabolomics (preprocessed): 220917_Multiomics-BMI-NatMed1stRevision_TwinsUK-RFimputation-wenceslaus-ver3_serum-metabolomics_final-with-RFimputation.tsv  
* TwinsUK metabolomics metadata: 220731_Multiomics-BMI-NatMed1stRevision_TwinsUK-DataFormatting_cleaned-metabolomics-metadata.tsv  
* Arivale-fitted MetBMI models (full panel version): 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_MetBMI-\[Female/Male/BothSex\]-LASSObcoefs.tsv  
* Arivale-fitted StandBMI models: 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_StandBMI-\[Female/Male/BothSex\]-OLSbcoefs.tsv  
* Arivale baseline MetBMI predictions (full panel version): 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_MetBMI-\[Female/Male/BothSex\].tsv  
* Arivale baseline StandBMI predictions: 220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_StandBMI-\[Female/Male/BothSex\].tsv  

Output figures and tables:  
* Figure 1d  
* Supplementary Figure 3  
* Tables for Supplementary Data 3, 10  
* Intermediate tables for other notebooks (BMI predictions)  

Original notebook (memo for my future tracing):  
* dalek:\[JupyterLab HOME\]/220621_Multiomics-BMI-NatMedRevision/220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3.ipynb  

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#For Arial font
#!conda install -c conda-forge -y mscorefonts
##-> The below was also needed in matplotlib 3.4.2
#import shutil
#import matplotlib
#shutil.rmtree(matplotlib.get_cachedir())
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import time

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from decimal import Decimal, ROUND_HALF_UP
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score
import sys
from statsmodels.stats import weightstats
from statsmodels.stats import multitest as multi
from venn import venn
import matplotlib.lines as mlines

!conda list

# packages in environment at /opt/conda/envs/arivale-py3:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
analytics                 0.1                      pypi_0    pypi
argon2-cffi               21.1.0           py39h3811e60_0    conda-forge
arivale-data-interface    0.1.0                    pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
atk-1.0                   2.36.0               h3371d22_4    conda-forge
attrs                     21.2.0             pyhd8ed1ab_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports                 1.0                        py_2    conda-forge
backports.functools_lru_cache 1.6.4              pyhd8ed1ab_0    conda-forge
biopython                 1.79             py39h3811e60_0    conda-forge
bleach 

## 1. Data preparation for the Arivale cohort

> The following code is practically the same as the one used in the LASSO modeling for the Arivale baseline datasets, with only minor exceptions (e.g., "Base" in column names is removed in this notebook). Hence, the correspondence between participant and testing (hold-out) set for each LASSO model is maintained.  

### 1-1. Import the cleaned dataframes

In [None]:
#Load the preprocessed Arivale baseline table; participant IDs are read as strings
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_RF-imputation_'
fileName = 'baseline-combiDF-with-RF-imputation.tsv'
##Columns to keep: BMI and general covariates (Race is not used in this study)
tempL = ['log_BaseBMI', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
tempDF = (
    pd.read_csv(f'{fileDir}{ipynbName}{fileName}', sep='\t', dtype={'public_client_id': str})
    .set_index('public_client_id')
    .loc[:, tempL]
)

#Strip the "Base" tag from the column labels (e.g., log_BaseBMI -> log_BMI)
tempDF.columns = [col_n.replace('Base', '') for col_n in tempDF.columns]

display(tempDF)

#Keep under a persistent name for the downstream cells
bmiDF_a = tempDF

In [None]:
#Load the baseline metabolomics (metDF) and clinical-lab (chemDF) tables,
#keeping only the analyte columns
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_RF-imputation_'
##BMI and covariate columns to discard from each table
tempL = ['log_BaseBMI', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Race']
tempD = {}
for df_n in ['metDF', 'chemDF']:
    fileName = 'baseline-'+df_n+'-with-RF-imputation.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t',
                         dtype={'public_client_id': str}).set_index('public_client_id')
    print(df_n+' original shape:', tempDF.shape)
    tempDF = tempDF.drop(tempL, axis=1)
    display(tempDF)
    tempD[df_n] = tempDF

metDF_a, chemDF_a = tempD['metDF'], tempD['chemDF']

### 1-2. Stratification with sex

In [None]:
#Sex-stratified subsets of the Arivale cohort; the both-sex frame is the same object, not a copy
bmiDF_a_F = bmiDF_a[bmiDF_a['Sex'].eq('F')]
bmiDF_a_M = bmiDF_a[bmiDF_a['Sex'].eq('M')]
bmiDF_a_B = bmiDF_a
print('Female, Male, Both sex = ',
      bmiDF_a_F.shape[0], ', ', bmiDF_a_M.shape[0], ', ', bmiDF_a_B.shape[0])

### 1-3. Split the cohort into 10 sets

In [None]:
#Assign every participant to exactly one of `nmodels` testing (hold-out) sets per sex stratum.
#Each stratum is cut into contiguous chunks in its existing row order (no shuffling), so the
#participant-to-hold-out assignment is deterministic.
tempD1 = {'Female':bmiDF_a_F, 'Male':bmiDF_a_M, 'BothSex':bmiDF_a_B}
nmodels = 10
tempD2 = {}
for sex in tempD1.keys():
    tempDF = tempD1[sex]
    #Split cohort to define the training and testing (hold-out) sets
    ##np.array_split is applied to row positions rather than to the dataframe itself, because
    ##splitting a DataFrame directly goes through the deprecated DataFrame.swapaxes.
    ##The chunk boundaries are the same as those of np.array_split(tempDF, nmodels).
    tempL = [tempDF.iloc[positions]
             for positions in np.array_split(np.arange(len(tempDF)), nmodels)]#List of DFs
    tempD = {}
    for model_k in range(nmodels):
        tempDF1 = tempL[model_k]
        model_n = 'Model_'+str(model_k+1).zfill(2)#Model_01, ..., Model_10
        tempS = pd.Series(np.repeat(model_n, len(tempDF1)),
                          index=tempDF1.index, name='Testing')
        tempD[model_k] = tempS
    tempS = pd.concat(list(tempD.values()), axis=0)
    #Add the info to bmiDF
    tempDF = pd.merge(tempDF, tempS, left_index=True, right_index=True, how='left')
    tempD2[sex] = tempDF
    print(sex)
    display(tempDF)
    display(tempDF['Testing'].value_counts())
    print('')
#Update
bmiDF_a_F = tempD2['Female']
bmiDF_a_M = tempD2['Male']
bmiDF_a_B = tempD2['BothSex']

### 1-4. Prepare dataframe for standard clinical measures

In [None]:
#Assemble the Arivale predictors for the standard-measures model:
#sex and age plus six clinical labs, renamed to the labels used for the TwinsUK tables
tempL = ['Sex', 'Age']
tempDF1 = bmiDF_a[tempL]
##Arivale lab name -> common label
tempD = {'HDL CHOL DIRECT':'HDL-cholesterol',
         'LDL-CHOL CALCULATION':'LDL-cholesterol',
         'TRIGLYCERIDES':'Triglycerides',
         'GLUCOSE':'Glucose',
         'INSULIN':'Insulin',
         'HOMA-IR':'HOMA-IR'}
tempDF2 = chemDF_a[list(tempD.keys())].rename(columns=tempD)
##Keep only the participants present in both tables
tempDF = tempDF1.merge(tempDF2, left_index=True, right_index=True, how='inner')

display(tempDF.describe(include='all'))

standDF_a = tempDF

## 2. Data preparation for the TwinsUK cohort

> In this version, all the participants in the TwinsUK cohort belong to the testing (external) set for all the LASSO models. Hence, only the sex stratification is needed.  

### 2-1. Import the cleaned dataframes

In [None]:
#Read the three preprocessed TwinsUK tables, each indexed by KeyIndex
tempD = {}
for dataset in ['general-data', 'serum-metabolomics', 'standard-measures']:
    #The metabolomics table is read from the RF-imputation output in ./ImportData/;
    #the other two are read from the data-cleaning output in ./ExportData/
    if dataset=='serum-metabolomics':
        fileDir = './ImportData/'
        ipynbName = '220917_Multiomics-BMI-NatMed1stRevision_TwinsUK-RFimputation-wenceslaus-ver3_'
        fileName = dataset+'_final-with-RFimputation.tsv'
    else:
        fileDir = './ExportData/'
        ipynbName = '220916_Multiomics-BMI-NatMed1stRevision_TwinsUK-DataCleaning-ver3_'
        fileName = dataset+'_final.tsv'
    #Only the general-data table has its VisitDate column parsed as dates
    tempL = ['VisitDate'] if dataset=='general-data' else []
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t',
                         parse_dates=tempL).set_index('KeyIndex')

    #Report the table and its number of distinct keys and participants
    print(dataset)
    display(tempDF)
    print(' - Unique KeyIndex:', len(tempDF.index.unique()))
    ##The part of KeyIndex before the first "_" is counted as the participant
    tempDF1 = tempDF.index.to_series().str.split(pat='_', expand=True)
    print(' - Unique participant:', len(tempDF1.iloc[:, 0].unique()))
    print('')

    tempD[dataset] = tempDF

bmiDF_t, metDF_t, chemDF_t = (tempD[dataset] for dataset in
                              ['general-data', 'serum-metabolomics', 'standard-measures'])

### 2-2. Stratification with sex

In [None]:
#Sex-stratified subsets of the TwinsUK cohort; the both-sex frame is the same object, not a copy
bmiDF_t_F = bmiDF_t[bmiDF_t['Sex'].eq('F')]
bmiDF_t_M = bmiDF_t[bmiDF_t['Sex'].eq('M')]
bmiDF_t_B = bmiDF_t
print('Female, Male, Both sex = ',
      bmiDF_t_F.shape[0], ', ', bmiDF_t_M.shape[0], ', ', bmiDF_t_B.shape[0])

### 2-3. Prepare dataframe for standard clinical measures

In [None]:
#Assemble the TwinsUK predictors for the standard-measures model: sex and age plus six clinical labs
tempDF1 = bmiDF_t[['Sex', 'Age']]
tempL = ['HDL-cholesterol', 'LDL-cholesterol', 'Triglycerides', 'Glucose', 'Insulin', 'HOMA-IR']
tempDF2 = chemDF_t[tempL]
##Keep only the keys present in both tables
tempDF = tempDF1.merge(tempDF2, left_index=True, right_index=True, how='inner')

display(tempDF.describe(include='all'))

standDF_t = tempDF

## 3. Metabolomics

### 3-1. Common metabolites in the Arivale and TwinsUK datasets

> In this version, the common metabolites in the Arivale and TwinsUK metabolomics datasets are retrieved simply by judging whether their biochemical names or chemical IDs are matched or not. Note that both datasets were generated by Metabolon.  

In [None]:
#Load the TwinsUK metabolomics metadata, indexed by MetabolonID
fileDir = './ExportData/'
ipynbName = '220731_Multiomics-BMI-NatMed1stRevision_TwinsUK-DataFormatting_'
fileName = 'cleaned-metabolomics-metadata.tsv'
tempDF = pd.read_csv(f'{fileDir}{ipynbName}{fileName}', sep='\t').set_index('MetabolonID')
##Number of distinct IDs and of distinct names per cohort (a missing name counts as one value)
print('Unique ID in TwinsUK:', len(tempDF.index.unique()))
for cohort in ['TwinsUK', 'Arivale']:
    print('Unique name in '+cohort+':', len(tempDF['MetaboliteName('+cohort+')'].unique()))
display(tempDF)

metDF_meta = tempDF

In [None]:
#Relabel the TwinsUK metabolomics columns with the Arivale metabolite name where one is available
tempDF1 = metDF_t.copy()#Work on a copy so that the loaded table is not modified in place
tempDF2 = metDF_meta

##ID -> Arivale name lookup; the ID itself is kept when the looked-up value is not a string (NaN)
tempS = tempDF2['MetaboliteName(Arivale)']
tempL = []
for col_n in tempDF1.columns.tolist():
    metname_a = tempS.loc[col_n]
    tempL.append(metname_a if isinstance(metname_a, str) else col_n)
tempDF1.columns = tempL
display(tempDF1)

#Update
metDF_t = tempDF1

In [None]:
#Restrict both metabolomics tables to the metabolites measured in both cohorts
tempDF1 = metDF_a
tempDF2 = metDF_t

#Identify the common metabolites (matched on column label)
tempL1 = tempDF1.columns.tolist()
tempL2 = tempDF2.columns.tolist()
tempS = set(tempL1) & set(tempL2)
print('Arivale, TwinsUK, Common = ', len(tempL1), ', ', len(tempL2), ', ', len(tempS))

#Take the common metabolites
##A set is not a valid pandas indexer (deprecated in pandas 1.5, TypeError from 2.0), so it is
##converted to a list first. The columns are then sorted so that both tables share one
##deterministic column order.
tempL = list(tempS)
tempDF1 = tempDF1[tempL].sort_index(axis=1, ascending=True)
tempDF2 = tempDF2[tempL].sort_index(axis=1, ascending=True)

display(tempDF1)
display(tempDF2)

#Update
metDF_a = tempDF1
metDF_t = tempDF2

#### cf. Data structure: PCA

In [None]:
#PCA of the pooled Arivale + TwinsUK metabolomics data (common metabolites only),
#to inspect how the samples of the two cohorts distribute along the leading components.
tempD1 = {'Arivale':metDF_a, 'TwinsUK':metDF_t}
tempD2 = {'Arivale':'tab:blue', 'TwinsUK':'tab:orange'}#Cohort -> plot color

#Prepare DF
tempDF = pd.concat(list(tempD1.values()), axis=0)#Stack the cohorts row-wise
print('Combined original DF:')
display(tempDF)

#Z-score transformation
##Fitted on the pooled data, i.e., one mean/SD per metabolite across both cohorts
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
tempA = scaler.fit_transform(tempDF)#Column direction
tempDF = pd.DataFrame(data=tempA, index=tempDF.index, columns=tempDF.columns)
print('Z-score DF:')
display(tempDF.describe())

#Perform PCA
nPCs = 5
model = PCA(n_components=nPCs, svd_solver='randomized', iterated_power='auto', random_state=123)
model.fit(tempDF)

#Explained variance
tempS = pd.Series(data=model.explained_variance_ratio_*100,
                  index=['PC'+str(i+1) for i in range(nPCs)], name='ExplainedVariance')
print('Percentage of variance explained by each component:')
display(tempS)
##Scree plot
tempDF1 = tempS.reset_index()
tempDF1['PC'] = [i+1 for i in range(nPCs)]
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(4, 3))
p = sns.lineplot(data=tempDF1, x='PC', y='ExplainedVariance', color='k')
sns.despine()
p.set(xlim=(0.5, nPCs+0.5), xticks=np.arange(1, nPCs+0.1, 1))
plt.ylabel('Explained variance [%]')#Label typo ("varaince") fixed
plt.xlabel('Principal component number')
plt.show()
##Integrate the explained variance into PC label
tempL = []
for i in range(nPCs):
    #Decimal with ROUND_HALF_UP rounds the percentage to one decimal place for the label
    round_value = Decimal(str(tempS['PC'+str(i+1)])).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
    tempL.append('PC'+str(i+1)+' ('+str(round_value)+'%)')

#Projection
tempDF1 = pd.DataFrame(data=model.transform(tempDF), index=tempDF.index, columns=tempL)
print('Projection DF:')
display(tempDF1)

#PC component
tempDF2 = pd.DataFrame(data=model.components_, index=tempL, columns=tempDF.columns)
print('PC component DF:')
display(tempDF2)

#Visualize sample distribution in the projected spaces
##Prepare sample metadata
tempL = []
for cohort in tempD1.keys():
    tempDF = tempD1[cohort]#NOTE: reuses the name tempDF; the Z-score DF is no longer needed here
    tempS = pd.Series(np.repeat(cohort, len(tempDF)), index=tempDF.index, name='Cohort')
    tempL.append(tempS)
tempS = pd.concat(tempL, axis=0)
##Add the sample metadata to projection DF
tempDF = pd.merge(tempDF1, tempS, left_index=True, right_index=True, how='left')
tempDF = tempDF.reset_index()
##Visualization
sns.set(style='ticks', font='Arial', context='talk')
p = sns.PairGrid(data=tempDF, hue='Cohort', hue_order=list(tempD2.keys()), palette=tempD2)
p.map_lower(sns.scatterplot, edgecolor='0.3', alpha=0.5, s=15)
##Hide the diagonal and upper-triangle panels; only the lower triangle is drawn
for i, j in zip(*np.triu_indices_from(p.axes, 0)):
    p.axes[i, j].set_visible(False)
p.add_legend(bbox_to_anchor=(0.6, 0.6), loc='center right', frameon=True)
plt.show()

### 3-2. Standardization per dataset

> In this version, standardization, which affects the beta-coefficient units of final models, is performed per dataset. Note that, in addition to cohort difference, there were differences in samples themselves (e.g., plasma in Arivale but serum in TwinsUK).  

In [None]:
#Z-score every metabolite separately within each cohort x sex stratum
tempD1 = {'Arivale':metDF_a, 'TwinsUK':metDF_t}
tempD2 = {'Female':bmiDF_a_F, 'Male':bmiDF_a_M, 'BothSex':bmiDF_a_B}
tempD3 = {'Female':bmiDF_t_F, 'Male':bmiDF_t_M, 'BothSex':bmiDF_t_B}

#Standardization per dataset
tempD4 = {}
for cohort, tempDF in tempD1.items():
    ##Sex-stratified BMI frames of this cohort; their indices define the strata
    strataD = tempD2 if cohort=='Arivale' else tempD3
    for sex in tempD2.keys():
        stratum_n = cohort+' - '+sex
        #Prepare sex-stratified dataset
        tempDF1 = tempDF.loc[strataD[sex].index.tolist()]

        #Z-score transformation (fitted on this stratum only)
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        tempA = scaler.fit_transform(tempDF1)#Column direction
        tempDF1 = pd.DataFrame(data=tempA, index=tempDF1.index, columns=tempDF1.columns)

        tempD4[stratum_n] = tempDF1

        #Confirmation: summary statistics and the distributions of the first three metabolites
        print(stratum_n+':', tempDF1.shape)
        display(tempDF1.describe())
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(4, 3))
        for col_i in range(3):
            sns.distplot(tempDF1.iloc[:, col_i], label=tempDF1.columns[col_i])
        sns.despine()
        plt.xlabel(r'$Z$'+'-score')
        plt.ylabel('Density')
        plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
        plt.show()
        print('')
    print('')

#Save
metDF_a_F, metDF_a_M, metDF_a_B = (tempD4['Arivale - '+sex] for sex in ['Female', 'Male', 'BothSex'])
metDF_t_F, metDF_t_M, metDF_t_B = (tempD4['TwinsUK - '+sex] for sex in ['Female', 'Male', 'BothSex'])

#### cf. Data structure: PCA

In [None]:
#PCA of the pooled Arivale + TwinsUK metabolomics data after the per-dataset standardization
#(both-sex versions), to inspect how the samples of the two cohorts distribute along the
#leading components.
tempD1 = {'Arivale':metDF_a_B, 'TwinsUK':metDF_t_B}
tempD2 = {'Arivale':'tab:blue', 'TwinsUK':'tab:orange'}#Cohort -> plot color

#Prepare DF
tempDF = pd.concat(list(tempD1.values()), axis=0)#Stack the cohorts row-wise
print('Combined original DF:')
display(tempDF)

#Z-score transformation
##Re-fitted on the pooled data, i.e., one mean/SD per metabolite across both cohorts
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
tempA = scaler.fit_transform(tempDF)#Column direction
tempDF = pd.DataFrame(data=tempA, index=tempDF.index, columns=tempDF.columns)
print('Z-score DF:')
display(tempDF.describe())

#Perform PCA
nPCs = 5
model = PCA(n_components=nPCs, svd_solver='randomized', iterated_power='auto', random_state=123)
model.fit(tempDF)

#Explained variance
tempS = pd.Series(data=model.explained_variance_ratio_*100,
                  index=['PC'+str(i+1) for i in range(nPCs)], name='ExplainedVariance')
print('Percentage of variance explained by each component:')
display(tempS)
##Scree plot
tempDF1 = tempS.reset_index()
tempDF1['PC'] = [i+1 for i in range(nPCs)]
sns.set(style='ticks', font='Arial', context='talk')
plt.figure(figsize=(4, 3))
p = sns.lineplot(data=tempDF1, x='PC', y='ExplainedVariance', color='k')
sns.despine()
p.set(xlim=(0.5, nPCs+0.5), xticks=np.arange(1, nPCs+0.1, 1))
plt.ylabel('Explained variance [%]')#Label typo ("varaince") fixed
plt.xlabel('Principal component number')
plt.show()
##Integrate the explained variance into PC label
tempL = []
for i in range(nPCs):
    #Decimal with ROUND_HALF_UP rounds the percentage to one decimal place for the label
    round_value = Decimal(str(tempS['PC'+str(i+1)])).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
    tempL.append('PC'+str(i+1)+' ('+str(round_value)+'%)')

#Projection
tempDF1 = pd.DataFrame(data=model.transform(tempDF), index=tempDF.index, columns=tempL)
print('Projection DF:')
display(tempDF1)

#PC component
tempDF2 = pd.DataFrame(data=model.components_, index=tempL, columns=tempDF.columns)
print('PC component DF:')
display(tempDF2)

#Visualize sample distribution in the projected spaces
##Prepare sample metadata
tempL = []
for cohort in tempD1.keys():
    tempDF = tempD1[cohort]#NOTE: reuses the name tempDF; the Z-score DF is no longer needed here
    tempS = pd.Series(np.repeat(cohort, len(tempDF)), index=tempDF.index, name='Cohort')
    tempL.append(tempS)
tempS = pd.concat(tempL, axis=0)
##Add the sample metadata to projection DF
tempDF = pd.merge(tempDF1, tempS, left_index=True, right_index=True, how='left')
tempDF = tempDF.reset_index()
##Visualization
sns.set(style='ticks', font='Arial', context='talk')
p = sns.PairGrid(data=tempDF, hue='Cohort', hue_order=list(tempD2.keys()), palette=tempD2)
p.map_lower(sns.scatterplot, edgecolor='0.3', alpha=0.5, s=15)
##Hide the diagonal and upper-triangle panels; only the lower triangle is drawn
for i, j in zip(*np.triu_indices_from(p.axes, 0)):
    p.axes[i, j].set_visible(False)
p.add_legend(bbox_to_anchor=(0.6, 0.6), loc='center right', frameon=True)
plt.show()

### 3-3. LASSO modeling with the Arivale dataset

> MetBMI models are re-generated from the Arivale (baseline) metabolomics dataset using the previous strategy (10-fold cross-validation) but restricting the input metabolites to the common metabolites.

#### 3-3-1. LASSO with cross-validation

In [None]:
#Female model
##For each of the predefined hold-out sets, a LassoCV model is fitted on the remaining
##participants (with an internal `ncvs`-fold CV choosing alpha) and then used to predict
##log-BMI for the held-out participants.
tempDF1 = metDF_a_F#Standardized independent variables
tempDF2 = bmiDF_a_F#Not-standardized dependent variable and info about testing set
yvar = 'BMI'
nmodels = len(tempDF2['Testing'].unique())#Reset just in case
ncvs = 10
##`normalize` is intentionally not passed: it was deprecated in scikit-learn 1.0 and removed in
##1.2 (passing it raises TypeError there). Omitting it keeps the no-normalization behavior.
model = LassoCV(eps=0.05, n_alphas=200, alphas=None, fit_intercept=True,
                precompute='auto', cv=ncvs)
yvar_model = 'MetBMI'

#Perform LASSO
bcoefDF = pd.DataFrame(index=tempDF1.columns).astype('float64')#For beta-coefficients
bcoefDF.index.rename('Variable', inplace=True)
interceptDF = pd.DataFrame(index=['Intercept']).astype('float64')#For intercept
interceptDF.index.rename('Variable', inplace=True)
scoreL = []#For the coefficient of determination R2
predictL = []#For predictions (one Series per model; concatenated once after the loop)
t_start = time.time()
for model_k in range(nmodels):
    #Prepare training and testing (hold-out) datasets in model k
    model_n = 'Model_'+str(model_k+1).zfill(2)
    yDF_train = tempDF2.loc[tempDF2['Testing']!=model_n]
    yDF_test = tempDF2.loc[tempDF2['Testing']==model_n]
    xDF_train = tempDF1.loc[yDF_train.index.tolist()]
    xDF_test = tempDF1.loc[yDF_test.index.tolist()]
    #Retrieve the dependent variable
    yDF_train = pd.DataFrame(yDF_train['log_'+yvar])#Not Series but DF
    yDF_test = pd.DataFrame(yDF_test['log_'+yvar])#Not Series but DF
    #Fitting model with cross-validation using training dataset (i.e., internal training/validation datasets)
    ##y is passed as a 1-D array, the shape the estimator expects for a single target
    model.fit(xDF_train, yDF_train.values.ravel())
    #Check the penalization amount decided by cross validation
    print(model_n+': alpha =', model.alpha_)
    #Save parameters
    bcoefDF[model_n] = model.coef_#w in the cost function formula
    interceptDF[model_n] = model.intercept_#Independent term in decision function
    #Evaluation with testing (hold-out) dataset
    scoreL.append(model.score(xDF_test, yDF_test))
    #Prediction for testing dataset using the fitted model k
    tempS = pd.Series(model.predict(xDF_test),
                      index=xDF_test.index, name='log_'+yvar_model)
    predictL.append(tempS)
predictS = pd.concat(predictL, axis=0)
t_elapsed = time.time() - t_start
print('Elapsed time for '+str(nmodels)+' models of '+str(ncvs)+'-fold CV LASSO:',
      round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')

#Clean prediction DF
tempDF = pd.merge(tempDF2[['Testing', 'log_'+yvar]], predictS,
                  left_index=True, right_index=True, how='left')
##Convert to original scale
tempDF[yvar] = np.e**tempDF['log_'+yvar]
tempDF[yvar_model] = np.e**tempDF['log_'+yvar_model]
display(tempDF)

#Save the cleaned prediction DF
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = yvar_model+'-Female-Arivale.tsv'
tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)
metBMI_F_bcoefs = bcoefDF
metBMI_F_intercept = interceptDF
metBMI_a_F_R2 = scoreL
metBMI_a_F = tempDF

In [None]:
#Male model
##For each of the predefined hold-out sets, a LassoCV model is fitted on the remaining
##participants (with an internal `ncvs`-fold CV choosing alpha) and then used to predict
##log-BMI for the held-out participants.
tempDF1 = metDF_a_M#Standardized independent variables
tempDF2 = bmiDF_a_M#Not-standardized dependent variable and info about testing set
yvar = 'BMI'
nmodels = len(tempDF2['Testing'].unique())#Reset just in case
ncvs = 10
##`normalize` is intentionally not passed: it was deprecated in scikit-learn 1.0 and removed in
##1.2 (passing it raises TypeError there). Omitting it keeps the no-normalization behavior.
model = LassoCV(eps=0.05, n_alphas=200, alphas=None, fit_intercept=True,
                precompute='auto', cv=ncvs)
yvar_model = 'MetBMI'

#Perform LASSO
bcoefDF = pd.DataFrame(index=tempDF1.columns).astype('float64')#For beta-coefficients
bcoefDF.index.rename('Variable', inplace=True)
interceptDF = pd.DataFrame(index=['Intercept']).astype('float64')#For intercept
interceptDF.index.rename('Variable', inplace=True)
scoreL = []#For the coefficient of determination R2
predictL = []#For predictions (one Series per model; concatenated once after the loop)
t_start = time.time()
for model_k in range(nmodels):
    #Prepare training and testing (hold-out) datasets in model k
    model_n = 'Model_'+str(model_k+1).zfill(2)
    yDF_train = tempDF2.loc[tempDF2['Testing']!=model_n]
    yDF_test = tempDF2.loc[tempDF2['Testing']==model_n]
    xDF_train = tempDF1.loc[yDF_train.index.tolist()]
    xDF_test = tempDF1.loc[yDF_test.index.tolist()]
    #Retrieve the dependent variable
    yDF_train = pd.DataFrame(yDF_train['log_'+yvar])#Not Series but DF
    yDF_test = pd.DataFrame(yDF_test['log_'+yvar])#Not Series but DF
    #Fitting model with cross-validation using training dataset (i.e., internal training/validation datasets)
    ##y is passed as a 1-D array, the shape the estimator expects for a single target
    model.fit(xDF_train, yDF_train.values.ravel())
    #Check the penalization amount decided by cross validation
    print(model_n+': alpha =', model.alpha_)
    #Save parameters
    bcoefDF[model_n] = model.coef_#w in the cost function formula
    interceptDF[model_n] = model.intercept_#Independent term in decision function
    #Evaluation with testing (hold-out) dataset
    scoreL.append(model.score(xDF_test, yDF_test))
    #Prediction for testing dataset using the fitted model k
    tempS = pd.Series(model.predict(xDF_test),
                      index=xDF_test.index, name='log_'+yvar_model)
    predictL.append(tempS)
predictS = pd.concat(predictL, axis=0)
t_elapsed = time.time() - t_start
print('Elapsed time for '+str(nmodels)+' models of '+str(ncvs)+'-fold CV LASSO:',
      round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')

#Clean prediction DF
tempDF = pd.merge(tempDF2[['Testing', 'log_'+yvar]], predictS,
                  left_index=True, right_index=True, how='left')
##Convert to original scale
tempDF[yvar] = np.e**tempDF['log_'+yvar]
tempDF[yvar_model] = np.e**tempDF['log_'+yvar_model]
display(tempDF)

#Save the cleaned prediction DF
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = yvar_model+'-Male-Arivale.tsv'
tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)
metBMI_M_bcoefs = bcoefDF
metBMI_M_intercept = interceptDF
metBMI_a_M_R2 = scoreL
metBMI_a_M = tempDF

In [None]:
#Both sex model
##For each of the predefined hold-out sets, a LassoCV model is fitted on the remaining
##participants (with an internal `ncvs`-fold CV choosing alpha) and then used to predict
##log-BMI for the held-out participants.
tempDF1 = metDF_a_B#Standardized independent variables
tempDF2 = bmiDF_a_B#Not-standardized dependent variable and info about testing set
yvar = 'BMI'
nmodels = len(tempDF2['Testing'].unique())#Reset just in case
ncvs = 10
##`normalize` is intentionally not passed: it was deprecated in scikit-learn 1.0 and removed in
##1.2 (passing it raises TypeError there). Omitting it keeps the no-normalization behavior.
model = LassoCV(eps=0.05, n_alphas=200, alphas=None, fit_intercept=True,
                precompute='auto', cv=ncvs)
yvar_model = 'MetBMI'

#Perform LASSO
bcoefDF = pd.DataFrame(index=tempDF1.columns).astype('float64')#For beta-coefficients
bcoefDF.index.rename('Variable', inplace=True)
interceptDF = pd.DataFrame(index=['Intercept']).astype('float64')#For intercept
interceptDF.index.rename('Variable', inplace=True)
scoreL = []#For the coefficient of determination R2
predictL = []#For predictions (one Series per model; concatenated once after the loop)
t_start = time.time()
for model_k in range(nmodels):
    #Prepare training and testing (hold-out) datasets in model k
    model_n = 'Model_'+str(model_k+1).zfill(2)
    yDF_train = tempDF2.loc[tempDF2['Testing']!=model_n]
    yDF_test = tempDF2.loc[tempDF2['Testing']==model_n]
    xDF_train = tempDF1.loc[yDF_train.index.tolist()]
    xDF_test = tempDF1.loc[yDF_test.index.tolist()]
    #Retrieve the dependent variable
    yDF_train = pd.DataFrame(yDF_train['log_'+yvar])#Not Series but DF
    yDF_test = pd.DataFrame(yDF_test['log_'+yvar])#Not Series but DF
    #Fitting model with cross-validation using training dataset (i.e., internal training/validation datasets)
    ##y is passed as a 1-D array, the shape the estimator expects for a single target
    model.fit(xDF_train, yDF_train.values.ravel())
    #Check the penalization amount decided by cross validation
    print(model_n+': alpha =', model.alpha_)
    #Save parameters
    bcoefDF[model_n] = model.coef_#w in the cost function formula
    interceptDF[model_n] = model.intercept_#Independent term in decision function
    #Evaluation with testing (hold-out) dataset
    scoreL.append(model.score(xDF_test, yDF_test))
    #Prediction for testing dataset using the fitted model k
    tempS = pd.Series(model.predict(xDF_test),
                      index=xDF_test.index, name='log_'+yvar_model)
    predictL.append(tempS)
predictS = pd.concat(predictL, axis=0)
t_elapsed = time.time() - t_start
print('Elapsed time for '+str(nmodels)+' models of '+str(ncvs)+'-fold CV LASSO:',
      round(t_elapsed//60), 'min', round(t_elapsed%60, 1), 'sec')

#Clean prediction DF
tempDF = pd.merge(tempDF2[['Testing', 'log_'+yvar]], predictS,
                  left_index=True, right_index=True, how='left')
##Convert to original scale
tempDF[yvar] = np.e**tempDF['log_'+yvar]
tempDF[yvar_model] = np.e**tempDF['log_'+yvar_model]
display(tempDF)

#Save the cleaned prediction DF
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = yvar_model+'-BothSex-Arivale.tsv'
tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)
metBMI_B_bcoefs = bcoefDF
metBMI_B_intercept = interceptDF
metBMI_a_B_R2 = scoreL
metBMI_a_B = tempDF

#### 3-3-2. Prediction accuracy

In [None]:
#Per-model summary: out-of-sample R2 across the fitted models and
#Pearson's correlation between observed and predicted BMI (log and original scale)
tempD1 = {'Female':metBMI_a_F_R2, 'Male':metBMI_a_M_R2, 'Both sex':metBMI_a_B_R2}
tempD2 = {'Female':metBMI_a_F, 'Male':metBMI_a_M, 'Both sex':metBMI_a_B}
yvar = 'BMI'
yvar_model = 'MetBMI'

for sex, tempL in tempD1.items():
    r2_mean = np.mean(tempL)
    r2_sd = np.std(tempL, ddof=1)#Sample standard deviation
    print(sex+' model')
    print(' - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
    print(' - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sd/np.sqrt(len(tempL)))
    tempDF = tempD2[sex]
    ##Same comparison on the log scale (prefix 'log_') and on the original scale (no prefix)
    for prefix in ['log_', '']:
        print(' - Observed vs. predicted '+prefix+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(tempDF[prefix+yvar], tempDF[prefix+yvar_model]))
    display(tempDF.describe())

> Check difference between sex-specific and sex-mixed models for now. Note that this is a rough comparison because the sample size is different.  

In [None]:
#Long-format table of the out-of-sample R2 values: one row per (model type, fitted model)
tempDF = pd.DataFrame({'Female':metBMI_a_F_R2, 'Male':metBMI_a_M_R2, 'Both sex':metBMI_a_B_R2})
tempDF = pd.melt(tempDF, value_vars=list(tempDF.columns), var_name='Cohort', value_name='R2')

#Bars (mean with 95% CI) overlaid with the individual R2 values
sns.set(style='ticks', font='Arial', context='notebook')
fig, ax = plt.subplots(figsize=(3, 1))
sns.barplot(data=tempDF, y='Cohort', x='R2', hue='Cohort', palette='Set1', dodge=False, edgecolor='black',
            ci=95, capsize=0.4, errwidth=1.5, errcolor='black', ax=ax)
sns.stripplot(data=tempDF, y='Cohort', x='R2', hue='Cohort', dodge=False, size=8, edgecolor='black',
              linewidth=1, alpha=0.4, palette={'Female':'gray', 'Male':'gray', 'Both sex':'gray'},
              ax=ax)
sns.despine()
ax.set_xlabel('Out-of-sample '+r'$R^2$'+' in 10 LASSO models\n[mean with 95% CI]')
ax.set_ylabel('')
##An empty legend replaces the one created by `hue` (the y-axis already names the groups)
ax.legend('', frameon=False)
plt.show()

In [None]:
tempD1 = {'Female':metBMI_a_F, 'Male':metBMI_a_M, 'Both sex':metBMI_a_B}
yvar = 'BMI'
yvar_model = 'MetBMI'
range_min = np.min([df[var].min() for df in tempD1.values() for var in [yvar, yvar_model]])
range_max = np.max([df[var].max() for df in tempD1.values() for var in [yvar, yvar_model]])
axis_label = 'BMI [kg m'+r'$^{-2}$'+']'

#Plot measured vs. predicted per model
tempD2 = {'Female':'tab:red', 'Male':'tab:blue', 'Both sex':'tab:green'}
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD2), figsize=(10, 3), sharex=True, sharey=True,
                         gridspec_kw={'hspace':0.1, 'wspace':0.1})
for ax_i, ax in enumerate(axes.flat):
    sex = list(tempD2.keys())[ax_i]
    tempDF = tempD1[sex]
    #Y=X as reference
    ax.plot([range_min, range_max], [range_min, range_max], color='black', linestyle=(0, (1, 2)))
    #Regplot
    sns.regplot(data=tempDF, x=yvar_model, y=yvar, color=tempD2[sex],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':30}, ax=ax)
    if ax_i%len(tempD2)==0:
        plt.setp(ax, xlabel='', ylabel='Measured '+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
    ##Annotate Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF[yvar_model], tempDF[yvar])
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        significand, exponent = pval_text.split(sep='E-')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='x-small', color='k')
    ##Facet label
    ax.set_title(sex, {'fontsize':'medium'})
    #Save position to generate facet and legend later
    if ax_i ==0:
        ax0_pos = ax.get_position().bounds
    elif ax_i==2:
        ax2_pos = ax.get_position().bounds
sns.despine()
fig.text(x=(ax0_pos[0]+(ax2_pos[0]+ax2_pos[2]))/2, y=ax0_pos[1]-ax0_pos[3]*0.2,#Minor manual adjustment
         s='Predicted '+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center', rotation='horizontal')
plt.show()

#Plot difference b/w sex-specific and sex-mixed models
tempD2 = {'Female':'tab:red', 'Male':'tab:blue'}
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD2), figsize=(6.5, 3), sharex=True, sharey=True,
                         gridspec_kw={'hspace':0.1, 'wspace':0.1})
for ax_i, ax in enumerate(axes.flat):
    sex = list(tempD2.keys())[ax_i]
    #Prepare DF
    tempS1 = tempD1[sex][yvar_model]
    tempS1.name = 'Sex-specific'
    tempS2 = tempD1['Both sex'][yvar_model]
    tempS2.name = 'Sex-mixed'
    tempDF = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='inner')
    #Y=X as reference
    ax.plot([range_min, range_max], [range_min, range_max], color='black', linestyle=(0, (1, 2)))
    #Regplot
    sns.regplot(data=tempDF, x='Sex-specific', y='Sex-mixed', color=tempD2[sex],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':30}, ax=ax)
    if ax_i%len(tempD2)==0:
        plt.setp(ax, xlabel='', ylabel='Sex-mixed b'+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
    ##Annotate Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF['Sex-specific'], tempDF['Sex-mixed'])
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        significand, exponent = pval_text.split(sep='E-')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='x-small', color='k')
    ##Facet label
    ax.set_title(sex, {'fontsize':'medium'})
    #Save position to generate facet and legend later
    if ax_i ==0:
        ax0_pos = ax.get_position().bounds
    elif ax_i==1:
        ax1_pos = ax.get_position().bounds
sns.despine()
fig.text(x=(ax0_pos[0]+(ax1_pos[0]+ax1_pos[2]))/2, y=ax0_pos[1]-ax0_pos[3]*0.2,#Minor manual adjustment
         s='Sex-specific b'+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center', rotation='horizontal')
plt.show()

#### 3-3-3. Cleaning beta-coefficient dataframe

In [None]:
yvar_model = 'MetBMI'
model_method = 'LASSO'
tempD1 = {'Female':metBMI_F_bcoefs, 'Male':metBMI_M_bcoefs, 'BothSex':metBMI_B_bcoefs}
tempD2 = {'Female':metBMI_F_intercept, 'Male':metBMI_M_intercept, 'BothSex':metBMI_B_intercept}
#Export location shared by all the sex strata
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
for sex, tempDF1 in tempD1.items():
    #Stack the variable rows and the intercept row into one table
    tempDF2 = tempD2[sex]
    tempDF = pd.concat([tempDF1, tempDF2], axis=0)
    #Summarize each row across the models
    ##Rows are pulled out first so that no summary column feeds into another summary
    tempL = [tempDF.loc[row_n] for row_n in tempDF.index.tolist()]
    tempDF['Mean'] = [tempS.mean() for tempS in tempL]
    tempDF['SD'] = [tempS.std(ddof=1) for tempS in tempL]#Sample standard deviation
    tempDF['nZeros'] = [(tempS==0.0).astype('int64').sum() for tempS in tempL]
    #Save the cleaned DF
    fileName = yvar_model+'-'+sex+'-'+model_method+'bcoefs.tsv'
    tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

    #Check
    ##Split the intercept row (kept as pd.DataFrame) from the variable rows
    is_intercept = tempDF.index.isin(['Intercept'])
    tempDF2 = tempDF.loc[is_intercept]
    tempDF = tempDF.drop(index=['Intercept'])
    n_variables = len(tempDF)
    print(sex+':')
    print(' - Variables:', n_variables)
    ##Variables that were retained (non-zero) in at least one model
    tempDF1 = tempDF.loc[tempDF['nZeros']!=10]
    print(' - Variables with non-zero beta-coefficient:', len(tempDF1),
          '(', len(tempDF1)/n_variables*100, '%)')
    ##Variables that were retained (non-zero) in every model
    tempDF1 = tempDF.loc[tempDF['nZeros']==0]
    print(' - Variables with non-zero beta-coefficient in all 10 models:', len(tempDF1),
          '(', len(tempDF1)/n_variables*100, '%)')
    tempDF1 = tempDF1.sort_values(by='Mean', ascending=False)
    display(tempDF1)
    display(tempDF2)#Intercept
    print('')

### 3-4. Prediction for the TwinsUK cohort

> MetBMI is calculated for the TwinsUK cohort using the Arivale-fitted MetBMI models. Of note, the TwinsUK dataset corresponds to a testing (external) set for all the fitted models. Also, note that beta-coefficients should NOT be averaged across all the 10 MetBMI models to calculate a single MetBMI value for each individual; instead, 10 MetBMI values are calculated for each individual, and then these values are averaged.  

#### 3-4-1. Calculate predictions using the fitted models

> According to the LassoCV source, the self.predict(X) method calls the self.\_decision_function(X) method, which returns “safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_”. In this case, safe_sparse_dot() simply corresponds to a dot product. Hence, a manual calculation from the beta-coefficients and intercept is implemented here.  

In [None]:
#Calculate MetBMI for the TwinsUK cohort from the fitted beta-coefficients and intercepts
tempD1 = {'Female':metDF_t_F, 'Male':metDF_t_M, 'BothSex':metDF_t_B}
tempD2 = {'Female':metBMI_F_bcoefs, 'Male':metBMI_M_bcoefs, 'BothSex':metBMI_B_bcoefs}
tempD3 = {'Female':metBMI_F_intercept, 'Male':metBMI_M_intercept, 'BothSex':metBMI_B_intercept}
yvar_model = 'MetBMI'
tempD4 = {'Female':bmiDF_t_F, 'Male':bmiDF_t_M, 'BothSex':bmiDF_t_B}
yvar = 'BMI'

tempD = {}
for sex in tempD1.keys():
    #Prepare data DF
    tempDF1 = tempD1[sex]
    ##Add dummy intercept variable to data DF
    ##NOTE: the column is written into the source data DF itself (kept as in the original workflow)
    tempDF1['Intercept'] = 1.0

    #Prepare model DF
    tempDF2 = tempD2[sex]#Variables
    tempDF = tempD3[sex]#Intercept
    tempDF2 = pd.concat([tempDF2, tempDF], axis=0)

    #Check that the data columns and the model rows are the same variables in the same order
    ##np.dot aligns by position, not by label; hence, any mismatch would corrupt the predictions
    tempA1 = tempDF1.columns.to_numpy()
    tempA2 = tempDF2.index.to_numpy()
    same_number = len(tempA1)==len(tempA2)
    ##np.array_equal returns False (instead of failing) when the lengths differ
    same_order = np.array_equal(tempA1, tempA2)
    print(sex)
    print(' - nVariables is consistent between data and model DFs:',
          same_number)
    print(' - Variable order is consistent between data and model DFs:',
          same_order)
    if not same_order:
        raise ValueError(sex+': variables of the data DF and the model DF do not match '
                         'in number and/or order; predictions would be misaligned.')

    #Calculate prediction (one column per fitted model)
    tempA = np.dot(tempDF1, tempDF2)
    tempDF = pd.DataFrame(tempA, index=tempDF1.index, columns=tempDF2.columns)

    #Calculate the mean of predicted values
    tempDF['log_'+yvar_model] = tempDF.mean(axis=1)

    #Convert to original scale
    tempDF[yvar_model] = np.e**tempDF['log_'+yvar_model]

    #Add true values
    tempDF1 = tempD4[sex]
    tempDF1 = tempDF1[['log_'+yvar, yvar]]
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='left')

    #Save the cleaned prediction DF
    fileDir = './ExportData/'
    ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
    fileName = yvar_model+'-'+sex+'-TwinsUK.tsv'
    tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

    tempD[sex] = tempDF
    display(tempDF)
    print('')

metBMI_t_F = tempD['Female']
metBMI_t_M = tempD['Male']
metBMI_t_B = tempD['BothSex']

#### 3-4-2. Prediction accuracy

> According to LassoCV source, the self.score(X, y, sample_weight=None) method returns “r2_score(y, y_pred, sample_weight=sample_weight)". Hence, use sklearn.metrics.r2_score() here.  

In [None]:
yvar = 'BMI'
tempD1 = {'Female':metBMI_t_F, 'Male':metBMI_t_M, 'BothSex':metBMI_t_B}

#Out-of-sample R2 of every model, per dataset
tempD2 = {}
for sex, tempDF in tempD1.items():
    #Per-model prediction columns are the ones whose name contains 'Model_'
    is_model = tempDF.columns.str.contains('Model_')
    tempL = tempDF.columns[is_model].tolist()
    #Coefficient of determination R^2 (observed log-scale value vs. each model's prediction)
    tempD2[sex] = [r2_score(tempDF['log_'+yvar], tempDF[model_n], sample_weight=None)
                   for model_n in tempL]
metBMI_t_F_R2 = tempD2['Female']
metBMI_t_M_R2 = tempD2['Male']
metBMI_t_B_R2 = tempD2['BothSex']

In [None]:
#Summary of the prediction performance per model type
yvar = 'BMI'
yvar_model = 'MetBMI'
tempD1 = {'Female':metBMI_t_F_R2, 'Male':metBMI_t_M_R2, 'Both sex':metBMI_t_B_R2}
tempD2 = {'Female':metBMI_t_F, 'Male':metBMI_t_M, 'Both sex':metBMI_t_B}

for sex, tempL in tempD1.items():
    #R2 statistics across the models
    r2_mean = np.mean(tempL)
    r2_sd = np.std(tempL, ddof=1)#Sample standard deviation
    r2_sem = r2_sd/np.sqrt(len(tempL))
    print(sex+' model')
    print(' - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
    print(' - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sem)
    #Correlation between observed and predicted values, on both scales
    tempDF = tempD2[sex]
    tempS1, tempS2 = tempDF['log_'+yvar], tempDF['log_'+yvar_model]
    print(' - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempS1, tempS2))
    tempS1, tempS2 = tempDF[yvar], tempDF[yvar_model]
    print(' - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempS1, tempS2))
    display(tempDF.describe())

> Check difference between sex-specific and sex-mixed models for now. Note that this is a rough comparison because the sample size is different.  

In [None]:
#Long-format DF: one row per (model type, fitted model) pair
tempDF = pd.DataFrame({'Female':metBMI_t_F_R2, 'Male':metBMI_t_M_R2, 'Both sex':metBMI_t_B_R2})
tempDF = pd.melt(tempDF, value_vars=tempDF.columns.tolist(), var_name='Cohort', value_name='R2')

#Bars (mean with bootstrapped 95% CI) overlaid with the individual R2 values
sns.set(style='ticks', font='Arial', context='notebook')
fig, ax = plt.subplots(figsize=(3, 1))
sns.barplot(data=tempDF, y='Cohort', x='R2', hue='Cohort', palette='Set1', dodge=False, edgecolor='black',
            ci=95, capsize=0.4, errwidth=1.5, errcolor='black', ax=ax)
sns.stripplot(data=tempDF, y='Cohort', x='R2', hue='Cohort', dodge=False, size=8, edgecolor='black',
              linewidth=1, alpha=0.4, palette={'Female':'gray', 'Male':'gray', 'Both sex':'gray'}, ax=ax)
sns.despine()
ax.set_xlabel('Out-of-sample '+r'$R^2$'+' in 10 LASSO models\n[mean with 95% CI]')
ax.set_ylabel('')
#The hue legend only repeats the y-axis categories; hence, replace it with an empty one
ax.legend('', frameon=False)
plt.show()

In [None]:
def _format_pval_text(pval):
    """Format a P-value for a plot annotation.

    Parameters
    ----------
    pval : float
        P-value to format.

    Returns
    -------
    pval_text : str
        '1.0' for P == 1; 'NaN' for an undefined P-value; otherwise the value rounded
        (half-up) to two significant digits, written as a plain decimal down to the
        10^-2 order and as 'a.b x 10^-n' (mathtext) below that.
    below_limit : bool
        True if the P-value was exactly 0.0 and was replaced with the float minimum;
        in that case the text should be shown as an upper bound ('<').
    """
    below_limit = False
    if pval==1.0:
        return '1.0', below_limit
    if np.isnan(pval):#Undefined P-value cannot be parsed as scientific notation
        return 'NaN', below_limit
    if pval==0.0:#Due to smaller than the float minimum
        pval = sys.float_info.min
        print('P-value was smaller than the float minimum:', pval)
        below_limit = True
    sci_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
    ##Split on 'E' and keep the sign: a value just below 1 is rounded to '1.000E+0'
    significand, exponent = sci_text.split(sep='E')
    neg_exponent = -int(exponent)
    significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
    if significand=='10.0':#Rounding carried over to the next order of magnitude
        significand = '1.0'
        neg_exponent -= 1
    if neg_exponent>2:
        pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(neg_exponent)##Font is different in r'$ $'...
    elif neg_exponent>0:
        pval_text = '0.'+'0'*(neg_exponent-1)+significand.replace('.', '')
    else:
        pval_text = significand
    return pval_text, below_limit

def _annotate_pearson(ax, tempS1, tempS2):
    """Write Pearson's r and its P-value for (tempS1, tempS2) in the upper-left corner of ax."""
    pearson_r, pval = stats.pearsonr(tempS1, tempS2)
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval_text, below_limit = _format_pval_text(pval)
    relation = ' < ' if below_limit else ' = '
    text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+relation+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='x-small', color='k')

tempD1 = {'Female':metBMI_t_F, 'Male':metBMI_t_M, 'Both sex':metBMI_t_B}
yvar = 'BMI'
yvar_model = 'MetBMI'
#Common axis range covering both the measured and the predicted values of all the datasets
range_min = np.min([df[var].min() for df in tempD1.values() for var in [yvar, yvar_model]])
range_max = np.max([df[var].max() for df in tempD1.values() for var in [yvar, yvar_model]])
axis_label = 'BMI [kg m'+r'$^{-2}$'+']'

#Plot measured vs. predicted per model
tempD2 = {'Female':'tab:red', 'Male':'tab:blue', 'Both sex':'tab:green'}
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD2), figsize=(10, 3), sharex=True, sharey=True,
                         gridspec_kw={'hspace':0.1, 'wspace':0.1})
for ax_i, ax in enumerate(axes.flat):
    sex = list(tempD2.keys())[ax_i]
    tempDF = tempD1[sex]
    #Y=X as reference
    ax.plot([range_min, range_max], [range_min, range_max], color='black', linestyle=(0, (1, 2)))
    #Regplot
    sns.regplot(data=tempDF, x=yvar_model, y=yvar, color=tempD2[sex],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':30}, ax=ax)
    if ax_i%len(tempD2)==0:
        plt.setp(ax, xlabel='', ylabel='Measured '+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
    ##Annotate Pearson's correlation
    _annotate_pearson(ax, tempDF[yvar_model], tempDF[yvar])
    ##Facet label
    ax.set_title(sex, {'fontsize':'medium'})
    #Save position to generate facet and legend later
    if ax_i ==0:
        ax0_pos = ax.get_position().bounds
    elif ax_i==2:
        ax2_pos = ax.get_position().bounds
sns.despine()
##Shared x-axis label, centered under the whole row of facets
fig.text(x=(ax0_pos[0]+(ax2_pos[0]+ax2_pos[2]))/2, y=ax0_pos[1]-ax0_pos[3]*0.2,#Minor manual adjustment
         s='Predicted '+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center', rotation='horizontal')
plt.show()

#Plot difference b/w sex-specific and sex-mixed models
tempD2 = {'Female':'tab:red', 'Male':'tab:blue'}
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD2), figsize=(6.5, 3), sharex=True, sharey=True,
                         gridspec_kw={'hspace':0.1, 'wspace':0.1})
for ax_i, ax in enumerate(axes.flat):
    sex = list(tempD2.keys())[ax_i]
    #Prepare DF
    ##rename() returns a new Series; hence, the columns of the prediction DFs keep their names
    tempS1 = tempD1[sex][yvar_model].rename('Sex-specific')
    tempS2 = tempD1['Both sex'][yvar_model].rename('Sex-mixed')
    ##Inner join keeps only the individuals of this sex
    tempDF = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='inner')
    #Y=X as reference
    ax.plot([range_min, range_max], [range_min, range_max], color='black', linestyle=(0, (1, 2)))
    #Regplot
    sns.regplot(data=tempDF, x='Sex-specific', y='Sex-mixed', color=tempD2[sex],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':30}, ax=ax)
    if ax_i%len(tempD2)==0:
        plt.setp(ax, xlabel='', ylabel='Sex-mixed b'+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
    ##Annotate Pearson's correlation
    _annotate_pearson(ax, tempDF['Sex-specific'], tempDF['Sex-mixed'])
    ##Facet label
    ax.set_title(sex, {'fontsize':'medium'})
    #Save position to generate facet and legend later
    if ax_i ==0:
        ax0_pos = ax.get_position().bounds
    elif ax_i==1:
        ax1_pos = ax.get_position().bounds
sns.despine()
##Shared x-axis label, centered under the whole row of facets
fig.text(x=(ax0_pos[0]+(ax1_pos[0]+ax1_pos[2]))/2, y=ax0_pos[1]-ax0_pos[3]*0.2,#Minor manual adjustment
         s='Sex-specific b'+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center', rotation='horizontal')
plt.show()

## 4. Standard clinical measures

> In the Arivale analysis, a model predicting BMI was generated using ordinary least squares (OLS) linear regression with sex, age, HDL-cholesterol, LDL-cholesterol, triglycerides (TG), glucose, insulin, and HOMA-IR. To directly compare with the LASSO models, the regression analysis was applied to the 10 split sets.  

> As with MetBMI, standard clinical measure-inferred BMI (StandBMI) is calculated. Because the independent variables are the same as in the previous analysis, the predictions for the Arivale cohort are exactly the same as before. Hence, only the predictions for the TwinsUK cohort need to be calculated.  

### 4-1. Standardization

In [None]:
tempD1 = {'Arivale':standDF_a, 'TwinsUK':standDF_t}
tempD2 = {'Female':bmiDF_a_F, 'Male':bmiDF_a_M, 'BothSex':bmiDF_a_B}
tempD3 = {'Female':bmiDF_t_F, 'Male':bmiDF_t_M, 'BothSex':bmiDF_t_B}
#Sex-stratified BMI DFs of each cohort (only their indices are used to pick the rows)
bmiDF_by_cohort = {'Arivale':tempD2, 'TwinsUK':tempD3}

#Standardization per dataset (cohort x sex stratum)
tempD4 = {}
for cohort, tempDF in tempD1.items():
    for sex in tempD2.keys():
        dataset = cohort+' - '+sex
        #Prepare sex-stratified dataset
        tempL = bmiDF_by_cohort[cohort][sex].index.tolist()
        tempDF1 = tempDF.loc[tempL]

        #Z-score transformation of the numeric columns
        tempDF2 = tempDF1.select_dtypes(include=[np.number])
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        tempDF2 = pd.DataFrame(data=scaler.fit_transform(tempDF2),#Column direction
                               index=tempDF2.index, columns=tempDF2.columns)
        ##Recover categorical covariates (appended after the numeric columns)
        tempDF3 = tempDF1.select_dtypes(exclude=[np.number])
        tempDF1 = tempDF2.merge(tempDF3, left_index=True, right_index=True, how='left')

        tempD4[dataset] = tempDF1

        #Confirmation
        print(dataset+':', tempDF1.shape)
        display(tempDF1.describe(include='all'))
        ##Overlaid distribution of every standardized variable
        tempDF1 = tempDF1.select_dtypes(include=[np.number])
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(4, 3))
        for col_i, col_n in enumerate(tempDF1.columns):
            sns.distplot(tempDF1.iloc[:, col_i], label=col_n)
        sns.despine()
        plt.xlabel(r'$Z$'+'-score')
        plt.ylabel('Density')
        plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
        plt.show()
        print('')
    print('')

#Save
standDF_a_F = tempD4['Arivale - Female']
standDF_a_M = tempD4['Arivale - Male']
standDF_a_B = tempD4['Arivale - BothSex']
standDF_t_F = tempD4['TwinsUK - Female']
standDF_t_M = tempD4['TwinsUK - Male']
standDF_t_B = tempD4['TwinsUK - BothSex']

### 4-2. One-hot encoding for categorical variables

> While statsmodels automatically recognizes categorical variables, one-hot encoding is required in scikit-learn.  
> –> In this case, only sex is the categorical variable; hence, map manually! (cf. in many cases, category_encoders is more useful than sklearn.preprocessing or pandas.get_dummies.)  

In [None]:
tempD1 = {'Arivale - Female':standDF_a_F, 'Arivale - Male':standDF_a_M, 'Arivale - BothSex':standDF_a_B,
          'TwinsUK - Female':standDF_t_F, 'TwinsUK - Male':standDF_t_M, 'TwinsUK - BothSex':standDF_t_B}
#Numeric code for each sex label
tempD = {'F':0, 'M':1}
tempD2 = {}
for dataset in tempD1.keys():
    #Work on a copy so that the input DF of this cell is not modified
    tempDF = tempD1[dataset].copy()
    #One-hot encoding for sex
    ##Map only while string labels are still present: mapping the already encoded 0/1 values
    ##again (e.g., when this cell is re-run) would turn every value into NaN
    if tempDF['Sex'].isin(list(tempD.keys())).any():
        tempDF['Sex'] = tempDF['Sex'].map(tempD)
    tempD2[dataset] = tempDF
#Update
standDF_a_F = tempD2['Arivale - Female']
standDF_a_M = tempD2['Arivale - Male']
standDF_a_B = tempD2['Arivale - BothSex']
standDF_t_F = tempD2['TwinsUK - Female']
standDF_t_M = tempD2['TwinsUK - Male']
standDF_t_B = tempD2['TwinsUK - BothSex']

### 4-3. OLS linear regression with the Arivale dataset

#### 4-3-1. Import the previous models and predictions

In [None]:
yvar_model = 'StandBMI'
model_method = 'OLS'
#The model tables and the prediction tables share the same directory and file-name prefix
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
tempD1 = {}#Fitted models
tempD2 = {}#Predictions
for sex in ['Female', 'Male', 'BothSex']:
    print(sex+':')

    #Import the fitted models
    fileName = yvar_model+'-'+sex+'-'+model_method+'bcoefs.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('Variable')
    ##Drop summary columns so that only the per-model coefficients remain
    tempL = ['Mean', 'SD', 'nZeros']
    tempD1[sex] = tempDF.drop(columns=tempL)
    ##Check
    print(' - Variables (without intercept):', len(tempD1[sex])-1)
    display(tempD1[sex])

    #Import the predictions
    fileName = yvar_model+'-'+sex+'.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF = tempDF.set_index('public_client_id')
    ##Change column names (remove the 'Base' tag)
    tempDF.columns = tempDF.columns.str.replace('Base', '')
    tempD2[sex] = tempDF
    display(tempDF)

standBMI_F_model = tempD1['Female']
standBMI_M_model = tempD1['Male']
standBMI_B_model = tempD1['BothSex']
standBMI_a_F = tempD2['Female']
standBMI_a_M = tempD2['Male']
standBMI_a_B = tempD2['BothSex']

#### 4-3-2. Reproduce prediction accuracy

> According to LinearRegression source, the self.score(X, y, sample_weight=None) method returns “r2_score(y, y_pred, sample_weight=sample_weight)". Hence, use sklearn.metrics.r2_score() here.  

In [None]:
#Out-of-sample R2 per testing set
yvar = 'BMI'
yvar_model = 'StandBMI'
tempD1 = {'Female':standBMI_a_F, 'Male':standBMI_a_M, 'BothSex':standBMI_a_B}
tempD2 = {}
for sex, tempDF in tempD1.items():
    tempL1 = []#For the coefficient of determination R^2
    for model_n in tempDF['Testing'].unique().tolist():
        #Rows labelled with this value of the 'Testing' column
        is_testing = tempDF['Testing']==model_n
        tempS1 = tempDF.loc[is_testing, 'log_'+yvar]
        tempS2 = tempDF.loc[is_testing, 'log_'+yvar_model]
        tempL1.append(r2_score(tempS1, tempS2, sample_weight=None))
    tempD2[sex] = tempL1
standBMI_a_F_R2 = tempD2['Female']
standBMI_a_M_R2 = tempD2['Male']
standBMI_a_B_R2 = tempD2['BothSex']

In [None]:
#Summary of the prediction performance per model type
yvar = 'BMI'
yvar_model = 'StandBMI'
tempD1 = {'Female':standBMI_a_F_R2, 'Male':standBMI_a_M_R2, 'Both sex':standBMI_a_B_R2}
tempD2 = {'Female':standBMI_a_F, 'Male':standBMI_a_M, 'Both sex':standBMI_a_B}

for sex, tempL in tempD1.items():
    #R2 statistics across the models
    r2_mean = np.mean(tempL)
    r2_sd = np.std(tempL, ddof=1)#Sample standard deviation
    r2_sem = r2_sd/np.sqrt(len(tempL))
    print(sex+' model')
    print(' - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
    print(' - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sem)
    #Correlation between observed and predicted values, on both scales
    tempDF = tempD2[sex]
    tempS1, tempS2 = tempDF['log_'+yvar], tempDF['log_'+yvar_model]
    print(' - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempS1, tempS2))
    tempS1, tempS2 = tempDF[yvar], tempDF[yvar_model]
    print(' - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempS1, tempS2))
    display(tempDF.describe())

### 4-4. Prediction for the TwinsUK cohort

#### 4-4-1. Calculate predictions using the fitted models

> According to the LinearRegression source, the self.predict(X) method calls the self.\_decision_function(X) method, which returns “safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_”. In this case, safe_sparse_dot() simply corresponds to a dot product. Hence, a manual calculation from the beta-coefficients and intercept is implemented here.  

In [None]:
#Calculate StandBMI for the TwinsUK cohort from the fitted beta-coefficients and intercepts
tempD1 = {'Female':standDF_t_F, 'Male':standDF_t_M, 'BothSex':standDF_t_B}
tempD2 = {'Female':standBMI_F_model, 'Male':standBMI_M_model, 'BothSex':standBMI_B_model}
yvar_model = 'StandBMI'
tempD4 = {'Female':bmiDF_t_F, 'Male':bmiDF_t_M, 'BothSex':bmiDF_t_B}
yvar = 'BMI'

tempD = {}
for sex in tempD1.keys():
    #Prepare data DF
    tempDF1 = tempD1[sex]
    ##Add dummy intercept variable to data DF
    ##NOTE: the column is written into the source data DF itself (kept as in the original workflow)
    tempDF1['Intercept'] = 1.0

    #Prepare model DF
    tempDF2 = tempD2[sex]#Variables + Intercept

    #Check that the data columns and the model rows are the same variables in the same order
    ##np.dot aligns by position, not by label; hence, any mismatch would corrupt the predictions
    tempA1 = tempDF1.columns.to_numpy()
    tempA2 = tempDF2.index.to_numpy()
    same_number = len(tempA1)==len(tempA2)
    ##np.array_equal returns False (instead of failing) when the lengths differ
    same_order = np.array_equal(tempA1, tempA2)
    print(sex)
    print(' - nVariables is consistent between data and model DFs:',
          same_number)
    print(' - Variable order is consistent between data and model DFs:',
          same_order)
    if not same_order:
        raise ValueError(sex+': variables of the data DF and the model DF do not match '
                         'in number and/or order; predictions would be misaligned.')

    #Calculate prediction (one column per fitted model)
    tempA = np.dot(tempDF1, tempDF2)
    tempDF = pd.DataFrame(tempA, index=tempDF1.index, columns=tempDF2.columns)

    #Calculate the mean of predicted values
    tempDF['log_'+yvar_model] = tempDF.mean(axis=1)

    #Convert to original scale
    tempDF[yvar_model] = np.e**tempDF['log_'+yvar_model]

    #Add true values
    tempDF1 = tempD4[sex]
    tempDF1 = tempDF1[['log_'+yvar, yvar]]
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='left')

    #Save the cleaned prediction DF
    fileDir = './ExportData/'
    ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
    fileName = yvar_model+'-'+sex+'-TwinsUK.tsv'
    tempDF.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

    tempD[sex] = tempDF
    display(tempDF)
    print('')

standBMI_t_F = tempD['Female']
standBMI_t_M = tempD['Male']
standBMI_t_B = tempD['BothSex']

#### 4-4-2. Prediction accuracy

> According to LinearRegression source, the self.score(X, y, sample_weight=None) method returns “r2_score(y, y_pred, sample_weight=sample_weight)". Hence, use sklearn.metrics.r2_score() here.  

In [None]:
yvar = 'BMI'
tempD1 = {'Female':standBMI_t_F, 'Male':standBMI_t_M, 'BothSex':standBMI_t_B}

#Out-of-sample R2 of every model, per dataset
tempD2 = {}
for sex, tempDF in tempD1.items():
    #Per-model prediction columns are the ones whose name contains 'Model_'
    is_model = tempDF.columns.str.contains('Model_')
    tempL = tempDF.columns[is_model].tolist()
    #Coefficient of determination R^2 (observed log-scale value vs. each model's prediction)
    tempD2[sex] = [r2_score(tempDF['log_'+yvar], tempDF[model_n], sample_weight=None)
                   for model_n in tempL]
standBMI_t_F_R2 = tempD2['Female']
standBMI_t_M_R2 = tempD2['Male']
standBMI_t_B_R2 = tempD2['BothSex']

In [None]:
#Summary of the prediction performance per model type
yvar = 'BMI'
yvar_model = 'StandBMI'
tempD1 = {'Female':standBMI_t_F_R2, 'Male':standBMI_t_M_R2, 'Both sex':standBMI_t_B_R2}
tempD2 = {'Female':standBMI_t_F, 'Male':standBMI_t_M, 'Both sex':standBMI_t_B}

for sex, tempL in tempD1.items():
    #R2 statistics across the models
    r2_mean = np.mean(tempL)
    r2_sd = np.std(tempL, ddof=1)#Sample standard deviation
    r2_sem = r2_sd/np.sqrt(len(tempL))
    print(sex+' model')
    print(' - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
    print(' - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sem)
    #Correlation between observed and predicted values, on both scales
    tempDF = tempD2[sex]
    tempS1, tempS2 = tempDF['log_'+yvar], tempDF['log_'+yvar_model]
    print(' - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempS1, tempS2))
    tempS1, tempS2 = tempDF[yvar], tempDF[yvar_model]
    print(' - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempS1, tempS2))
    display(tempDF.describe())

> Check difference between sex-specific and sex-mixed models for now. Note that this is a rough comparison because the sample size is different.  

In [None]:
#Long-format DF: one row per (model type, fitted model) pair
tempDF = pd.DataFrame({'Female':standBMI_t_F_R2, 'Male':standBMI_t_M_R2, 'Both sex':standBMI_t_B_R2})
tempDF = pd.melt(tempDF, value_vars=tempDF.columns.tolist(), var_name='Cohort', value_name='R2')

#Bars (mean with bootstrapped 95% CI) overlaid with the individual R2 values
sns.set(style='ticks', font='Arial', context='notebook')
fig, ax = plt.subplots(figsize=(3, 1))
sns.barplot(data=tempDF, y='Cohort', x='R2', hue='Cohort', palette='Set1', dodge=False, edgecolor='black',
            ci=95, capsize=0.4, errwidth=1.5, errcolor='black', ax=ax)
sns.stripplot(data=tempDF, y='Cohort', x='R2', hue='Cohort', dodge=False, size=8, edgecolor='black',
              linewidth=1, alpha=0.4, palette={'Female':'gray', 'Male':'gray', 'Both sex':'gray'}, ax=ax)
sns.despine()
ax.set_xlabel('Out-of-sample '+r'$R^2$'+' in 10 OLS-LR models\n[mean with 95% CI]')
ax.set_ylabel('')
#The hue legend only repeats the y-axis categories; hence, replace it with an empty one
ax.legend('', frameon=False)
plt.show()

In [None]:
tempD1 = {'Female':standBMI_t_F, 'Male':standBMI_t_M, 'Both sex':standBMI_t_B}
yvar = 'BMI'
yvar_model = 'StandBMI'
range_min = np.min([df[var].min() for df in tempD1.values() for var in [yvar, yvar_model]])
range_max = np.max([df[var].max() for df in tempD1.values() for var in [yvar, yvar_model]])
axis_label = 'BMI [kg m'+r'$^{-2}$'+']'

#Plot measured vs. predicted per model
tempD2 = {'Female':'tab:red', 'Male':'tab:blue', 'Both sex':'tab:green'}
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD2), figsize=(10, 3), sharex=True, sharey=True,
                         gridspec_kw={'hspace':0.1, 'wspace':0.1})
for ax_i, ax in enumerate(axes.flat):
    sex = list(tempD2.keys())[ax_i]
    tempDF = tempD1[sex]
    #Y=X as reference
    ax.plot([range_min, range_max], [range_min, range_max], color='black', linestyle=(0, (1, 2)))
    #Regplot
    sns.regplot(data=tempDF, x=yvar_model, y=yvar, color=tempD2[sex],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':30}, ax=ax)
    if ax_i%len(tempD2)==0:
        plt.setp(ax, xlabel='', ylabel='Measured '+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
    ##Annotate Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF[yvar_model], tempDF[yvar])
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        significand, exponent = pval_text.split(sep='E-')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':
            significand = '1.0'
            exponent = str(int(exponent)-1)
        if int(exponent)>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif int(exponent)>0:
            pval_text = '0.'+'0'*(int(exponent)-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='x-small', color='k')
    ##Facet label
    ax.set_title(sex, {'fontsize':'medium'})
    #Save position to generate facet and legend later
    if ax_i ==0:
        ax0_pos = ax.get_position().bounds
    elif ax_i==2:
        ax2_pos = ax.get_position().bounds
sns.despine()
fig.text(x=(ax0_pos[0]+(ax2_pos[0]+ax2_pos[2]))/2, y=ax0_pos[1]-ax0_pos[3]*0.2,#Minor manual adjustment
         s='Predicted '+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center', rotation='horizontal')
plt.show()

#Plot difference b/w sex-specific and sex-mixed models
#One facet per sex: x = prediction from the sex-specific model, y = prediction from the sex-mixed
#('Both sex') model for the same index entries (inner join), with Y=X drawn as reference.
#Uses tempD1, yvar_model, range_min, range_max and axis_label, which must already be defined
#when this code runs.
tempD2 = {'Female':'tab:red', 'Male':'tab:blue'}
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD2), figsize=(6.5, 3), sharex=True, sharey=True,
                         gridspec_kw={'hspace':0.1, 'wspace':0.1})
for ax_i, ax in enumerate(axes.flat):
    sex = list(tempD2.keys())[ax_i]
    #Prepare DF
    ##Use .rename() (returns a new Series) instead of assigning .name, which would relabel the
    ##Series object obtained from the source DataFrame and could leak into later cells
    tempS1 = tempD1[sex][yvar_model].rename('Sex-specific')
    tempS2 = tempD1['Both sex'][yvar_model].rename('Sex-mixed')
    tempDF = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='inner')
    #Y=X as reference
    ax.plot([range_min, range_max], [range_min, range_max], color='black', linestyle=(0, (1, 2)))
    #Regplot
    sns.regplot(data=tempDF, x='Sex-specific', y='Sex-mixed', color=tempD2[sex],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':30}, ax=ax)
    #Only the left-most facet keeps the y-axis title and tick labels
    if ax_i%len(tempD2)==0:
        plt.setp(ax, xlabel='', ylabel='Sex-mixed b'+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
    ##Annotate Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF['Sex-specific'], tempDF['Sex-mixed'])
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        ##Split on 'E' (not 'E-'): a P-value that rounds up to 1.000E+0 has no 'E-' in its text
        significand, exponent = pval_text.split(sep='E')
        exponent = -int(exponent)#Magnitude of the negative power of ten (0 for 'E+0')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over (e.g., 9.96 -> 10.0); renormalize
            significand = '1.0'
            exponent = exponent-1
        if exponent>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif exponent>0:
            #Plain decimal notation for 0.01 <= P < 1
            pval_text = '0.'+'0'*(exponent-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='x-small', color='k')
    ##Facet label
    ax.set_title(sex, {'fontsize':'medium'})
    #Save position to generate facet and legend later
    if ax_i ==0:
        ax0_pos = ax.get_position().bounds
    elif ax_i==1:
        ax1_pos = ax.get_position().bounds
sns.despine()
#Common x-axis title, centered across both facets and placed below the axes
fig.text(x=(ax0_pos[0]+(ax1_pos[0]+ax1_pos[2]))/2, y=ax0_pos[1]-ax0_pos[3]*0.2,#Minor manual adjustment
         s='Sex-specific b'+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center', rotation='horizontal')
plt.show()

## 5. Validation

> Prediction performance in the TwinsUK cohort serves as the main validation index. In addition, the variables retained in the MetBMI models should be compared between the previous full version and this restricted version.  

### 5-1. Out-of-sample R2

In [None]:
#Compare the out-of-sample R2 of the StandBMI and MetBMI models (sex-mixed models) within each cohort:
#Welch's t-test per cohort, Benjamini–Hochberg adjustment, TSV export, and a bar/strip plot per cohort.
#standBMI_[a/t]_B_R2 and metBMI_[a/t]_B_R2 come from earlier cells (presumably lists of per-model R2 scores).
tempD1 = {'Standard measures':standBMI_a_B_R2, 'Metabolomics':metBMI_a_B_R2}
tempD2 = {'Standard measures':standBMI_t_B_R2, 'Metabolomics':metBMI_t_B_R2}
tempD1 = {'Arivale':tempD1, 'TwinsUK':tempD2}#Cohort -> category -> R2 scores
tempD2 = {'Standard measures':'0.5', 'Metabolomics':'b'}#Category -> bar color (also defines the category order)
tempD3 = {'Arivale':plt.get_cmap('tab20')(1), 'TwinsUK':plt.get_cmap('tab20')(3)}#Cohort -> facet strip color

#Prepare DFs
tempD4 = {}#Cohort -> DF of R2 scores (one column per category)
tempD = {}#Cohort -> DF of test results
for cohort in tempD1.keys():
    #Prepare DF
    tempDF = pd.DataFrame(tempD1[cohort])
    tempD4[cohort] = tempDF
    print(cohort)
    display(tempDF.describe())
    
    #Statistical tests
    ##The first category is the control; every other category is contrasted against it
    control = list(tempD2.keys())[0]
    tempL = list(tempD2.keys())[1:]
    tempDF1 = pd.DataFrame(columns=['Control', 'Contrast', 'Control_N', 'contrast_N', 'DoF', 'tStat', 'Pval'])
    for contrast in tempL:
        tempS1 = tempDF[control]
        tempS2 = tempDF[contrast]
        #Two-sided Welch's t-test
        tstat, pval, dof = weightstats.ttest_ind(tempS1, tempS2,
                                                 alternative='two-sided', usevar='unequal')
        size1 = len(tempS1)
        size2 = len(tempS2)
        tempDF1.loc[contrast+'-vs-'+control] = [control, contrast, size1, size2, dof, tstat, pval]
    ##P-value adjustment (within cohort) by using Benjamini–Hochberg method
    tempDF1['AdjPval_cohort'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                                    is_sorted=False, returnsorted=False)[1]
    tempDF1['Cohort'] = cohort
    tempD[cohort] = tempDF1
tempDF1 = pd.concat(list(tempD.values()), axis=0)
##P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF1['AdjPval_all'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
tempDF1.index.rename('ComparisonLabel', inplace=True)
tempDF1 = tempDF1.reset_index().set_index(['Cohort', 'ComparisonLabel'])
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'R2-comparison-BothSex.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot
axis_ymin = -0.175
axis_ymax = 0.925
ymin = 0.0
ymax = 0.8
yinter = 0.2
aline_ymin = 0.8#Baseline height (data coordinates) of the first annotation bracket
aline_yinter = 0.1#Vertical spacing between stacked annotation brackets
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD4),
                         figsize=(1.5*len(tempD4), 4), sharex=True, sharey=True)
for ax_i, ax in enumerate(axes.flat):
    cohort = list(tempD4.keys())[ax_i]
    tempDF = tempD4[cohort]
    tempDF = tempDF.melt(var_name='Category', value_name='R2', value_vars=tempDF.columns.tolist())
    sns.barplot(data=tempDF, y='R2', x='Category', order=tempD2.keys(),
                hue='Category', hue_order=tempD2.keys(), dodge=False, palette=tempD2,
                ci=95, capsize=0.4, errwidth=1.5, errcolor='black', edgecolor='black', ax=ax)
    sns.stripplot(data=tempDF, y='R2', x='Category', order=tempD2.keys(),
                  hue='Category', hue_order=tempD2.keys(), dodge=False, jitter=0.3,
                  size=5, edgecolor='black', color='gray', linewidth=1, alpha=0.4, ax=ax)
    #P-value annotation
    tempDF2 = tempDF1.loc[cohort]#MultiIndex
    for row_i in range(len(tempDF2)):
        #Control
        group_0 = tempDF2['Control'].iloc[row_i]
        xcoord_0 = list(tempD2.keys()).index(group_0)
        #Contrast
        group_1 = tempDF2['Contrast'].iloc[row_i]
        xcoord_1 = list(tempD2.keys()).index(group_1)
        #Standard point of marker
        xcoord = (xcoord_0+xcoord_1)/2
        ycoord = aline_ymin + aline_yinter*row_i
        #Add annotation lines
        aline_offset = yinter/10
        aline_length = yinter/10 + aline_offset
        ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                lw=1.5, c='k')
        #Retrieve P-value
        pval = tempDF2['AdjPval_all'].iloc[row_i]
        if pval<0.001:
            label = '***'
        elif pval<0.01:
            label = '**'
        elif pval<0.05:
            label = '*'
        else:
            ##Go through str(): Decimal(float) keeps the exact binary expansion (e.g., 0.285 ->
            ##0.28499...), which would make ROUND_HALF_UP round the displayed value down
            pval_text = str(Decimal(str(pval)).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
            label = r'$P$'+' = '+pval_text
        #Add annotation text
        if label in ['***', '**', '*']:
            text_offset = yinter/25
            text_size = 'medium'
        else:
            text_offset = yinter/5
            text_size = 'x-small'
        ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                    horizontalalignment='center', verticalalignment='bottom',
                    fontsize=text_size, color='k')
    #Facet label
    ax.set_title(cohort, {'fontsize':'medium'})
    xoff = 0.025
    yoff = 0.01
    ##Colored strip behind the title, drawn in axes coordinates just above the plotting area
    rect = plt.Rectangle((xoff, 1+yoff), 1-xoff, 0.1,#Manual adjustment
                         transform=ax.transAxes, facecolor=tempD3[cohort],
                         clip_on=False, linewidth=0, zorder=0.5)
    ax.add_patch(rect)
    #Legend
    ax.get_legend().remove()
    #Axis setting
    plt.setp(ax.get_xticklabels(), rotation=70,
             horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
    if ax_i==0:
        plt.setp(ax, xlabel='', ylabel='Out-of-sample '+r'$R^2$')
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
sns.despine()
plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
##Save
fileDir = './ExportFigures/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'R2-comparison-BothSex.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

In [None]:
#Compare the out-of-sample R2 of the StandBMI and MetBMI models within each sex-specific model and cohort:
#Welch's t-test per (sex, cohort), Benjamini–Hochberg adjustment at three levels, TSV export, and a
#bar/strip plot with one facet per (sex, cohort).
#standBMI_[a/t]_[F/M]_R2 and metBMI_[a/t]_[F/M]_R2 come from earlier cells (presumably lists of per-model R2 scores).
tempD1 = {'Standard measures':standBMI_a_F_R2, 'Metabolomics':metBMI_a_F_R2}
tempD2 = {'Standard measures':standBMI_t_F_R2, 'Metabolomics':metBMI_t_F_R2}
tempD3 = {'Arivale':tempD1, 'TwinsUK':tempD2}
tempD1 = {'Standard measures':standBMI_a_M_R2, 'Metabolomics':metBMI_a_M_R2}
tempD2 = {'Standard measures':standBMI_t_M_R2, 'Metabolomics':metBMI_t_M_R2}
tempD4 = {'Arivale':tempD1, 'TwinsUK':tempD2}
tempD1 = {'Female':tempD3, 'Male':tempD4}#Sex -> cohort -> category -> R2 scores
tempD2 = {'Standard measures':'0.5', 'Metabolomics':'b'}#Category -> bar color (also defines the category order)
tempD3 = {'Arivale':plt.get_cmap('tab20')(1), 'TwinsUK':plt.get_cmap('tab20')(3)}#Cohort -> facet strip color

#Prepare DFs
tempD4 = {}#'<sex> model: <cohort>' -> DF of R2 scores (one column per category)
tempD5 = {}#Sex -> DF of test results
for sex in tempD1.keys():
    tempD = tempD1[sex]
    tempD6 = {}#Cohort -> DF of test results (within this sex)
    for cohort in tempD.keys():
        #Prepare DF
        tempDF = pd.DataFrame(tempD[cohort])
        tempD4[sex+' model: '+cohort] = tempDF
        print(sex+' model: '+cohort)
        display(tempDF.describe())
        
        #Statistical tests
        ##The first category is the control; every other category is contrasted against it
        control = list(tempD2.keys())[0]
        tempL = list(tempD2.keys())[1:]
        tempDF1 = pd.DataFrame(columns=['Control', 'Contrast', 'Control_N', 'contrast_N', 'DoF', 'tStat', 'Pval'])
        for contrast in tempL:
            tempS1 = tempDF[control]
            tempS2 = tempDF[contrast]
            #Two-sided Welch's t-test
            tstat, pval, dof = weightstats.ttest_ind(tempS1, tempS2,
                                                     alternative='two-sided', usevar='unequal')
            size1 = len(tempS1)
            size2 = len(tempS2)
            tempDF1.loc[contrast+'-vs-'+control] = [control, contrast, size1, size2, dof, tstat, pval]
        ##P-value adjustment (within sex and cohort) by using Benjamini–Hochberg method
        tempDF1['AdjPval_sex-cohort'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                                            is_sorted=False, returnsorted=False)[1]
        tempDF1['Cohort'] = cohort
        tempD6[cohort] = tempDF1
    tempDF1 = pd.concat(list(tempD6.values()), axis=0)
    ##P-value adjustment (within sex) by using Benjamini–Hochberg method
    tempDF1['AdjPval_sex'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                                 is_sorted=False, returnsorted=False)[1]
    tempDF1['Sex'] = sex
    tempD5[sex] = tempDF1
tempDF1 = pd.concat(list(tempD5.values()), axis=0)
##P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF1['AdjPval_all'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
tempDF1.index.rename('ComparisonLabel', inplace=True)
tempDF1 = tempDF1.reset_index().set_index(['Sex', 'Cohort', 'ComparisonLabel'])
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'R2-comparison-FemaleMale.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot
axis_ymin = -0.175
axis_ymax = 0.925
ymin = 0.0
ymax = 0.8
yinter = 0.2
aline_ymin = 0.8#Baseline height (data coordinates) of the first annotation bracket
aline_yinter = 0.1#Vertical spacing between stacked annotation brackets
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD4),
                         figsize=(1.5*len(tempD4), 4), sharex=True, sharey=True)
for ax_i, ax in enumerate(axes.flat):
    ##Keep the facet key in its own name: 'label' is reused below for the significance text
    facet_label = list(tempD4.keys())[ax_i]
    sex, cohort = facet_label.split(' model: ')
    tempDF = tempD4[facet_label]
    tempDF = tempDF.melt(var_name='Category', value_name='R2', value_vars=tempDF.columns.tolist())
    sns.barplot(data=tempDF, y='R2', x='Category', order=tempD2.keys(),
                hue='Category', hue_order=tempD2.keys(), dodge=False, palette=tempD2,
                ci=95, capsize=0.4, errwidth=1.5, errcolor='black', edgecolor='black', ax=ax)
    sns.stripplot(data=tempDF, y='R2', x='Category', order=tempD2.keys(),
                  hue='Category', hue_order=tempD2.keys(), dodge=False, jitter=0.3,
                  size=5, edgecolor='black', color='gray', linewidth=1, alpha=0.4, ax=ax)
    #P-value annotation
    tempDF2 = tempDF1.loc[(sex, cohort)]#MultiIndex
    for row_i in range(len(tempDF2)):
        #Control
        group_0 = tempDF2['Control'].iloc[row_i]
        xcoord_0 = list(tempD2.keys()).index(group_0)
        #Contrast
        group_1 = tempDF2['Contrast'].iloc[row_i]
        xcoord_1 = list(tempD2.keys()).index(group_1)
        #Standard point of marker
        xcoord = (xcoord_0+xcoord_1)/2
        ycoord = aline_ymin + aline_yinter*row_i
        #Add annotation lines
        aline_offset = yinter/10
        aline_length = yinter/10 + aline_offset
        ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                lw=1.5, c='k')
        #Retrieve P-value
        pval = tempDF2['AdjPval_all'].iloc[row_i]
        if pval<0.001:
            label = '***'
        elif pval<0.01:
            label = '**'
        elif pval<0.05:
            label = '*'
        else:
            ##Go through str(): Decimal(float) keeps the exact binary expansion (e.g., 0.285 ->
            ##0.28499...), which would make ROUND_HALF_UP round the displayed value down
            pval_text = str(Decimal(str(pval)).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
            label = r'$P$'+' = '+pval_text
        #Add annotation text
        if label in ['***', '**', '*']:
            text_offset = yinter/25
            text_size = 'medium'
        else:
            text_offset = yinter/5
            text_size = 'x-small'
        ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                    horizontalalignment='center', verticalalignment='bottom',
                    fontsize=text_size, color='k')
    #Facet label
    ax.set_title(cohort, {'fontsize':'medium'})
    xoff = 0.025
    yoff = 0.01
    ##Colored strip behind the title, drawn in axes coordinates just above the plotting area
    rect = plt.Rectangle((xoff, 1+yoff), 1-xoff, 0.1,#Manual adjustment
                         transform=ax.transAxes, facecolor=tempD3[cohort],
                         clip_on=False, linewidth=0, zorder=0.5)
    ax.add_patch(rect)
    #Legend
    ax.get_legend().remove()
    #Axis setting
    plt.setp(ax.get_xticklabels(), rotation=70,
             horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
    if ax_i==0:
        plt.setp(ax, xlabel='', ylabel='Out-of-sample '+r'$R^2$')
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
sns.despine()
plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
##Additional facet label
##One sex title centered over each consecutive group of len(tempD3) (= number of cohorts) facets
tempL = []
for ax in axes.flat:
    tempL.append(ax.get_position().bounds)
for sex_i, sex in enumerate(tempD1.keys()):
    tempL1 = tempL[sex_i*len(tempD3)]#Left-most facet of this sex
    tempL2 = tempL[(sex_i+1)*len(tempD3)-1]#Right-most facet of this sex
    fig.text(x=(tempL1[0]+(tempL2[0]+tempL2[2]))/2,
             y=tempL1[1]+tempL1[3]*1.11,#Minor manual adjustment
             s=sex, fontsize='large', verticalalignment='bottom', horizontalalignment='center')
##Save
fileDir = './ExportFigures/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'R2-comparison-FemaleMale.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

#### cf. Add full version

In [None]:
#Reproduce the previous R2 scores of full version models
#Reads the exported MetBMI predictions of the full-panel model and recomputes one R2 score per
#distinct value of the 'Testing' column (presumably the held-out set of each model), on the log scale.
yvar_model = 'MetBMI'
yvar = 'BMI'
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
tempD = {}
for sex in ['BothSex']:
    #Import the predictions
    fileName = yvar_model+'-'+sex+'.tsv'
    tempDF = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
              .set_index('public_client_id'))
    ##Change column names
    tempDF.columns = tempDF.columns.str.replace('Base', '')
    
    #Calculate (out-of-sample) R2
    obs_col = 'log_'+yvar
    pred_col = 'log_'+yvar_model
    tempL1 = []#For the coefficient of determination R^2
    for model_n in tempDF['Testing'].unique().tolist():
        in_set = tempDF['Testing']==model_n
        tempL1.append(r2_score(tempDF.loc[in_set, obs_col], tempDF.loc[in_set, pred_col],
                               sample_weight=None))
    tempD[sex] = tempL1
    
    #Check
    r2_mean = np.mean(tempL1)
    r2_sd = np.std(tempL1, ddof=1)#Sample standard deviation
    print(sex+' model')
    print(' - Out-of-sample R2 [Mean ± SD]:', r2_mean, '±', r2_sd)
    print(' - Out-of-sample R2 [Mean ± SEM]:', r2_mean, '±', r2_sd/np.sqrt(len(tempL1)))
    print(' - Observed vs. predicted log_'+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempDF[obs_col], tempDF[pred_col]))
    print(' - Observed vs. predicted '+yvar+': (Pearson\'s r, P) =',
          stats.pearsonr(tempDF[yvar], tempDF[yvar_model]))
    display(tempDF.describe())

#Keep the scores of the sex-mixed full-version model for the comparison plot
metBMI_a_B_R2_full = tempD['BothSex']

In [None]:
#Compare the out-of-sample R2 of the restricted MetBMI model against the other models within each cohort
#(Arivale: StandBMI and full-version MetBMI; TwinsUK: StandBMI only): Welch's t-test,
#Benjamini–Hochberg adjustment, TSV export, and a bar/strip plot with one facet per cohort.
tempD1 = {'Standard measures':standBMI_a_B_R2,
          'Metabolomics (full)':metBMI_a_B_R2_full,
          'Metabolomics (restricted)':metBMI_a_B_R2}
tempD2 = {'Standard measures':standBMI_t_B_R2, 'Metabolomics (restricted)':metBMI_t_B_R2}
tempD1 = {'Arivale':tempD1, 'TwinsUK':tempD2}#Cohort -> category -> R2 scores
tempD2 = {'Standard measures':'0.5',
          'Metabolomics (full)':'b',
          'Metabolomics (restricted)':'b'}
tempD3 = {'Standard measures':'0.5', 'Metabolomics (restricted)':'b'}
tempD2 = {'Arivale':tempD2, 'TwinsUK':tempD3}#Cohort -> category -> bar color (also defines the category order)
tempD3 = {'Arivale':plt.get_cmap('tab20')(1), 'TwinsUK':plt.get_cmap('tab20')(3)}#Cohort -> facet strip color

#Prepare DFs
tempD4 = {}#Cohort -> DF of R2 scores (one column per category)
tempD5 = {}#Cohort -> DF of test results
for cohort in tempD1.keys():
    #Prepare DF
    tempDF = pd.DataFrame(tempD1[cohort])
    tempD4[cohort] = tempDF
    print(cohort)
    display(tempDF.describe())
    
    #Statistical tests (modified due to irregular order)
    ##Here the LAST category (restricted MetBMI) is the contrast, tested against every other category
    tempD = tempD2[cohort]
    contrast = list(tempD.keys())[-1]
    tempDF1 = pd.DataFrame(columns=['Control', 'Contrast', 'Control_N', 'contrast_N', 'DoF', 'tStat', 'Pval'])
    for control in tempD.keys():
        if control!=contrast:
            tempS1 = tempDF[control]
            tempS2 = tempDF[contrast]
            #Two-sided Welch's t-test
            tstat, pval, dof = weightstats.ttest_ind(tempS1, tempS2,
                                                     alternative='two-sided', usevar='unequal')
            size1 = len(tempS1)
            size2 = len(tempS2)
            tempDF1.loc[contrast+'-vs-'+control] = [control, contrast, size1, size2, dof, tstat, pval]
    ##P-value adjustment (within cohort) by using Benjamini–Hochberg method
    tempDF1['AdjPval_cohort'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                                    is_sorted=False, returnsorted=False)[1]
    tempDF1['Cohort'] = cohort
    tempD5[cohort] = tempDF1
tempDF1 = pd.concat(list(tempD5.values()), axis=0)
##P-value adjustment (across all tests) by using Benjamini–Hochberg method
tempDF1['AdjPval_all'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                             is_sorted=False, returnsorted=False)[1]
tempDF1.index.rename('ComparisonLabel', inplace=True)
tempDF1 = tempDF1.reset_index().set_index(['Cohort', 'ComparisonLabel'])
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'R2-comparison-BothSex_with-full-ver.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot (modified due to irregular order)
axis_ymin = -0.175
axis_ymax = 1.025
ymin = 0.0
ymax = 0.8
yinter = 0.2
aline_ymin = 0.8#Baseline height (data coordinates) of the first annotation bracket
aline_yinter = 0.1#Vertical spacing between stacked annotation brackets
sns.set(style='ticks', font='Arial', context='talk')
##Facet widths follow the number of categories per cohort (3 vs. 2); x-axes are therefore not shared
fig, axes = plt.subplots(nrows=1, ncols=len(tempD4),
                         figsize=(1.25+1.5+1, 4), sharex=False, sharey=True,
                         gridspec_kw={'width_ratios':[3, 2]})
for ax_i, ax in enumerate(axes.flat):
    cohort = list(tempD4.keys())[ax_i]
    tempDF = tempD4[cohort]
    tempDF = tempDF.melt(var_name='Category', value_name='R2', value_vars=tempDF.columns.tolist())
    tempD = tempD2[cohort]
    sns.barplot(data=tempDF, y='R2', x='Category', order=tempD.keys(),
                hue='Category', hue_order=tempD.keys(), dodge=False, palette=tempD,
                ci=95, capsize=0.4, errwidth=1.5, errcolor='black', edgecolor='black', ax=ax)
    sns.stripplot(data=tempDF, y='R2', x='Category', order=tempD.keys(),
                  hue='Category', hue_order=tempD.keys(), dodge=False, jitter=0.3,
                  size=5, edgecolor='black', color='gray', linewidth=1, alpha=0.4, ax=ax)
    #P-value annotation
    tempDF2 = tempDF1.loc[cohort]#MultiIndex
    tempDF2 = tempDF2.iloc[::-1]#Inverse order
    for row_i in range(len(tempDF2)):
        #Control
        group_0 = tempDF2['Control'].iloc[row_i]
        xcoord_0 = list(tempD.keys()).index(group_0)
        #Contrast
        group_1 = tempDF2['Contrast'].iloc[row_i]
        xcoord_1 = list(tempD.keys()).index(group_1)
        #Standard point of marker
        xcoord = (xcoord_0+xcoord_1)/2
        ycoord = aline_ymin + aline_yinter*row_i
        #Add annotation lines
        aline_offset = yinter/10
        aline_length = yinter/10 + aline_offset
        ax.plot([xcoord_0, xcoord_0, xcoord_1, xcoord_1],
                [ycoord+aline_offset, ycoord+aline_length, ycoord+aline_length, ycoord+aline_offset],
                lw=1.5, c='k')
        #Retrieve P-value
        pval = tempDF2['AdjPval_all'].iloc[row_i]
        if pval<0.001:
            label = '***'
        elif pval<0.01:
            label = '**'
        elif pval<0.05:
            label = '*'
        else:
            ##Go through str(): Decimal(float) keeps the exact binary expansion (e.g., 0.285 ->
            ##0.28499...), which would make ROUND_HALF_UP round the displayed value down
            pval_text = str(Decimal(str(pval)).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP))
            label = r'$P$'+' = '+pval_text
        #Add annotation text
        if label in ['***', '**', '*']:
            text_offset = yinter/25
            text_size = 'medium'
        else:
            text_offset = yinter/5
            text_size = 'x-small'
        ax.annotate(label, xy=(xcoord, ycoord+text_offset),
                    horizontalalignment='center', verticalalignment='bottom',
                    fontsize=text_size, color='k')
    #Facet label
    ax.set_title(cohort, {'fontsize':'medium'})
    xoff = 0.025
    yoff = 0.01
    ##Colored strip behind the title, drawn in axes coordinates just above the plotting area
    rect = plt.Rectangle((xoff, 1+yoff), 1-xoff, 0.1,#Manual adjustment
                         transform=ax.transAxes, facecolor=tempD3[cohort],
                         clip_on=False, linewidth=0, zorder=0.5)
    ax.add_patch(rect)
    #Legend
    ax.get_legend().remove()
    #Axis setting
    plt.setp(ax.get_xticklabels(), rotation=70,
             horizontalalignment='right', verticalalignment='center', rotation_mode='anchor')
    if ax_i==0:
        plt.setp(ax, xlabel='', ylabel='Out-of-sample '+r'$R^2$')
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
sns.despine()
plt.setp(axes, ylim=(axis_ymin, axis_ymax), yticks=np.arange(ymin, ymax+yinter/10, yinter))
##Save
fileDir = './ExportFigures/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'R2-comparison-BothSex_with-full-ver.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 5-2. Measured vs. Predicted

> In this version, the P-values from Pearson's correlation tests are adjusted for multiple testing (Benjamini–Hochberg method) across all categories and cohorts.  

In [None]:
#Measured vs. predicted BMI for the sex-mixed StandBMI and MetBMI models in both cohorts:
#Pearson's correlation per category, Benjamini–Hochberg adjustment across the categories, TSV export,
#and one regression-plot facet per category.
#standBMI_[a/t]_B and metBMI_[a/t]_B come from earlier cells; each is indexed below by the predicted
#column ('StandBMI' or 'MetBMI') and by the measured 'BMI' column.
tempD1 = {'Arivale: Standard measures':standBMI_a_B,
          'Arivale: Metabolomics':metBMI_a_B,
          'TwinsUK: Standard measures':standBMI_t_B,
          'TwinsUK: Metabolomics':metBMI_t_B}#Category -> DF
tempD2 = {'Arivale: Standard measures':'StandBMI',
          'Arivale: Metabolomics':'MetBMI',
          'TwinsUK: Standard measures':'StandBMI',
          'TwinsUK: Metabolomics':'MetBMI'}#Category -> column of predicted values
tempD3 = {'Arivale: Standard measures':'0.5',
          'Arivale: Metabolomics':'b',
          'TwinsUK: Standard measures':'0.5',
          'TwinsUK: Metabolomics':'b'}#Category -> plot color
tempD4 = {'Arivale: Standard measures':plt.get_cmap('tab20')(1),
          'Arivale: Metabolomics':plt.get_cmap('tab20')(1),
          'TwinsUK: Standard measures':plt.get_cmap('tab20')(3),
          'TwinsUK: Metabolomics':plt.get_cmap('tab20')(3)}#Category -> facet strip color
yvar='BMI'
axis_label = 'BMI [kg m'+r'$^{-2}$'+']'

#Statistical tests
tempDF1 = pd.DataFrame(columns=['N', 'DoF', 'Pearson_r', 'Pval'])
for category in tempD1.keys():
    tempDF = tempD1[category]
    xvar = tempD2[category]
    #Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF[xvar], tempDF[yvar])
    size = len(tempDF)
    dof = size - 2
    tempDF1.loc[category] = [size, dof, pearson_r, pval]
##P-value adjustment by using Benjamini–Hochberg method
tempDF1['AdjPval'] = multi.multipletests(tempDF1['Pval'], alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
tempDF1.index.rename('Category', inplace=True)
tempDF1['N'] = tempDF1['N'].astype('int64')#Otherwise, float64!
tempDF1['DoF'] = tempDF1['DoF'].astype('int64')#Otherwise, float64!
display(tempDF1)
##Save
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'regplot-comparison-BothSex.tsv'
tempDF1.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Plot
sns.set(style='ticks', font='Arial', context='talk')
fig, axes = plt.subplots(nrows=1, ncols=len(tempD3),
                         figsize=(3.5*len(tempD3), 3.5+1), sharex=True, sharey=True)
axis_xymin = 12.5
axis_xymax = 102.5
xymin = 15
xymax = 90
xyinter = 15
#Set axis range first; otherwise, regression line can be truncated differently
plt.setp(axes, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
plt.setp(axes, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
for ax_i, ax in enumerate(axes.flat):
    category = list(tempD1.keys())[ax_i]
    #Prepare DF
    tempDF = tempD1[category]
    xvar = tempD2[category]
    #Scatterplot with regression line
    sns.regplot(data=tempDF, x=xvar, y=yvar, color=tempD3[category],
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.2, 'edgecolor':'k', 's':25}, ax=ax)
    #Draw Y=X as reference
    ax.plot([axis_xymin, axis_xymax], [axis_xymin, axis_xymax],
            color='black', linestyle=(0, (1, 2)), zorder=0)
    #Annotate Pearson's correlation
    ##The annotated P-value is the adjusted one (AdjPval), not the raw Pval
    pearson_r = tempDF1['Pearson_r'].loc[category]
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval = tempDF1['AdjPval'].loc[category]
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        ##Split on 'E' (not 'E-'): a P-value that rounds up to 1.000E+0 has no 'E-' in its text
        significand, exponent = pval_text.split(sep='E')
        exponent = -int(exponent)#Magnitude of the negative power of ten (0 for 'E+0')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over (e.g., 9.96 -> 10.0); renormalize
            significand = '1.0'
            exponent = exponent-1
        if exponent>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif exponent>0:
            #Plain decimal notation for 0.01 <= P < 1
            pval_text = '0.'+'0'*(exponent-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='small', color='k')
    #Facet label
    ax.set_title(category.replace(': ', '\n'), {'fontsize':'large'})
    xoff = 0.01
    yoff = 0.01
    ##Colored strip behind the two-line title, drawn in axes coordinates just above the plotting area
    rect = plt.Rectangle((xoff, 1+yoff), 1-xoff, 0.21,#Manual adjustment
                         transform=ax.transAxes, facecolor=tempD4[category],
                         clip_on=False, linewidth=0, zorder=0.5)
    ax.add_patch(rect)
    #Axis setting
    ##Only the left-most facet keeps the y-axis title and tick labels
    if ax_i%len(tempD3)==0:
        plt.setp(ax, xlabel='', ylabel='Measured '+axis_label)
    else:
        plt.setp(ax, xlabel='', ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
sns.despine()
#Generate common x-axis title
fig.tight_layout(pad=0.75)
fig.text(x=0.53, y=0.0175,#Manual adjustment
         s='Predicted '+axis_label, fontsize='medium',
         verticalalignment='top', horizontalalignment='center')
##Save
fileDir = './ExportFigures/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
fileName = 'regplot-comparison-BothSex.tif'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                  pil_kwargs={'compression':'tiff_lzw'})
plt.show()

### 5-3. MetBMI variables between the full and restricted versions

In [None]:
#Load the beta-coefficient tables of the full-version MetBMI models (one per sex) and report how many
#variables were retained; 'nZeros' is compared against 10 and 0 (the messages below refer to 10 models).
print('Full version')
yvar_model = 'MetBMI'
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
tempD = {}
for sex in ['Female', 'Male', 'BothSex']:
    #Import the cleaned beta-coefficients of full version
    fileName = yvar_model+'-'+sex+'-LASSObcoefs.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t')
    tempDF = tempDF.set_index('Variable').drop(index=['Intercept'])
    tempD[sex] = tempDF
    
    #Check
    n_variables = len(tempDF)
    print(sex+':')
    print(' - Variables:', n_variables)
    ##Retained in at least one model: nZeros != 10
    tempDF1 = tempDF.loc[tempDF['nZeros']!=10]
    n_nonzero = len(tempDF1)
    print(' - Variables with non-zero beta-coefficient:', n_nonzero,
          '(', n_nonzero/n_variables*100, '%)')
    ##Retained in every model: nZeros == 0
    tempDF1 = tempDF.loc[tempDF['nZeros']==0]
    n_robust = len(tempDF1)
    print(' - Variables with non-zero beta-coefficient in all 10 models:', n_robust,
          '(', n_robust/n_variables*100, '%)')

bcoefDF_full_F, bcoefDF_full_M, bcoefDF_full_B = tempD['Female'], tempD['Male'], tempD['BothSex']

In [None]:
#Load the beta-coefficient tables of the restricted-version MetBMI models (one per sex) and report how many
#variables were retained; 'nZeros' is compared against 10 and 0 (the messages below refer to 10 models).
print('Restricted version')
yvar_model = 'MetBMI'
fileDir = './ExportData/'
ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
tempD = {}
for sex in ['Female', 'Male', 'BothSex']:
    #Import the cleaned beta-coefficients of restricted version
    fileName = yvar_model+'-'+sex+'-LASSObcoefs.tsv'
    tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t')
    tempDF = tempDF.set_index('Variable').drop(index=['Intercept'])
    tempD[sex] = tempDF
    
    #Check
    n_variables = len(tempDF)
    print(sex+':')
    print(' - Variables:', n_variables)
    ##Retained in at least one model: nZeros != 10
    tempDF1 = tempDF.loc[tempDF['nZeros']!=10]
    n_nonzero = len(tempDF1)
    print(' - Variables with non-zero beta-coefficient:', n_nonzero,
          '(', n_nonzero/n_variables*100, '%)')
    ##Retained in every model: nZeros == 0
    tempDF1 = tempDF.loc[tempDF['nZeros']==0]
    n_robust = len(tempDF1)
    print(' - Variables with non-zero beta-coefficient in all 10 models:', n_robust,
          '(', n_robust/n_variables*100, '%)')

bcoefDF_restricted_F, bcoefDF_restricted_M, bcoefDF_restricted_B = tempD['Female'], tempD['Male'], tempD['BothSex']

In [None]:
#Qualitative comparison for the variables with non-zero beta-coefficient in all 10 models
#For each sex, the robustly retained variable sets of the full and restricted versions are drawn
#as a venn diagram; the counts limited to the variables present in both versions are appended
#in square brackets wherever they differ from the plain counts.
bcoefD = {
    'Female':{'Full version':bcoefDF_full_F, 'Restricted version':bcoefDF_restricted_F},
    'Male':{'Full version':bcoefDF_full_M, 'Restricted version':bcoefDF_restricted_M},
    'BothSex':{'Full version':bcoefDF_full_B, 'Restricted version':bcoefDF_restricted_B},
}
colorD = {'Full version':'tab:blue', 'Restricted version':'tab:orange'}
analyte = 'metabolites'
title = 'Robustly retained metabolites'
#Placement of the per-version annotation boxes (same order as colorD)
x_coord = [0.45, 0.55]
y_coord = [0.6, 0.4]
h_align = ['right', 'left']
v_align = ['bottom', 'top']

for sex, versionD in bcoefD.items():
    #Prepare the variable sets
    print(sex)
    print(' - Variables with non-zero beta-coefficient in all 10 models')
    robustD = {}
    for version, bcoefDF in versionD.items():
        #Extract robust beta-coefficient: no zeros in all 10 models
        robustDF = bcoefDF.loc[bcoefDF['nZeros']==0]
        print('   - '+version+':', len(robustDF), 'per', len(bcoefDF), analyte+' (',
              len(robustDF)/len(bcoefDF)*100, '%)')
        robustD[version] = set(robustDF.index.tolist())
    
    #Check common region (1,1)
    commonS = set.intersection(*robustD.values())
    print('    -> Common (1,1):', len(commonS))
    display(commonS)
    
    #Prepare the reference variable set
    ##Robust variables limited to those present in the tables of every version
    print(' - Cf. Against the shared '+analyte+' (i.e., derived from the restricted panel)')
    sharedS = set.intersection(*(set(bcoefDF.index.tolist()) for bcoefDF in versionD.values()))
    refD = {}
    for version, robustS in robustD.items():
        refD[version] = robustS & sharedS
        print('   - '+version+':', len(refD[version]), 'per common '+analyte)
    
    #Dummy venn diagram for the reference variable set to save the numbers in each region
    sns.set(style='ticks', font='Arial', context='talk')
    fig, ax = plt.subplots(figsize=(4, 4))
    venn(refD, fmt='{size:,}', cmap=list(colorD.values()), legend_loc=None, ax=ax)
    plt.setp(ax, ylim=(0.2, 0.8))#Otherwise, weird space...
    ##Add legend annotation
    for i, version in enumerate(colorD):
        ax.text(x_coord[i], y_coord[i], version+'\n('+f'{len(refD[version]):,}'+' '+analyte+')',
                fontsize='small', multialignment='center',
                horizontalalignment=h_align[i], verticalalignment=v_align[i],
                bbox={'boxstyle':'round', 'facecolor':colorD[version], 'pad':0.2, 'alpha':0.5})
    ax.set_title(title, fontsize='medium')
    plt.show()
    ##Save the numbers in each region (the region labels come first, before the annotation boxes)
    refCountL = [text.get_text() for text in ax.texts]
    
    #Venn diagram
    sns.set(style='ticks', font='Arial', context='talk')
    fig, ax = plt.subplots(figsize=(4, 4))
    venn(robustD, fmt='{size:,}', cmap=list(colorD.values()), legend_loc=None, ax=ax)
    plt.setp(ax, ylim=(0.2, 0.8))#Otherwise, weird space...
    ##Add reference numbers (done before the annotation boxes are added to ax.texts)
    for text, count_ref in zip(ax.texts, refCountL):
        count = text.get_text()
        if count!=count_ref:
            text.set_text(count+' ['+count_ref+']')
    ##Add legend annotation
    for i, version in enumerate(colorD):
        total = f'{len(robustD[version]):,}'
        total_ref = f'{len(refD[version]):,}'
        bracket = ' ['+total_ref+'] ' if total!=total_ref else ' '
        ax.text(x_coord[i], y_coord[i], version+'\n('+total+bracket+analyte+')',
                fontsize='small', multialignment='center',
                horizontalalignment=h_align[i], verticalalignment=v_align[i],
                bbox={'boxstyle':'round', 'facecolor':colorD[version], 'pad':0.2, 'alpha':0.5})
    ax.set_title(title, fontsize='medium')
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
    fileName = 'venn-'+analyte+'-'+sex+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()

In [None]:
#Quantitative comparison for the variables with non-zero beta-coefficient in all 10 models
#For each sex, the mean beta-coefficients of the full (x) and restricted (y) versions are plotted
#against each other for the shared variables that are robustly retained in at least one version,
#annotated with Pearson's correlation and saved as a TIFF figure.
tempD1 = {'Female':bcoefDF_full_F, 'Male':bcoefDF_full_M, 'BothSex':bcoefDF_full_B}
tempD2 = {'Female':bcoefDF_restricted_F, 'Male':bcoefDF_restricted_M, 'BothSex':bcoefDF_restricted_B}
xvar = 'Full version'
yvar = 'Restricted version'
vartype_text = 'Robustly retained in'
analyte = 'metabolites'

for sex in tempD1.keys():
    tempDF1 = tempD1[sex]
    tempDF2 = tempD2[sex]
    
    #Prepare plot DF
    ##Use .rename() (returns a new Series): assigning .name in place can relabel the column
    ##Series cached inside the source DF (pandas without Copy-on-Write), leaking into other cells
    tempS1 = tempDF1['Mean'].rename(xvar)
    tempS2 = tempDF2['Mean'].rename(yvar)
    ##Merge while taking common analytes
    tempDF = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='inner')
    ##Variables with non-zero beta-coefficients across all models in full or restricted version
    tempDF1 = tempDF1.loc[tempDF1.index.isin(tempDF.index.tolist())]
    tempDF1 = tempDF1.loc[tempDF1['nZeros']==0]
    tempS1 = set(tempDF1.index.tolist())
    tempDF2 = tempDF2.loc[tempDF2.index.isin(tempDF.index.tolist())]
    tempDF2 = tempDF2.loc[tempDF2['nZeros']==0]
    tempS2 = set(tempDF2.index.tolist())
    ##Boolean mask instead of .loc[set]: set indexers raise TypeError in pandas>=2.0,
    ##and the mask keeps the (deterministic) row order of the merged DF
    tempDF = tempDF.loc[tempDF.index.isin(list(tempS1 | tempS2))]
    ##Variable type
    tempS3 = tempS1 & tempS2
    tempS1 = tempS1 - tempS3
    tempS2 = tempS2 - tempS3
    ##Sorted lists (not sets) as index to keep the label Series reproducible between runs
    tempS1 = pd.Series(np.repeat(xvar.lower(), len(tempS1)), index=sorted(tempS1), name=vartype_text)
    tempS2 = pd.Series(np.repeat(yvar.lower(), len(tempS2)), index=sorted(tempS2), name=vartype_text)
    tempS3 = pd.Series(np.repeat('both versions', len(tempS3)), index=sorted(tempS3), name=vartype_text)
    tempS = pd.concat([tempS1, tempS2, tempS3], axis=0)
    tempDF = pd.merge(tempDF, tempS, left_index=True, right_index=True, how='left')
    print(sex)
    display(tempDF.describe(include='all'))
    display(tempDF[vartype_text].value_counts())
    
    #Scatterplot with regression line
    sns.set(style='ticks', font='Arial', context='talk')
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(3.5, 3.5))#To set axis range first
    axis_xymin = -0.0275
    axis_xymax = 0.0525
    xymin = -0.02
    xymax = 0.04
    xyinter = 0.02
    plt.setp(ax, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
    plt.setp(ax, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
    sns.regplot(data=tempDF, x=xvar, y=yvar, color='k',
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.4, 'edgecolor':'k', 's':50}, ax=ax)
    #Highlight the robustly retained analytes in both versions
    tempDF1 = tempDF.loc[tempDF[vartype_text]=='both versions']
    sns.scatterplot(data=tempDF1, x=xvar, y=yvar, color='red',#Darker due to overlapping
                    marker='o', alpha=0.4, edgecolor='k', s=50, ax=ax)
    sns.despine()
    #Draw Y=X=0 as reference
    ax.axvline(x=0, **{'linestyle':(0, (1, 2)), 'color':'black', 'zorder':0})
    ax.axhline(y=0, **{'linestyle':(0, (1, 2)), 'color':'black', 'zorder':0})
    ##Annotate Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF[xvar], tempDF[yvar])
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        ##Split on 'E' (not 'E-'): a P-value that rounds up to 1 is formatted as '1.000E+0'
        significand, exponent = pval_text.split(sep='E')
        exponent = -int(exponent)#Magnitude of the negative power of ten (0 for '1.000E+0')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over to the next power of ten
            significand = '1.0'
            exponent = exponent-1
        if exponent>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif exponent>0:
            pval_text = '0.'+'0'*(exponent-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='small', color='k')
    ##Axis title -> Using facet title for the common part
    plt.setp(ax, xlabel=xvar, ylabel=yvar)
    plt.setp(ax, title='Mean of '+r'$\beta$'+'-coefficients')
    ##Legend for the highlighting markers
    legend_label = vartype_text.replace('ed in','ed\nin ')+'both versions \n('+f'{len(tempDF1):,}'+' '+analyte+')'
    legend = mlines.Line2D([], [], color='white',
                           marker='o', markerfacecolor='r', markeredgecolor='k',
                           markersize=10, label=legend_label)
    ax.legend(handles=[legend], fontsize='x-small', title='', title_fontsize='medium',
              bbox_to_anchor=(1, 0.025), loc='lower right', borderaxespad=0.0,
              handlelength=1.0, handletextpad=0.4, frameon=True)
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
    fileName = 'bcoef-difference_non-zero-in-all-'+sex+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    print('')

### 5-4. MetBMI predictions between the full and restricted versions

In [None]:
#Import predictions by the full version
yvar_model = 'MetBMI'
yvar = 'BMI'
#Location and common prefix of the prediction tables (full version)
fileDir = './ExportData/'
ipynbName = '220801_Multiomics-BMI-NatMed1stRevision_BMI-baseline-LASSO_'
predD = {}
for sex in ('Female', 'Male', 'BothSex'):
    filePath = fileDir+ipynbName+yvar_model+'-'+sex+'.tsv'
    ##Read the IDs as strings, index by them, and strip 'Base' from the column names
    predDF = (pd.read_csv(filePath, sep='\t', dtype={'public_client_id':str})
              .set_index('public_client_id')
              .rename(columns=lambda colName: colName.replace('Base', '')))
    predD[sex] = predDF
    
    #Check
    print(sex+':')
    display(predDF)
    ##Observed vs. predicted, first for the 'log_'-prefixed columns and then for the plain ones
    for prefix in ('log_', ''):
        print(' - Observed vs. predicted '+prefix+yvar+': (Pearson\'s r, P) =',
              stats.pearsonr(predDF[prefix+yvar], predDF[prefix+yvar_model]))
    display(predDF.describe())

#Keep one prediction table per sex for the comparison cell below
metBMI_a_F_full = predD['Female']
metBMI_a_M_full = predD['Male']
metBMI_a_B_full = predD['BothSex']

In [None]:
#Prediction comparison between the full and restricted versions
#For each sex, the MetBMI predictions of the full (x) and restricted (y) versions are plotted
#against each other for the shared index entries, annotated with Pearson's correlation,
#and saved as a TIFF figure.
tempD1 = {'Female':metBMI_a_F_full, 'Male':metBMI_a_M_full, 'BothSex':metBMI_a_B_full}
tempD2 = {'Female':metBMI_a_F, 'Male':metBMI_a_M, 'BothSex':metBMI_a_B}
yvar_model = 'MetBMI'
xvar = 'Full version'
yvar = 'Restricted version'
model_color = 'b'
title_label = yvar_model+' [kg m'+r'$^{-2}$'+']'

for sex in tempD1.keys():
    tempDF1 = tempD1[sex]
    tempDF2 = tempD2[sex]
    
    #Prepare plot DF
    ##Use .rename() (returns a new Series): assigning .name in place can relabel the column
    ##Series cached inside the shared prediction DFs (pandas without Copy-on-Write)
    tempS1 = tempDF1[yvar_model].rename(xvar)
    tempS2 = tempDF2[yvar_model].rename(yvar)
    ##Inner merge -> only the index entries present in both versions
    tempDF = pd.merge(tempS1, tempS2, left_index=True, right_index=True, how='inner')
    print(sex)
    display(tempDF.describe(include='all'))
    
    #Scatterplot with regression line
    sns.set(style='ticks', font='Arial', context='talk')
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(3.5, 3.5))#To set axis range first
    axis_xymin = 12.5
    axis_xymax = 47.5
    xymin = 20
    xymax = 40
    xyinter = 10
    plt.setp(ax, xlim=(axis_xymin, axis_xymax), xticks=np.arange(xymin, xymax+xyinter/10, xyinter))
    plt.setp(ax, ylim=(axis_xymin, axis_xymax), yticks=np.arange(xymin, xymax+xyinter/10, xyinter))
    sns.regplot(data=tempDF, x=xvar, y=yvar, color=model_color,
                scatter=True, fit_reg=True, ci=95, truncate=False, marker='o',
                scatter_kws={'alpha':0.4, 'edgecolor':'k', 's':50}, ax=ax)
    sns.despine()
    #Draw Y=X as reference
    ax.plot([axis_xymin, axis_xymax], [axis_xymin, axis_xymax],
            color='black', linestyle=(0, (1, 2)), zorder=0)
    ##Annotate Pearson's correlation
    pearson_r, pval = stats.pearsonr(tempDF[xvar], tempDF[yvar])
    r_text = str(Decimal(str(pearson_r)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    below_limit = 0#Initialize
    if pval==1.0:
        pval_text = '1.0'
    else:
        if pval==0.0:#Due to smaller than the float minimum
            pval = sys.float_info.min
            print('P-value was smaller than the float minimum:', pval)
            below_limit = 1
        pval_text = f'{Decimal(str(pval)):.3E}'#Take more digits because rounding is bad here
        ##Split on 'E' (not 'E-'): a P-value that rounds up to 1 is formatted as '1.000E+0'
        significand, exponent = pval_text.split(sep='E')
        exponent = -int(exponent)#Magnitude of the negative power of ten (0 for '1.000E+0')
        significand = str(Decimal(significand).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        if significand=='10.0':#Rounding carried over to the next power of ten
            significand = '1.0'
            exponent = exponent-1
        if exponent>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $'...
        elif exponent>0:
            pval_text = '0.'+'0'*(exponent-1)+significand.replace('.', '')
        else:
            pval_text = significand
    if below_limit==1:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' < '+pval_text
    else:
        text = 'Pearson\'s '+r'$r$'+' = '+r_text+'\n'+r'$P$'+' = '+pval_text
    ax.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction',
                horizontalalignment='left', verticalalignment='top',
                multialignment='left', fontsize='small', color='k')
    ##Axis title -> Using facet title for the common part
    plt.setp(ax, xlabel=xvar, ylabel=yvar)
    plt.setp(ax, title=title_label)
    ##Save
    fileDir = './ExportFigures/'
    ipynbName = '220918_Multiomics-BMI-NatMed1stRevision_TwinsUK-BMI-LASSO-ver3_'
    fileName = yvar_model+'-difference-'+sex+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    print('')

# — End of this notebook —