# Multiomics BMI Paper — Longitudinal Change of Biological BMI during the Arivale Program

***by Kengo Watanabe***  

This Jupyter Notebook (with Python 3 kernel) estimates longitudinal changes of the measured and omics-inferred BMIs during the Arivale program, using linear mixed models (LMMs) with random effects for each participant.  

Input files:  
* Arivale longitudinal BMI predictions: 220805_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LASSO_\[MetBMI/ProtBMI/ChemBMI\]-\[BothSex/FemaleMale\].tsv  
* Arivale time-series BMI and blood omics: 210104_Biological-BMI-paper_data-cleaning-BMI-omics_time-series-\[bmiDF/combiDF\]-without-imputation_final-cohort.tsv  

Output figures and tables:  
* Figure 5  

Original notebook (memo for my future tracing):  
* dalek:\[JupyterLab HOME\]/220621_Multiomics-BMI-NatMedRevision/220806_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LMM-ver2.ipynb  

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#For Arial font
#!conda install -c conda-forge -y mscorefonts
##-> The below was also needed in matplotlib 3.4.2
#import shutil
#import matplotlib
#shutil.rmtree(matplotlib.get_cachedir())
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import time

from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import matplotlib.lines as mlines

!conda list

# packages in environment at /opt/conda/envs/arivale-py3:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
analytics                 0.1                      pypi_0    pypi
argon2-cffi               21.1.0           py39h3811e60_0    conda-forge
arivale-data-interface    0.1.0                    pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
atk-1.0                   2.36.0               h3371d22_4    conda-forge
attrs                     21.2.0             pyhd8ed1ab_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports                 1.0                        py_2    conda-forge
backports.functools_lru_cache 1.6.4              pyhd8ed1ab_0    conda-forge
biopython                 1.79             py39h3811e60_0    conda-forge
bleach 

## 1. Metabolomics

In [None]:
#Dependent variable of this section and the color used for it in the plots
yvar, yvar_color = 'MetBMI', 'b'

### 1-1. Prepare the time-series DF

In [None]:
#Load the time-series biological BMI tables (one per model type) exported as TSV
lasso_dir = './ExportData/'
lasso_prefix = '220805_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LASSO_'
suffix_by_model = {'Sex-mixed '+yvar+' model':'BothSex', 'Sex-stratified '+yvar+' model':'FemaleMale'}
ts_by_model = {}
for model_sex, sex_suffix in suffix_by_model.items():
    ts_path = lasso_dir + lasso_prefix + yvar + '-' + sex_suffix + '.tsv'
    #Participant IDs are read as strings; the KeyIndex column becomes the row index
    ts_DF = pd.read_csv(ts_path, sep='\t', dtype={'public_client_id': str}).set_index('KeyIndex')
    ts_by_model[model_sex] = ts_DF

    #Report what was loaded
    print(model_sex)
    display(ts_DF)
    print(' -> Unique ID:', len(ts_DF['public_client_id'].unique()))
    print('')

#Names used by the following cells
tsDF_BS = ts_by_model['Sex-mixed '+yvar+' model']
tsDF_FM = ts_by_model['Sex-stratified '+yvar+' model']

### 1-2. Prepare DFs for LMM with the linear regression spline for time

In [None]:
#Distribution of the measurement days in the sex-mixed table,
#with dashed lines every 6 months and the period after 18 months shaded
days_per_month = 365.25/12
sns.set(style='ticks', font='Arial', context='notebook')
plt.figure(figsize=(4, 3))
sns.distplot(tsDF_BS['days_in_program'], color=yvar_color)
sns.despine()
for knot_month in (0, 6, 12, 18):
    plt.axvline(x=days_per_month*knot_month, linestyle='--', color='gray')
ax = plt.gca()
#Shade from 18 months to the current right limit of the x axis
ax.axvspan(xmin=days_per_month*18, xmax=ax.get_xlim()[1], facecolor='gray', alpha=0.2)
plt.ylabel('Density')
plt.xlabel('Days in program')
plt.show()

In [None]:
#Settings of the longitudinal analysis
##Kept as notebook-level variables because the later prediction and plotting cells reuse them
month_threshold = 18#Length of the analyzed period [months]
nknots = 3#Number of equally spaced knots; the last knot is the end of the analyzed period
knot_days = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
raw_cols = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
z_cols = ['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']

#Select the cohort for the longitudinal analysis
##This selection does not depend on the model; hence, the files are read once here (not once per model)
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
cohort_sets = []
for fileName in ['time-series-bmiDF-without-imputation_final-cohort.tsv',#BMI
                 'time-series-combiDF-without-imputation_final-cohort.tsv']:#Omics
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= 365.25/12*month_threshold]
    tempS = tempDF1['public_client_id'].value_counts()
    ##Participants who have 2 or more measurements within the target period
    cohort_sets.append(set(tempS.loc[tempS>=2].index.tolist()))
##Select participants who have 2 or more measurements in both BMI and omics
cohortL = list(cohort_sets[0] & cohort_sets[1])

tempD = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
tempD1 = {}
tempD2 = {}
for model_sex in tempD.keys():
    print(model_sex)
    tempDF = tempD[model_sex]
    ##Drop the derived columns left by a previous run of this cell (the cell overwrites its inputs below);
    ##otherwise, the merge below would produce suffixed duplicates of the Z-score columns
    tempDF = tempDF.drop(columns=['DaysP2', 'DaysP3']+z_cols, errors='ignore')
    print('Time-series DF nrows before filtering:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select measurements for the longitudinal analysis
    tempDF = tempDF.loc[tempDF['days_in_program'] <= 365.25/12*month_threshold]
    print('Time-series DF nrows after filtering with the taget period:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Apply the cohort selected above
    print('Participants who have 2 or more measurements in both BMI and omics:', len(cohortL))
    tempDF = tempDF.loc[tempDF['public_client_id'].isin(cohortL)]
    print('Time-series DF nrows after filtering with the number of measurements:',
          len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the participants who have all covariates (just in case)
    ##Explicit copy: new columns are added below, and they must not be written into a slice
    tempDF = tempDF.dropna().copy()
    print('Time-series DF nrows after filtering with covariates:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Check measurement distribution
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['days_in_program'], color=yvar_color)
    sns.despine()
    for knot in [0, 6, 12, 18]:
        plt.axvline(x=365.25/12*knot, **{'linestyle':'--', 'color':'gray'})
    plt.axvspan(xmin=365.25/12*18, xmax=plt.gca().get_xlim()[1], facecolor='gray', alpha=0.2)
    plt.ylabel('Density')
    plt.xlabel('Days in program')
    plt.show()
    
    #Check dependent variable distribution
    print('Skewness of the dependent variable:', stats.skew(tempDF['log_'+yvar]))
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['log_'+yvar], color=yvar_color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(yvar+' (log-scale)')
    plt.show()
    
    #Add dummy variables for the linear regression spline for time
    ##Fail here with a clear message rather than storing a text marker in a numeric column
    if not (tempDF['days_in_program'] <= knot_days[-1]).all():
        raise ValueError('days_in_program is missing or beyond the last knot. Check time span setting.')
    ##2nd piece: 0 before the 1st knot, then the days elapsed since the 1st knot
    tempDF['DaysP2'] = (tempDF['days_in_program'] - knot_days[0]).clip(lower=0.0)
    ##3rd piece: 0 before the 2nd knot, then the days elapsed since the 2nd knot
    tempDF['DaysP3'] = (tempDF['days_in_program'] - knot_days[1]).clip(lower=0.0)
    
    #Standardize numeric independent variables
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(tempDF[raw_cols])
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[raw_cols]), index=tempDF.index, columns=z_cols)
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')
    
    #Confirm examples
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    for variable in ['DaysZ', 'BaseAgeZ', 'PC1Z']:
        sns.distplot(tempDF[variable], label=variable)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(r'$Z$'+'-score')
    ##days_in_program is the first standardized column, hence index 0 of the scaler statistics
    tempS = (pd.Series(knot_days) - scaler.mean_[0]) / scaler.scale_[0]#Z-score transformation
    for knot in tempS.tolist():
        plt.axvline(x=knot, **{'linestyle':'--', 'color':'gray'})
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    
    display(tempDF)
    print('')
    
    tempD1[model_sex] = tempDF
    tempD2[model_sex] = scaler#Use later again during predictions

#Update
tsDF_BS = tempD1['Sex-mixed '+yvar+' model']
scaler_BS = tempD2['Sex-mixed '+yvar+' model']
tsDF_FM = tempD1['Sex-stratified '+yvar+' model']
scaler_FM = tempD2['Sex-stratified '+yvar+' model']

### 1-3. LMM with random intercepts and random slopes for time

> Days in the program is fitted as a numeric variable using linear regression splines (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows differences in the trajectory of BMI/bBMI change.***
>
> Random intercepts and random slopes for days in the program are used as random effects (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows individual differences in the baseline and longitudinal change over the program.***  
> –> The random slope is applied only to the overall time variable, not to the piecewise time variables; otherwise, fitting leads to "LinAlgError: Singular matrix" because most individuals have too few measurements. It is also reasonable to assume that an individual's random slope is consistent throughout the program.  
>
> ***Eliminate "Underweight" class in the baseline obesity class-stratified models.***  
> <– Otherwise, leads to "LinAlgError: Singular matrix" due to small sample population.  

In [None]:
#Fit the overall LMM and the baseline BMI class-stratified LMMs for both model types
ts_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
#(key of the model, printed label, baseline BMI class used as data subset; None = all rows)
lmm_specs = [('LMM1', 'Overall LMM', None),
             ('LMM2', 'Normal class LMM', 'Normal'),
             ('LMM3', 'Overweight class LMM', 'Overweight'),
             ('LMM4', 'Obese class LMM', 'Obese')]
#Fixed effects: the three time terms of the regression spline plus the covariates
fe_formula = ('log_'+yvar+' ~ DaysZ + DaysP2Z + DaysP3Z'
              ' + C(Sex) + BaseAgeZ + C(Season) + PC1Z + PC2Z + PC3Z + PC4Z + PC5Z')
fitted_by_key = {}
for model_sex, ts_DF in ts_by_model.items():
    print(model_sex)
    for model_n, model_label, bmi_class in lmm_specs:
        print(' - '+model_label)
        if bmi_class is None:
            lmm_data = ts_DF
        else:
            lmm_data = ts_DF.loc[ts_DF['BaseBMI_class']==bmi_class]

        #Fit LMM (random intercepts for each group are included as default)
        model = sm.MixedLM.from_formula(formula=fe_formula, data=lmm_data,
                                        groups='public_client_id',
                                        re_formula='DaysZ')
        t_start = time.time()
        #Backup optimizers if convergence fails: ['bfgs', 'lbfgs', 'cg', 'powell', 'nm']
        #After checking convergence for all models, the method is fixed through the notebook
        fit_res = model.fit(method=['powell'])
        elapsed_min, elapsed_sec = divmod(time.time() - t_start, 60)
        print('    - Elapsed time for fitting LMM:', round(elapsed_min), 'min', round(elapsed_sec, 1), 'sec')

        fitted_by_key[model_sex+' - '+model_n] = fit_res

        #Check the fitted models
        display(fit_res.summary())
        print('(Note that the estimates for fixed effects and the variances for random effects are shown in the single table.)')
        ##Visualize b-coefs and g-coefs
        ci_DF = fit_res.conf_int(alpha=0.05)
        coef_DF = pd.DataFrame({'Coef':fit_res.params,
                                'Coef_ci_l':ci_DF.iloc[:, 0],
                                'Coef_ci_h':ci_DF.iloc[:, 1],
                                'Pval':fit_res.pvalues})
        ##Half width of the interval, used as the error bar length
        coef_DF['Coef_ci'] = (coef_DF['Coef_ci_h'] - coef_DF['Coef_ci_l'])/2
        coef_DF = coef_DF.reset_index()
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(4, 0.25*len(coef_DF)))
        plt.errorbar(x=coef_DF['Coef'], y=coef_DF['index'], xerr=coef_DF['Coef_ci'],
                     fmt='ok', ecolor='k', capsize=5)
        plt.xlim(-0.1, 0.1)#Clip
        sns.despine()
        plt.xlabel(r'$\beta$'+' or '+r'$\gamma$'+' coefficient in LMM model\n(Mean with 95% CI)')
        plt.ylabel('')
        plt.axvline(x=0, linestyle='--', color='k')
        ##Shade the rows whose P-value is below 0.05
        for row_i, pval in enumerate(coef_DF['Pval']):
            if pval < 0.05:
                plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=yvar_color, alpha=0.2, zorder=0)
        plt.gca().invert_yaxis()
        plt.margins(y=0.01, tight=True)
        plt.show()
        print('')
    print('')

#Names used by the following cells
LMM1_BS = fitted_by_key['Sex-mixed '+yvar+' model - LMM1']
LMM2_BS = fitted_by_key['Sex-mixed '+yvar+' model - LMM2']
LMM3_BS = fitted_by_key['Sex-mixed '+yvar+' model - LMM3']
LMM4_BS = fitted_by_key['Sex-mixed '+yvar+' model - LMM4']
LMM1_FM = fitted_by_key['Sex-stratified '+yvar+' model - LMM1']
LMM2_FM = fitted_by_key['Sex-stratified '+yvar+' model - LMM2']
LMM3_FM = fitted_by_key['Sex-stratified '+yvar+' model - LMM3']
LMM4_FM = fitted_by_key['Sex-stratified '+yvar+' model - LMM4']

### 1-4. Estimated transition of biological BMI

> The days in program of the fitted values were different between individuals.  
> –> In statsmodels, the MixedLMResults.predict() method uses only the fixed-effects parameters for prediction.  
> ***–> To evaluate the variability in the in-sample population, the random-effects component should be added manually.***

In [None]:
def _predict_in_sample(fit_res, exogDF):
    """Predict the dependent variable on the original scale for in-sample individuals.

    The fixed-effects prediction of the fitted LMM is combined with each
    individual's random intercept and random slope, then back-transformed
    from the log scale.

    Parameters
    ----------
    fit_res : fitted LMM results object
        Its predict() method returns the fixed-effects component only, and its
        random_effects attribute is a dictionary of key=group label and
        value=pd.Series (read by position: random intercept, then random slope).
    exogDF : pd.DataFrame
        Rows to predict; needs the columns used in the model formula,
        'public_client_id' and 'DaysZ'.

    Returns
    -------
    pd.Series
        Predictions on the original scale, indexed like exogDF.

    Raises
    ------
    KeyError
        If a participant of exogDF has no random effects in fit_res.
    """
    ##Predicted component from the fixed effects
    fixed_S = fit_res.predict(exogDF)#Automatically recognize the column name in formula
    ##Predicted component from the random effects: intercept + slope * DaysZ
    re_D = fit_res.random_effects
    re_A = np.array([np.asarray(re_D[group_n]) for group_n in exogDF['public_client_id'].tolist()])
    re_A = re_A.reshape(-1, 2)
    random_A = re_A[:, 0] + re_A[:, 1] * exogDF['DaysZ'].to_numpy()
    ##Total prediction for in-sample individual, converted to original scale
    return np.exp(fixed_S + random_A)


tsD = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
scalerD = {'Sex-mixed '+yvar+' model':scaler_BS, 'Sex-stratified '+yvar+' model':scaler_FM}
overallD = {'Sex-mixed '+yvar+' model':LMM1_BS, 'Sex-stratified '+yvar+' model':LMM1_FM}
##Baseline BMI class-stratified models, in the order of the prediction columns 2, 3 and 4
classD = {'Normal':{'Sex-mixed '+yvar+' model':LMM2_BS, 'Sex-stratified '+yvar+' model':LMM2_FM},
          'Overweight':{'Sex-mixed '+yvar+' model':LMM3_BS, 'Sex-stratified '+yvar+' model':LMM3_FM},
          'Obese':{'Sex-mixed '+yvar+' model':LMM4_BS, 'Sex-stratified '+yvar+' model':LMM4_FM}}
predD = {}
for model_sex in tsD.keys():
    #Prepare DF to impute days_in_program but maintain in-sample population
    ##One row per participant x time point x season
    ##The default integer index is kept on purpose: a key made of participant and day is shared by
    ##the four season rows, and merging on such a non-unique key multiplies the rows
    tempA1 = tsD[model_sex]['public_client_id'].unique()
    tempA2 = np.arange(0, 365.25/12*month_threshold + 1, 365.25/12*month_threshold/nknots)
    tempL = ['Spring', 'Summer', 'Autumn', 'Winter']
    tempDF = pd.DataFrame(data={'public_client_id':np.repeat(tempA1, len(tempA2)*len(tempL)),
                                'days_in_program':np.tile(np.repeat(tempA2, len(tempL)), len(tempA1)),
                                'Season':np.tile(tempL, len(tempA1)*len(tempA2))})
    nrows = len(tempDF)
    ##Add the covariates unique to the in-sample individuals
    tempL = ['public_client_id', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'BaseBMI_class']
    tempDF1 = tsD[model_sex][tempL].drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##Add dummy variables for the linear regression spline for time
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    ###Fail here with a clear message rather than storing a text marker in a numeric column
    if not (tempDF['days_in_program'] <= tempL[-1]).all():
        raise ValueError('days_in_program is missing or beyond the last knot. Check time span setting.')
    ###2nd piece: 0 before the 1st knot, then the days elapsed since the 1st knot
    tempDF['DaysP2'] = (tempDF['days_in_program'] - tempL[0]).clip(lower=0.0)
    ###3rd piece: 0 before the 2nd knot, then the days elapsed since the 2nd knot
    tempDF['DaysP3'] = (tempDF['days_in_program'] - tempL[1]).clip(lower=0.0)
    ##Standardize numeric independent variables using the already fitted StandardScaler
    scaler = scalerD[model_sex]
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[tempL]), index=tempDF.index,
                           columns=['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z'])
    tempDF = tempDF.join(tempDF1)#One-to-one because the index is unique
    assert len(tempDF) == nrows, 'Rows were duplicated or lost while adding the covariates'
    
    #Model 1 prediction
    tempDF[yvar+'_predicted1'] = _predict_in_sample(overallD[model_sex], tempDF)
    
    #Model 2-4 prediction
    for model_i, bmi_class in enumerate(classD.keys(), start=2):
        ##Prepare the target class
        tempDF1 = tempDF.loc[tempDF['BaseBMI_class']==bmi_class]
        ##Add the predictions to the working DF (aligned on the index; NaN for out-of-class)
        tempDF[yvar+'_predicted'+str(model_i)] = _predict_in_sample(classD[bmi_class][model_sex], tempDF1)
    
    #Flatten effects of the dummy season variable
    tempDF = tempDF.groupby(by=['public_client_id', 'days_in_program'])
    tempDF = tempDF.agg({yvar+'_predicted1':'mean',
                         yvar+'_predicted2':'mean',
                         yvar+'_predicted3':'mean',
                         yvar+'_predicted4':'mean'})
    tempDF = tempDF.reset_index()
    ##Recover the baseline BMI class to use later
    tempL = ['public_client_id', 'BaseBMI_class']
    tempDF1 = tsD[model_sex][tempL].drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##One row per participant and time point from here, so this key is unique
    tempDF['KeyIndex'] = tempDF['public_client_id'] + '_day' + tempDF['days_in_program'].astype(str)
    tempDF = tempDF.set_index('KeyIndex')
    
    print(model_sex)
    display(tempDF)
    display(tempDF.describe(include='all'))
    
    predD[model_sex] = tempDF

metLMM_BS = predD['Sex-mixed '+yvar+' model']
metLMM_FM = predD['Sex-stratified '+yvar+' model']

In [None]:
#Check visually: observed values together with the mean LMM predictions
obs_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
pred_by_model = {'Sex-mixed '+yvar+' model':metLMM_BS, 'Sex-stratified '+yvar+' model':metLMM_FM}
period_days = 365.25/12*month_threshold
#Axis ranges and ticks shared by both plots
axis_kw = dict(xlim=(0, period_days), xticks=np.arange(0, period_days, 100),
               ylim=(17.5, 42.5), yticks=np.arange(20, 40.1, 5))
class_palette = {'Normal':'green', 'Overweight':'orange', 'Obese':'red'}
for model_sex, obs_DF in obs_by_model.items():
    print(model_sex)
    pred_DF = pred_by_model[model_sex]

    #Plot1: individual trajectories and points in gray, mean prediction with 95% CI on top
    print(' - Model 1: Overall LMM')
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(5, 3))
    sns.lineplot(data=obs_DF, x='days_in_program', y=yvar,
                 units='public_client_id', estimator=None, color='gray', lw=1, alpha=0.1, legend=None)
    sns.scatterplot(data=obs_DF, x='days_in_program', y=yvar,
                    color='gray', edgecolor='k', s=20, alpha=0.3)
    ax = sns.lineplot(data=pred_DF, x='days_in_program', y=yvar+'_predicted1',
                      estimator='mean', ci=95, color=yvar_color)
    ax.set(**axis_kw)
    sns.despine()
    plt.xlabel('Days in program')
    plt.ylabel(yvar+' [kg/m'+r'$^2$'+']')
    plt.show()
    print('')

    #Plot2: points colored by baseline BMI class, one mean prediction line per class
    print(' - Model 2-4: Baseline BMI class-stratified LMM')
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(5, 3))
    ##The Underweight class has no stratified model, so its points are not drawn
    ax = sns.scatterplot(data=obs_DF[obs_DF['BaseBMI_class']!='Underweight'],
                         x='days_in_program', y=yvar,
                         hue='BaseBMI_class', hue_order=list(class_palette), palette=class_palette,
                         edgecolor='k', s=20, alpha=0.3)
    ##Prediction columns 2, 3 and 4 follow the order of the classes in the palette
    for model_i, (bmi_class, class_color) in enumerate(class_palette.items(), start=2):
        sns.lineplot(data=pred_DF[pred_DF['BaseBMI_class']==bmi_class],
                     x='days_in_program', y=yvar+'_predicted'+str(model_i),
                     color=class_color, estimator='mean', ci=95, ax=ax)
    ax.set(**axis_kw)
    sns.despine()
    plt.xlabel('Days in program')
    plt.ylabel(yvar+' [kg/m'+r'$^2$'+']')
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    print('')

    print('')

## 2. Proteomics

In [None]:
#Dependent variable of this section and the color used for it in the plots
yvar, yvar_color = 'ProtBMI', 'r'

### 2-1. Prepare the time-series DF

In [None]:
#Load the time-series biological BMI tables (one per model type) exported as TSV
lasso_dir = './ExportData/'
lasso_prefix = '220805_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LASSO_'
suffix_by_model = {'Sex-mixed '+yvar+' model':'BothSex', 'Sex-stratified '+yvar+' model':'FemaleMale'}
ts_by_model = {}
for model_sex, sex_suffix in suffix_by_model.items():
    ts_path = lasso_dir + lasso_prefix + yvar + '-' + sex_suffix + '.tsv'
    #Participant IDs are read as strings; the KeyIndex column becomes the row index
    ts_DF = pd.read_csv(ts_path, sep='\t', dtype={'public_client_id': str}).set_index('KeyIndex')
    ts_by_model[model_sex] = ts_DF

    #Report what was loaded
    print(model_sex)
    display(ts_DF)
    print(' -> Unique ID:', len(ts_DF['public_client_id'].unique()))
    print('')

#Names used by the following cells
tsDF_BS = ts_by_model['Sex-mixed '+yvar+' model']
tsDF_FM = ts_by_model['Sex-stratified '+yvar+' model']

### 2-2. Prepare DFs for LMM with the linear regression spline for time

In [None]:
#Distribution of the measurement days in the sex-mixed table,
#with dashed lines every 6 months and the period after 18 months shaded
days_per_month = 365.25/12
sns.set(style='ticks', font='Arial', context='notebook')
plt.figure(figsize=(4, 3))
sns.distplot(tsDF_BS['days_in_program'], color=yvar_color)
sns.despine()
for knot_month in (0, 6, 12, 18):
    plt.axvline(x=days_per_month*knot_month, linestyle='--', color='gray')
ax = plt.gca()
#Shade from 18 months to the current right limit of the x axis
ax.axvspan(xmin=days_per_month*18, xmax=ax.get_xlim()[1], facecolor='gray', alpha=0.2)
plt.ylabel('Density')
plt.xlabel('Days in program')
plt.show()

In [None]:
#Settings of the longitudinal analysis
##Kept as notebook-level variables because the later prediction and plotting cells reuse them
month_threshold = 18#Length of the analyzed period [months]
nknots = 3#Number of equally spaced knots; the last knot is the end of the analyzed period
knot_days = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
raw_cols = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
z_cols = ['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']

#Select the cohort for the longitudinal analysis
##This selection does not depend on the model; hence, the files are read once here (not once per model)
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
cohort_sets = []
for fileName in ['time-series-bmiDF-without-imputation_final-cohort.tsv',#BMI
                 'time-series-combiDF-without-imputation_final-cohort.tsv']:#Omics
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= 365.25/12*month_threshold]
    tempS = tempDF1['public_client_id'].value_counts()
    ##Participants who have 2 or more measurements within the target period
    cohort_sets.append(set(tempS.loc[tempS>=2].index.tolist()))
##Select participants who have 2 or more measurements in both BMI and omics
cohortL = list(cohort_sets[0] & cohort_sets[1])

tempD = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
tempD1 = {}
tempD2 = {}
for model_sex in tempD.keys():
    print(model_sex)
    tempDF = tempD[model_sex]
    ##Drop the derived columns left by a previous run of this cell (the cell overwrites its inputs below);
    ##otherwise, the merge below would produce suffixed duplicates of the Z-score columns
    tempDF = tempDF.drop(columns=['DaysP2', 'DaysP3']+z_cols, errors='ignore')
    print('Time-series DF nrows before filtering:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select measurements for the longitudinal analysis
    tempDF = tempDF.loc[tempDF['days_in_program'] <= 365.25/12*month_threshold]
    print('Time-series DF nrows after filtering with the taget period:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Apply the cohort selected above
    print('Participants who have 2 or more measurements in both BMI and omics:', len(cohortL))
    tempDF = tempDF.loc[tempDF['public_client_id'].isin(cohortL)]
    print('Time-series DF nrows after filtering with the number of measurements:',
          len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the participants who have all covariates (just in case)
    ##Explicit copy: new columns are added below, and they must not be written into a slice
    tempDF = tempDF.dropna().copy()
    print('Time-series DF nrows after filtering with covariates:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Check measurement distribution
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['days_in_program'], color=yvar_color)
    sns.despine()
    for knot in [0, 6, 12, 18]:
        plt.axvline(x=365.25/12*knot, **{'linestyle':'--', 'color':'gray'})
    plt.axvspan(xmin=365.25/12*18, xmax=plt.gca().get_xlim()[1], facecolor='gray', alpha=0.2)
    plt.ylabel('Density')
    plt.xlabel('Days in program')
    plt.show()
    
    #Check dependent variable distribution
    print('Skewness of the dependent variable:', stats.skew(tempDF['log_'+yvar]))
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['log_'+yvar], color=yvar_color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(yvar+' (log-scale)')
    plt.show()
    
    #Add dummy variables for the linear regression spline for time
    ##Fail here with a clear message rather than storing a text marker in a numeric column
    if not (tempDF['days_in_program'] <= knot_days[-1]).all():
        raise ValueError('days_in_program is missing or beyond the last knot. Check time span setting.')
    ##2nd piece: 0 before the 1st knot, then the days elapsed since the 1st knot
    tempDF['DaysP2'] = (tempDF['days_in_program'] - knot_days[0]).clip(lower=0.0)
    ##3rd piece: 0 before the 2nd knot, then the days elapsed since the 2nd knot
    tempDF['DaysP3'] = (tempDF['days_in_program'] - knot_days[1]).clip(lower=0.0)
    
    #Standardize numeric independent variables
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(tempDF[raw_cols])
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[raw_cols]), index=tempDF.index, columns=z_cols)
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')
    
    #Confirm examples
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    for variable in ['DaysZ', 'BaseAgeZ', 'PC1Z']:
        sns.distplot(tempDF[variable], label=variable)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(r'$Z$'+'-score')
    ##days_in_program is the first standardized column, hence index 0 of the scaler statistics
    tempS = (pd.Series(knot_days) - scaler.mean_[0]) / scaler.scale_[0]#Z-score transformation
    for knot in tempS.tolist():
        plt.axvline(x=knot, **{'linestyle':'--', 'color':'gray'})
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    
    display(tempDF)
    print('')
    
    tempD1[model_sex] = tempDF
    tempD2[model_sex] = scaler#Use later again during predictions

#Update
tsDF_BS = tempD1['Sex-mixed '+yvar+' model']
scaler_BS = tempD2['Sex-mixed '+yvar+' model']
tsDF_FM = tempD1['Sex-stratified '+yvar+' model']
scaler_FM = tempD2['Sex-stratified '+yvar+' model']

### 2-3. LMM with random intercepts and random slopes for time

> Days in the program is fitted as a numeric variable using linear regression splines (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows differences in the trajectory of BMI/bBMI change.***
>
> Random intercepts and random slopes for days in the program are used as random effects (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows individual differences in the baseline and longitudinal change over the program.***  
> –> The random slope is applied only to the overall time variable, not to the piecewise time variables; otherwise, fitting fails with "LinAlgError: Singular matrix" because most individuals have too few measurements. It is also reasonable to assume that an individual's random slope is constant throughout the program.  
>
> ***Eliminate "Underweight" class in the baseline obesity class-stratified models.***  
> <– Otherwise, leads to "LinAlgError: Singular matrix" due to small sample population.  

In [None]:
#Fit the overall LMM and the baseline BMI class-stratified LMMs for each bBMI model type
ts_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
lmm_labels = {'LMM1':'Overall LMM',
              'LMM2':'Normal class LMM',
              'LMM3':'Overweight class LMM',
              'LMM4':'Obese class LMM'}
##Baseline BMI class used to subset the data of each LMM (None = no subsetting)
lmm_classes = {'LMM1':None, 'LMM2':'Normal', 'LMM3':'Overweight', 'LMM4':'Obese'}
##Fixed effects: spline terms for time + covariates (shared by all LMMs)
fe_formula = ('log_'+yvar+' ~ DaysZ + DaysP2Z + DaysP3Z'
              ' + C(Sex) + BaseAgeZ + C(Season) + PC1Z + PC2Z + PC3Z + PC4Z + PC5Z')
fitted_lmms = {}
for model_sex, ts_df in ts_by_model.items():
    print(model_sex)
    for model_n, model_label in lmm_labels.items():
        print(' - '+model_label)
        #Select the target population
        target_class = lmm_classes[model_n]
        if target_class is None:
            lmm_df = ts_df
        else:
            lmm_df = ts_df.loc[ts_df['BaseBMI_class']==target_class]
        
        #Fit LMM
        ##Random intercepts for each group are included as default; re_formula adds the random slope
        model = sm.MixedLM.from_formula(formula=fe_formula, data=lmm_df,
                                        groups='public_client_id',
                                        re_formula='DaysZ')
        t_start = time.time()
        ##Back up for failure of convergence: method=['bfgs', 'lbfgs', 'cg', 'powell', 'nm']
        ##After checking convergence for all models, the method is fixed through the notebook
        fit_res = model.fit(method=['powell'])
        minutes, seconds = divmod(time.time() - t_start, 60)
        print('    - Elapsed time for fitting LMM:', round(minutes), 'min', round(seconds, 1), 'sec')
        
        fitted_lmms[model_sex+' - '+model_n] = fit_res
        
        #Check the fitted models
        display(fit_res.summary())
        print('(Note that the estimates for fixed effects and the variances for random effects are shown in the single table.)')
        ##Collect b-coefs and g-coefs with the 95% CI (CI is computed once and reused)
        ci_df = fit_res.conf_int(alpha=0.05)
        coef_df = pd.DataFrame({'Coef':fit_res.params,
                                'Coef_ci_l':ci_df.iloc[:, 0],
                                'Coef_ci_h':ci_df.iloc[:, 1],
                                'Pval':fit_res.pvalues})
        coef_df['Coef_ci'] = (coef_df['Coef_ci_h'] - coef_df['Coef_ci_l'])/2#Half width for the error bar
        coef_df = coef_df.reset_index()
        ##Visualize b-coefs and g-coefs
        sns.set(style='ticks', font='Arial', context='notebook')
        fig, ax = plt.subplots(figsize=(4, 0.25*len(coef_df)))
        ax.errorbar(x=coef_df['Coef'], y=coef_df['index'], xerr=coef_df['Coef_ci'],
                    fmt='ok', ecolor='k', capsize=5)
        ax.set_xlim(-0.1, 0.1)#Clip
        sns.despine()
        ax.set_xlabel(r'$\beta$'+' or '+r'$\gamma$'+' coefficient in LMM model\n(Mean with 95% CI)')
        ax.set_ylabel('')
        ax.axvline(x=0, linestyle='--', color='k')
        ##Highlight the rows with P < 0.05 (categorical y positions are 0, 1, 2, ...)
        for row_i, pval in enumerate(coef_df['Pval']):
            if pval<0.05:
                ax.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=yvar_color, alpha=0.2, zorder=0)
        ax.invert_yaxis()
        ax.margins(y=0.01, tight=True)
        plt.show()
        print('')
    print('')

#Keep the fitted models under explicit names for the following cells
bs_prefix = 'Sex-mixed '+yvar+' model - '
fm_prefix = 'Sex-stratified '+yvar+' model - '
LMM1_BS, LMM2_BS, LMM3_BS, LMM4_BS = [fitted_lmms[bs_prefix+model_n] for model_n in lmm_labels]
LMM1_FM, LMM2_FM, LMM3_FM, LMM4_FM = [fitted_lmms[fm_prefix+model_n] for model_n in lmm_labels]

### 2-4. Estimated transition of biological BMI

> The days in program of the fitted values differ between individuals.  
> –> In statsmodels, the MixedLMResults.predict() method uses only the fixed-effects parameters for prediction.  
> ***–> To evaluate the variability in the in-sample population, the random-effects component must be added manually.***

In [None]:
#Predict the BMI trajectory of every in-sample participant on a common time grid
#(fixed effects from MixedLMResults.predict() + manually added random effects)
ts_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
scaler_by_model = {'Sex-mixed '+yvar+' model':scaler_BS, 'Sex-stratified '+yvar+' model':scaler_FM}
overall_lmms = {'Sex-mixed '+yvar+' model':LMM1_BS, 'Sex-stratified '+yvar+' model':LMM1_FM}
class_lmms = {'Normal':{'Sex-mixed '+yvar+' model':LMM2_BS, 'Sex-stratified '+yvar+' model':LMM2_FM},
              'Overweight':{'Sex-mixed '+yvar+' model':LMM3_BS, 'Sex-stratified '+yvar+' model':LMM3_FM},
              'Obese':{'Sex-mixed '+yvar+' model':LMM4_BS, 'Sex-stratified '+yvar+' model':LMM4_FM}}
seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
day_grid = np.arange(0, 365.25/12*month_threshold + 1, 365.25/12*month_threshold/nknots)#Days to predict
knot_days = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
covariate_cols = ['public_client_id', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'BaseBMI_class']
raw_cols = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
z_cols = ['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']
pred_cols = [yvar+'_predicted'+str(model_i) for model_i in range(1, 5)]
predicted_by_model = {}
for model_sex, ts_df in ts_by_model.items():
    #Prepare DF to impute days_in_program but maintain in-sample population
    ##One row per participant x grid day x season (default RangeIndex, i.e., unique row labels)
    ids = ts_df['public_client_id'].unique()
    grid_df = pd.DataFrame(data={'public_client_id':np.repeat(ids, len(day_grid)*len(seasons)),
                                 'days_in_program':np.tile(np.repeat(day_grid, len(seasons)), len(ids)),
                                 'Season':np.tile(seasons, len(ids)*len(day_grid))})
    ##Add the covariates unique to the in-sample individuals
    covariate_df = ts_df[covariate_cols].drop_duplicates('public_client_id', keep='first')
    grid_df = pd.merge(grid_df, covariate_df, on='public_client_id', how='left')
    ##Add dummy variables for the linear regression spline for time
    ##(2nd piece: days past the 1st knot; 3rd piece: days past the 2nd knot; 0 before each knot)
    days = grid_df['days_in_program'].to_numpy(dtype=float)
    if not (days <= knot_days[2]).all():
        #Fail loudly instead of storing a message string in a numeric column
        raise ValueError('days_in_program is missing or beyond the analyzed period. Check time span setting.')
    grid_df = grid_df.assign(DaysP2=np.maximum(days - knot_days[0], 0.0),
                             DaysP3=np.maximum(days - knot_days[1], 0.0))
    ##Standardize numeric independent variables using the already fitted StandardScaler
    ##-> Z-scores are attached by position, NOT by merging on a participant_day key:
    ##   such a key is shared by the 4 season rows, so an index merge is many-to-many
    ##   and multiplies the rows (4 -> 16 -> 256 per participant-day after the class merges)
    scaler = scaler_by_model[model_sex]
    z_values = scaler.transform(grid_df[raw_cols])
    grid_df = grid_df.assign(**{z_col:z_values[:, col_i] for col_i, z_col in enumerate(z_cols)})
    
    #Model 1 prediction
    fit_res = overall_lmms[model_sex]
    ##Predicted component from the fixed effects
    fixed_part = np.asarray(fit_res.predict(grid_df))#Automatically recognize the column name in formula
    ##Create design matrix for the random effects (random intercept, random slope for DaysZ)
    re_design = np.column_stack([np.ones(len(grid_df)), grid_df['DaysZ'].to_numpy()])
    random_effects = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
    ##Predicted component from the random effects
    random_part = [np.dot(re_design[row_i], random_effects[group_n])
                   for (row_i, group_n) in enumerate(grid_df['public_client_id'].tolist())]
    ##Total prediction for in-sample individual, converted to original scale
    grid_df[pred_cols[0]] = np.e**(fixed_part + random_part)
    
    #Model 2-4 prediction
    for model_i, (bmi_class, lmm_by_model) in enumerate(class_lmms.items(), start=2):
        fit_res = lmm_by_model[model_sex]
        ##Prepare the target class
        in_class = (grid_df['BaseBMI_class']==bmi_class).to_numpy()
        class_df = grid_df.loc[in_class]
        ##Predicted component from the fixed effects
        fixed_part = np.asarray(fit_res.predict(class_df))#Automatically recognize the column name in formula
        ##Create design matrix for the random effects
        re_design = np.column_stack([np.ones(len(class_df)), class_df['DaysZ'].to_numpy()])
        random_effects = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
        ##Predicted component from the random effects
        random_part = [np.dot(re_design[row_i], random_effects[group_n])
                       for (row_i, group_n) in enumerate(class_df['public_client_id'].tolist())]
        ##Add the predictions (original scale) to the working DF by position
        pred_col = yvar+'_predicted'+str(model_i)
        grid_df[pred_col] = np.nan#NaN for out-of-class
        grid_df.loc[in_class, pred_col] = np.e**(fixed_part + random_part)
    
    #Flatten effects of the dummy season variable (mean over the 4 seasons)
    mean_df = grid_df.groupby(by=['public_client_id', 'days_in_program'])
    mean_df = mean_df.agg({pred_col:'mean' for pred_col in pred_cols})
    mean_df = mean_df.reset_index()
    ##Recover the baseline BMI class to use later
    class_df = ts_df[['public_client_id', 'BaseBMI_class']].drop_duplicates('public_client_id', keep='first')
    mean_df = pd.merge(mean_df, class_df, on='public_client_id', how='left')
    ##Unique at this point because the season dimension has been aggregated out
    mean_df['KeyIndex'] = mean_df['public_client_id'] + '_day' + mean_df['days_in_program'].astype(str)
    mean_df = mean_df.set_index('KeyIndex')
    
    print(model_sex)
    display(mean_df)
    display(mean_df.describe(include='all'))
    
    predicted_by_model[model_sex] = mean_df

protLMM_BS = predicted_by_model['Sex-mixed '+yvar+' model']
protLMM_FM = predicted_by_model['Sex-stratified '+yvar+' model']

In [None]:
#Check visually: observed values vs. LMM-predicted trajectories (mean with 95% CI)
observed_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
predicted_by_model = {'Sex-mixed '+yvar+' model':protLMM_BS, 'Sex-stratified '+yvar+' model':protLMM_FM}
class_colors = {'Normal':'green', 'Overweight':'orange', 'Obese':'red'}
x_max = 365.25/12*month_threshold#End of the analyzed period [days]
##Axis settings shared by both plots
axis_settings = dict(xlim=(0, x_max), xticks=np.arange(0, x_max, 100),
                     ylim=(17.5, 42.5), yticks=np.arange(20, 40.1, 5))
for model_sex, observed_df in observed_by_model.items():
    print(model_sex)
    predicted_df = predicted_by_model[model_sex]
    
    #Plot1
    print(' - Model 1: Overall LMM')
    sns.set(style='ticks', font='Arial', context='notebook')
    fig, ax = plt.subplots(figsize=(5, 3))
    ##Observed trajectory of each participant
    sns.lineplot(data=observed_df, x='days_in_program', y=yvar,
                 units='public_client_id', estimator=None, color='gray', lw=1, alpha=0.1, legend=None,
                 ax=ax)
    sns.scatterplot(data=observed_df, x='days_in_program', y=yvar,
                    color='gray', edgecolor='k', s=20, alpha=0.3, ax=ax)
    ##Predicted trajectory
    sns.lineplot(data=predicted_df, x='days_in_program', y=yvar+'_predicted1',
                 estimator='mean', ci=95, color=yvar_color, ax=ax)
    ax.set(**axis_settings)
    sns.despine()
    ax.set_xlabel('Days in program')
    ax.set_ylabel(yvar+' [kg/m'+r'$^2$'+']')
    plt.show()
    print('')
    
    #Plot2
    print(' - Model 2-4: Baseline BMI class-stratified LMM')
    sns.set(style='ticks', font='Arial', context='notebook')
    fig, ax = plt.subplots(figsize=(5, 3))
    ##Observed values ("Underweight" is not modeled in the class-stratified LMMs)
    sns.scatterplot(data=observed_df.loc[observed_df['BaseBMI_class']!='Underweight'],
                    x='days_in_program', y=yvar,
                    hue='BaseBMI_class', hue_order=class_colors.keys(), palette=class_colors,
                    edgecolor='k', s=20, alpha=0.3, ax=ax)
    ##Predicted trajectory of each class (model number starts from 2)
    for model_i, (bmi_class, class_color) in enumerate(class_colors.items(), start=2):
        sns.lineplot(data=predicted_df.loc[predicted_df['BaseBMI_class']==bmi_class],
                     x='days_in_program', y=yvar+'_predicted'+str(model_i),
                     color=class_color, estimator='mean', ci=95, ax=ax)
    ax.set(**axis_settings)
    sns.despine()
    ax.set_xlabel('Days in program')
    ax.set_ylabel(yvar+' [kg/m'+r'$^2$'+']')
    ax.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    print('')
    
    print('')

## 3. Clinical labs

In [None]:
#Target biological BMI of this section and its plot color (read by the following cells)
yvar, yvar_color = 'ChemBMI', 'g'

### 3-1. Prepare the time-series DF

In [None]:
#Import time-series biological BMI (one file per bBMI model type)
file_suffixes = {'Sex-mixed '+yvar+' model':'BothSex', 'Sex-stratified '+yvar+' model':'FemaleMale'}
fileDir = './ExportData/'
ipynbName = '220805_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LASSO_'
ts_by_model = {}
for model_sex, file_suffix in file_suffixes.items():
    fileName = yvar+'-'+file_suffix+'.tsv'
    ##IDs are read as strings
    ts_df = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    ts_df = ts_df.set_index('KeyIndex')
    ts_by_model[model_sex] = ts_df
    
    print(model_sex)
    display(ts_df)
    print(' -> Unique ID:', len(ts_df['public_client_id'].unique()))
    print('')

tsDF_BS = ts_by_model['Sex-mixed '+yvar+' model']
tsDF_FM = ts_by_model['Sex-stratified '+yvar+' model']

### 3-2. Prepare DFs for LMM with the linear regression spline for time

In [None]:
#Check measurement distribution over the program (before any filtering)
sns.set(style='ticks', font='Arial', context='notebook')
fig, ax = plt.subplots(figsize=(4, 3))
sns.distplot(tsDF_BS['days_in_program'], color=yvar_color, ax=ax)
sns.despine()
##Candidate knots at 0, 6, 12, and 18 months
for knot_month in (0, 6, 12, 18):
    ax.axvline(x=365.25/12*knot_month, linestyle='--', color='gray')
##Shade the period after 18 months
ax.axvspan(xmin=365.25/12*18, xmax=ax.get_xlim()[1], facecolor='gray', alpha=0.2)
ax.set_ylabel('Density')
ax.set_xlabel('Days in program')
plt.show()

In [None]:
#Prepare the LMM-ready time-series DF (cohort selection, spline terms, Z-scores) for each bBMI model type
#Analysis settings (kept at the top level because the following cells reuse them)
month_threshold = 18#Length of the analyzed period [months]
nknots = 3#Number of equal-width pieces for time; the last "knot" is the end of the analyzed period
max_days = 365.25/12*month_threshold
knot_days = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
raw_cols = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
z_cols = ['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']

#Select the cohort for the longitudinal analysis
##Participants who have 2 or more measurements within the analyzed period in BMI (bmiDF) and in omics (combiDF)
##-> The cohort files do not depend on the bBMI model type, so they are read once, outside the loop
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
eligible_sets = []
for fileName in ['time-series-bmiDF-without-imputation_final-cohort.tsv',
                 'time-series-combiDF-without-imputation_final-cohort.tsv']:
    cohort_df = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    cohort_df = cohort_df.set_index('KeyIndex')
    cohort_df = cohort_df.loc[cohort_df['days_in_program'] <= max_days]
    n_measurements = cohort_df['public_client_id'].value_counts()
    eligible_sets.append(set(n_measurements.loc[n_measurements>=2].index.tolist()))
##Select participants who have 2 or more measurements in both BMI and omics
eligible_ids = list(eligible_sets[0] & eligible_sets[1])

ts_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
prepared_by_model = {}
scaler_by_model = {}
for model_sex, ts_df in ts_by_model.items():
    print(model_sex)
    print('Time-series DF nrows before filtering:', len(ts_df))
    print(' -> Unique ID:', len(ts_df['public_client_id'].unique()))
    
    #Select measurements for the longitudinal analysis
    ts_df = ts_df.loc[ts_df['days_in_program'] <= max_days]
    print('Time-series DF nrows after filtering with the taget period:', len(ts_df))
    print(' -> Unique ID:', len(ts_df['public_client_id'].unique()))
    
    #Apply the cohort selected above
    print('Participants who have 2 or more measurements in both BMI and omics:', len(eligible_ids))
    ts_df = ts_df.loc[ts_df['public_client_id'].isin(eligible_ids)]
    print('Time-series DF nrows after filtering with the number of measurements:',
          len(ts_df))
    print(' -> Unique ID:', len(ts_df['public_client_id'].unique()))
    
    #Select the participants who have all covariates (just in case)
    ts_df = ts_df.dropna()
    print('Time-series DF nrows after filtering with covariates:', len(ts_df))
    print(' -> Unique ID:', len(ts_df['public_client_id'].unique()))
    
    #Check measurement distribution
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    sns.distplot(ts_df['days_in_program'], color=yvar_color)
    sns.despine()
    for knot in [0, 6, 12, 18]:
        plt.axvline(x=365.25/12*knot, **{'linestyle':'--', 'color':'gray'})
    plt.axvspan(xmin=365.25/12*18, xmax=plt.gca().get_xlim()[1], facecolor='gray', alpha=0.2)
    plt.ylabel('Density')
    plt.xlabel('Days in program')
    plt.show()
    
    #Check dependent variable distribution
    print('Skewness of the dependent variable:', stats.skew(ts_df['log_'+yvar]))
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    sns.distplot(ts_df['log_'+yvar], color=yvar_color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(yvar+' (log-scale)')
    plt.show()
    
    #Add dummy variables for the linear regression spline for time
    ##2nd piece: days past the 1st knot; 3rd piece: days past the 2nd knot; 0 before each knot
    days = ts_df['days_in_program'].to_numpy(dtype=float)
    if not (days <= knot_days[2]).all():
        #Fail loudly instead of storing a message string in a numeric column
        raise ValueError('days_in_program is missing or beyond the analyzed period. Check time span setting.')
    ts_df = ts_df.assign(DaysP2=np.maximum(days - knot_days[0], 0.0),
                         DaysP3=np.maximum(days - knot_days[1], 0.0))
    
    #Standardize numeric independent variables
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(ts_df[raw_cols])
    z_values = scaler.transform(ts_df[raw_cols])
    ##Attach by position: row order is that of ts_df, and re-running the cell overwrites the
    ##Z-score columns instead of creating suffixed duplicates as an index merge would
    ts_df = ts_df.assign(**{z_col:z_values[:, col_i] for col_i, z_col in enumerate(z_cols)})
    
    #Confirm examples
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    for variable in ['DaysZ', 'BaseAgeZ', 'PC1Z']:
        sns.distplot(ts_df[variable], label=variable)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(r'$Z$'+'-score')
    ##Knots on the Z-score scale of days_in_program (1st column of the scaler)
    knot_z = (pd.Series(knot_days) - scaler.mean_[0]) / scaler.scale_[0]#Z-score transformation
    for knot in knot_z.tolist():
        plt.axvline(x=knot, **{'linestyle':'--', 'color':'gray'})
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    
    display(ts_df)
    print('')
    
    prepared_by_model[model_sex] = ts_df
    scaler_by_model[model_sex] = scaler#Use later again during predictions

#Update
tsDF_BS = prepared_by_model['Sex-mixed '+yvar+' model']
scaler_BS = scaler_by_model['Sex-mixed '+yvar+' model']
tsDF_FM = prepared_by_model['Sex-stratified '+yvar+' model']
scaler_FM = scaler_by_model['Sex-stratified '+yvar+' model']

### 3-3. LMM with random intercepts and random slopes for time

> Days in the program is fitted as a numeric variable using linear regression splines (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows differences in the trajectory of BMI/bBMI change.***
>
> Random intercepts and random slopes for days in the program are used as random effects (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows individual differences in the baseline and longitudinal change over the program.***  
> –> The random slope is applied only to the overall time variable, not to the piecewise time variables; otherwise, fitting fails with "LinAlgError: Singular matrix" because most individuals have too few measurements. It is also reasonable to assume that an individual's random slope is constant throughout the program.  
>
> ***Eliminate "Underweight" class in the baseline obesity class-stratified models.***  
> <– Otherwise, leads to "LinAlgError: Singular matrix" due to small sample population.  

In [None]:
#Fit the overall LMM and the baseline BMI class-stratified LMMs for each bBMI model type
ts_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
lmm_labels = {'LMM1':'Overall LMM',
              'LMM2':'Normal class LMM',
              'LMM3':'Overweight class LMM',
              'LMM4':'Obese class LMM'}
##Baseline BMI class used to subset the data of each LMM (None = no subsetting)
lmm_classes = {'LMM1':None, 'LMM2':'Normal', 'LMM3':'Overweight', 'LMM4':'Obese'}
##Fixed effects: spline terms for time + covariates (shared by all LMMs)
fe_formula = ('log_'+yvar+' ~ DaysZ + DaysP2Z + DaysP3Z'
              ' + C(Sex) + BaseAgeZ + C(Season) + PC1Z + PC2Z + PC3Z + PC4Z + PC5Z')
fitted_lmms = {}
for model_sex, ts_df in ts_by_model.items():
    print(model_sex)
    for model_n, model_label in lmm_labels.items():
        print(' - '+model_label)
        #Select the target population
        target_class = lmm_classes[model_n]
        if target_class is None:
            lmm_df = ts_df
        else:
            lmm_df = ts_df.loc[ts_df['BaseBMI_class']==target_class]
        
        #Fit LMM
        ##Random intercepts for each group are included as default; re_formula adds the random slope
        model = sm.MixedLM.from_formula(formula=fe_formula, data=lmm_df,
                                        groups='public_client_id',
                                        re_formula='DaysZ')
        t_start = time.time()
        ##Back up for failure of convergence: method=['bfgs', 'lbfgs', 'cg', 'powell', 'nm']
        ##After checking convergence for all models, the method is fixed through the notebook
        fit_res = model.fit(method=['powell'])
        minutes, seconds = divmod(time.time() - t_start, 60)
        print('    - Elapsed time for fitting LMM:', round(minutes), 'min', round(seconds, 1), 'sec')
        
        fitted_lmms[model_sex+' - '+model_n] = fit_res
        
        #Check the fitted models
        display(fit_res.summary())
        print('(Note that the estimates for fixed effects and the variances for random effects are shown in the single table.)')
        ##Collect b-coefs and g-coefs with the 95% CI (CI is computed once and reused)
        ci_df = fit_res.conf_int(alpha=0.05)
        coef_df = pd.DataFrame({'Coef':fit_res.params,
                                'Coef_ci_l':ci_df.iloc[:, 0],
                                'Coef_ci_h':ci_df.iloc[:, 1],
                                'Pval':fit_res.pvalues})
        coef_df['Coef_ci'] = (coef_df['Coef_ci_h'] - coef_df['Coef_ci_l'])/2#Half width for the error bar
        coef_df = coef_df.reset_index()
        ##Visualize b-coefs and g-coefs
        sns.set(style='ticks', font='Arial', context='notebook')
        fig, ax = plt.subplots(figsize=(4, 0.25*len(coef_df)))
        ax.errorbar(x=coef_df['Coef'], y=coef_df['index'], xerr=coef_df['Coef_ci'],
                    fmt='ok', ecolor='k', capsize=5)
        ax.set_xlim(-0.1, 0.1)#Clip
        sns.despine()
        ax.set_xlabel(r'$\beta$'+' or '+r'$\gamma$'+' coefficient in LMM model\n(Mean with 95% CI)')
        ax.set_ylabel('')
        ax.axvline(x=0, linestyle='--', color='k')
        ##Highlight the rows with P < 0.05 (categorical y positions are 0, 1, 2, ...)
        for row_i, pval in enumerate(coef_df['Pval']):
            if pval<0.05:
                ax.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=yvar_color, alpha=0.2, zorder=0)
        ax.invert_yaxis()
        ax.margins(y=0.01, tight=True)
        plt.show()
        print('')
    print('')

#Keep the fitted models under explicit names for the following cells
bs_prefix = 'Sex-mixed '+yvar+' model - '
fm_prefix = 'Sex-stratified '+yvar+' model - '
LMM1_BS, LMM2_BS, LMM3_BS, LMM4_BS = [fitted_lmms[bs_prefix+model_n] for model_n in lmm_labels]
LMM1_FM, LMM2_FM, LMM3_FM, LMM4_FM = [fitted_lmms[fm_prefix+model_n] for model_n in lmm_labels]

### 3-4. Estimated transition of biological BMI

> The days in program of the fitted values differ between individuals.  
> –> In statsmodels, the MixedLMResults.predict() method uses only the fixed-effects parameters for prediction.  
> ***–> To evaluate the variability in the in-sample population, the random-effects component must be added manually.***

In [None]:
#Predict the BMI trajectory of every in-sample participant on a common time grid
#(fixed effects from MixedLMResults.predict() + manually added random effects)
ts_by_model = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
scaler_by_model = {'Sex-mixed '+yvar+' model':scaler_BS, 'Sex-stratified '+yvar+' model':scaler_FM}
overall_lmms = {'Sex-mixed '+yvar+' model':LMM1_BS, 'Sex-stratified '+yvar+' model':LMM1_FM}
class_lmms = {'Normal':{'Sex-mixed '+yvar+' model':LMM2_BS, 'Sex-stratified '+yvar+' model':LMM2_FM},
              'Overweight':{'Sex-mixed '+yvar+' model':LMM3_BS, 'Sex-stratified '+yvar+' model':LMM3_FM},
              'Obese':{'Sex-mixed '+yvar+' model':LMM4_BS, 'Sex-stratified '+yvar+' model':LMM4_FM}}
seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
day_grid = np.arange(0, 365.25/12*month_threshold + 1, 365.25/12*month_threshold/nknots)#Days to predict
knot_days = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
covariate_cols = ['public_client_id', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'BaseBMI_class']
raw_cols = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
z_cols = ['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']
pred_cols = [yvar+'_predicted'+str(model_i) for model_i in range(1, 5)]
predicted_by_model = {}
for model_sex, ts_df in ts_by_model.items():
    #Prepare DF to impute days_in_program but maintain in-sample population
    ##One row per participant x grid day x season (default RangeIndex, i.e., unique row labels)
    ids = ts_df['public_client_id'].unique()
    grid_df = pd.DataFrame(data={'public_client_id':np.repeat(ids, len(day_grid)*len(seasons)),
                                 'days_in_program':np.tile(np.repeat(day_grid, len(seasons)), len(ids)),
                                 'Season':np.tile(seasons, len(ids)*len(day_grid))})
    ##Add the covariates unique to the in-sample individuals
    covariate_df = ts_df[covariate_cols].drop_duplicates('public_client_id', keep='first')
    grid_df = pd.merge(grid_df, covariate_df, on='public_client_id', how='left')
    ##Add dummy variables for the linear regression spline for time
    ##(2nd piece: days past the 1st knot; 3rd piece: days past the 2nd knot; 0 before each knot)
    days = grid_df['days_in_program'].to_numpy(dtype=float)
    if not (days <= knot_days[2]).all():
        #Fail loudly instead of storing a message string in a numeric column
        raise ValueError('days_in_program is missing or beyond the analyzed period. Check time span setting.')
    grid_df = grid_df.assign(DaysP2=np.maximum(days - knot_days[0], 0.0),
                             DaysP3=np.maximum(days - knot_days[1], 0.0))
    ##Standardize numeric independent variables using the already fitted StandardScaler
    ##-> Z-scores are attached by position, NOT by merging on a participant_day key:
    ##   such a key is shared by the 4 season rows, so an index merge is many-to-many
    ##   and multiplies the rows (4 -> 16 -> 256 per participant-day after the class merges)
    scaler = scaler_by_model[model_sex]
    z_values = scaler.transform(grid_df[raw_cols])
    grid_df = grid_df.assign(**{z_col:z_values[:, col_i] for col_i, z_col in enumerate(z_cols)})
    
    #Model 1 prediction
    fit_res = overall_lmms[model_sex]
    ##Predicted component from the fixed effects
    fixed_part = np.asarray(fit_res.predict(grid_df))#Automatically recognize the column name in formula
    ##Create design matrix for the random effects (random intercept, random slope for DaysZ)
    re_design = np.column_stack([np.ones(len(grid_df)), grid_df['DaysZ'].to_numpy()])
    random_effects = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
    ##Predicted component from the random effects
    random_part = [np.dot(re_design[row_i], random_effects[group_n])
                   for (row_i, group_n) in enumerate(grid_df['public_client_id'].tolist())]
    ##Total prediction for in-sample individual, converted to original scale
    grid_df[pred_cols[0]] = np.e**(fixed_part + random_part)
    
    #Model 2-4 prediction
    for model_i, (bmi_class, lmm_by_model) in enumerate(class_lmms.items(), start=2):
        fit_res = lmm_by_model[model_sex]
        ##Prepare the target class
        in_class = (grid_df['BaseBMI_class']==bmi_class).to_numpy()
        class_df = grid_df.loc[in_class]
        ##Predicted component from the fixed effects
        fixed_part = np.asarray(fit_res.predict(class_df))#Automatically recognize the column name in formula
        ##Create design matrix for the random effects
        re_design = np.column_stack([np.ones(len(class_df)), class_df['DaysZ'].to_numpy()])
        random_effects = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
        ##Predicted component from the random effects
        random_part = [np.dot(re_design[row_i], random_effects[group_n])
                       for (row_i, group_n) in enumerate(class_df['public_client_id'].tolist())]
        ##Add the predictions (original scale) to the working DF by position
        pred_col = yvar+'_predicted'+str(model_i)
        grid_df[pred_col] = np.nan#NaN for out-of-class
        grid_df.loc[in_class, pred_col] = np.e**(fixed_part + random_part)
    
    #Flatten effects of the dummy season variable (mean over the 4 seasons)
    mean_df = grid_df.groupby(by=['public_client_id', 'days_in_program'])
    mean_df = mean_df.agg({pred_col:'mean' for pred_col in pred_cols})
    mean_df = mean_df.reset_index()
    ##Recover the baseline BMI class to use later
    class_df = ts_df[['public_client_id', 'BaseBMI_class']].drop_duplicates('public_client_id', keep='first')
    mean_df = pd.merge(mean_df, class_df, on='public_client_id', how='left')
    ##Unique at this point because the season dimension has been aggregated out
    mean_df['KeyIndex'] = mean_df['public_client_id'] + '_day' + mean_df['days_in_program'].astype(str)
    mean_df = mean_df.set_index('KeyIndex')
    
    print(model_sex)
    display(mean_df)
    display(mean_df.describe(include='all'))
    
    predicted_by_model[model_sex] = mean_df

chemLMM_BS = predicted_by_model['Sex-mixed '+yvar+' model']
chemLMM_FM = predicted_by_model['Sex-stratified '+yvar+' model']

In [None]:
#Check visually: overlay the LMM estimates on the observed values
tempD1 = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}#Observed time-series
tempD2 = {'Sex-mixed '+yvar+' model':chemLMM_BS, 'Sex-stratified '+yvar+' model':chemLMM_FM}#LMM estimates
x_max = 365.25/12*month_threshold#End of the analyzed period [days]
axis_kws = dict(xlim=(0, x_max), xticks=np.arange(0, x_max, 100),
                ylim=(17.5, 42.5), yticks=np.arange(20, 40.1, 5))#Shared by both plots
for model_sex, tempDF1 in tempD1.items():
    print(model_sex)
    tempDF2 = tempD2[model_sex]
    
    #Plot1: observed trajectory per participant + mean (95% CI) of the overall model estimates
    print(' - Model 1: Overall LMM')
    sns.set(style='ticks', font='Arial', context='notebook')
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.lineplot(data=tempDF1, x='days_in_program', y=yvar, units='public_client_id',
                 estimator=None, color='gray', lw=1, alpha=0.1, legend=None, ax=ax)
    sns.scatterplot(data=tempDF1, x='days_in_program', y=yvar,
                    color='gray', edgecolor='k', s=20, alpha=0.3, ax=ax)
    sns.lineplot(data=tempDF2, x='days_in_program', y=yvar+'_predicted1',
                 estimator='mean', ci=95, color=yvar_color, ax=ax)
    ax.set(**axis_kws)
    sns.despine()
    ax.set_xlabel('Days in program')
    ax.set_ylabel(yvar+' [kg/m'+r'$^2$'+']')
    plt.show()
    print('')
    
    #Plot2: observed values colored by the baseline BMI class + mean (95% CI) of each class model
    print(' - Model 2-4: Baseline BMI class-stratified LMM')
    class_colors = {'Normal':'green', 'Overweight':'orange', 'Obese':'red'}
    sns.set(style='ticks', font='Arial', context='notebook')
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.scatterplot(data=tempDF1[tempDF1['BaseBMI_class']!='Underweight'],
                    x='days_in_program', y=yvar,
                    hue='BaseBMI_class', hue_order=class_colors.keys(), palette=class_colors,
                    edgecolor='k', s=20, alpha=0.3, ax=ax)
    ##The class order of the dictionary corresponds to the model number (2, 3, 4)
    for model_i, (bmi_class, class_color) in enumerate(class_colors.items(), start=2):
        sns.lineplot(data=tempDF2[tempDF2['BaseBMI_class']==bmi_class],
                     x='days_in_program', y=yvar+'_predicted'+str(model_i),
                     color=class_color, estimator='mean', ci=95, ax=ax)
    ax.set(**axis_kws)
    sns.despine()
    ax.set_xlabel('Days in program')
    ax.set_ylabel(yvar+' [kg/m'+r'$^2$'+']')
    ax.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    print('')
    
    print('')

## 4. Measured BMI

In [None]:
#Target variable of Section 4; also used as the stem of column names ('log_'+yvar, yvar+'_predicted1', ...)
yvar = 'BMI'
#Matplotlib color used for this variable in the plots of Section 4
yvar_color = 'k'

### 4-1. Prepare the time-series DF

In [None]:
#Import time-series BMI (indexed by KeyIndex; BMI_CALC is renamed to BMI)
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
fileName = 'time-series-bmiDF-without-imputation_final-cohort.tsv'
tempDF = (pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
          .set_index('KeyIndex')
          .rename(columns={'BMI_CALC':'BMI'}))

#Add obesity classification (based on BMI)
##Back-transform the log-scaled baseline BMI
tempDF['BaseBMI'] = np.e**tempDF['log_BaseBMI']
tempDF = tempDF.drop(columns=['log_BaseBMI'])
##The first matching condition wins, i.e., same as an if-elif chain over each row
tempS = tempDF['BaseBMI']
tempL = np.select([np.isnan(tempS), tempS < 18.5, tempS < 25, tempS < 30, tempS >= 30],
                  ['NotCalculated', 'Underweight', 'Normal', 'Overweight', 'Obese'],
                  default='Error?').tolist()#'Error?' is just in case
tempDF['BaseBMI_class'] = tempL
##Confirmation (one row per participant)
print('BaseBMI_class:')
tempDF1 = tempDF.drop_duplicates('public_client_id', keep='first')
n_participants = len(tempDF1)
tempS = tempDF1['BaseBMI_class'].value_counts()
tempDF1 = pd.DataFrame({'Count':tempS, 'Percentage':tempS/n_participants*100})
display(tempDF1)

display(tempDF)
print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))

tsDF = tempDF

### 4-2. Prepare DFs for LMM with the linear regression spline for time

In [None]:
#Check measurement distribution (dashed lines: 0, 6, 12, and 18 months; shaded: after 18 months)
sns.set(style='ticks', font='Arial', context='notebook')
fig, ax = plt.subplots(figsize=(4, 3))
sns.distplot(tsDF['days_in_program'], color=yvar_color, ax=ax)
sns.despine()
for knot in [0, 6, 12, 18]:
    ax.axvline(x=365.25/12*knot, linestyle='--', color='gray')
##The right edge is taken after drawing the lines, i.e., from the current x-axis limit
ax.axvspan(xmin=365.25/12*18, xmax=ax.get_xlim()[1], facecolor='gray', alpha=0.2)
ax.set_ylabel('Density')
ax.set_xlabel('Days in program')
plt.show()

In [None]:
#Prepare the DF for LMM: select the target period and cohort, add the spline terms for time,
#and standardize the numeric independent variables
tempD = {'Sex-independent '+yvar+' measurement':tsDF}
tempD1 = {}#Prepared DF per model
tempD2 = {}#Fitted StandardScaler per model
for model_sex in tempD.keys():
    print(model_sex)
    tempDF = tempD[model_sex]
    print('Time-series DF nrows before filtering:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select measurements for the longitudinal analysis
    month_threshold = 18
    tempDF = tempDF.loc[tempDF['days_in_program'] <= 365.25/12*month_threshold]
    print('Time-series DF nrows after filtering with the taget period:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the cohort for the longitudinal analysis
    ##Select participants who have 2 or more measurements in BMI
    fileDir = '../210104_Biological-BMI-paper/ExportData/'
    ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
    fileName = 'time-series-bmiDF-without-imputation_final-cohort.tsv'
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= 365.25/12*month_threshold]
    tempS = tempDF1['public_client_id'].value_counts()
    tempL1 = tempS.loc[tempS>=2].index.tolist()
    ##Select participants who have 2 or more measurements in omics
    fileDir = '../210104_Biological-BMI-paper/ExportData/'
    ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
    fileName = 'time-series-combiDF-without-imputation_final-cohort.tsv'
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= 365.25/12*month_threshold]
    tempS = tempDF1['public_client_id'].value_counts()
    tempL2 = tempS.loc[tempS>=2].index.tolist()
    ##Select participants who have 2 or more measurements in both BMI and omics
    tempL = list(set(tempL1) & set(tempL2))
    print('Participants who have 2 or more measurements in both BMI and omics:', len(tempL))
    tempDF = tempDF.loc[tempDF['public_client_id'].isin(tempL)]
    print('Time-series DF nrows after filtering with the number of measurements:',
          len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the participants who have all covariates (just in case)
    tempDF = tempDF.dropna()
    print('Time-series DF nrows after filtering with covariates:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Check measurement distribution
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['days_in_program'], color=yvar_color)
    sns.despine()
    for knot in [0, 6, 12, 18]:
        plt.axvline(x=365.25/12*knot, **{'linestyle':'--', 'color':'gray'})
    plt.axvspan(xmin=365.25/12*18, xmax=plt.gca().get_xlim()[1], facecolor='gray', alpha=0.2)
    plt.ylabel('Density')
    plt.xlabel('Days in program')
    plt.show()
    
    #Check dependent variable distribution
    print('Skewness of the dependent variable:', stats.skew(tempDF['log_'+yvar]))
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['log_'+yvar], color=yvar_color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(yvar+' (log-scale)')
    plt.show()
    
    #Add dummy variables for the linear regression spline for time
    nknots = 3
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempA = np.asarray(tempDF['days_in_program'], dtype=float)
    ##Fail explicitly rather than storing an error string in a numeric column
    if not (tempA <= tempL[-1]).all():
        raise ValueError('Error? Check time span setting.')
    ##Hinge terms: 0 before the knot, (days - knot) from the knot onward
    tempDF['DaysP2'] = np.maximum(tempA - tempL[0], 0.0)#2nd piece: from 6 months
    tempDF['DaysP3'] = np.maximum(tempA - tempL[1], 0.0)#3rd piece: from 12 months
    
    #Standardize numeric independent variables
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    scaler.fit(tempDF[tempL])
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[tempL]), index=tempDF.index,
                           columns=['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z'])
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')
    
    #Confirm examples
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    for variable in ['DaysZ', 'BaseAgeZ', 'PC1Z']:
        sns.distplot(tempDF[variable], label=variable)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(r'$Z$'+'-score')
    ##Show the knots on the Z-score scale of days_in_program (the 1st variable of the scaler)
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempS = (pd.Series(tempL) - scaler.mean_[0]) / scaler.scale_[0]#Z-score transformation
    for knot in tempS.tolist():
        plt.axvline(x=knot, **{'linestyle':'--', 'color':'gray'})
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    
    display(tempDF)
    print('')
    
    tempD1[model_sex] = tempDF
    tempD2[model_sex] = scaler#Use later again during predictions

#Update
tsDF = tempD1['Sex-independent '+yvar+' measurement']
scaler = tempD2['Sex-independent '+yvar+' measurement']

### 4-3. LMM with random intercepts and random slopes for time

> Days in the program is fitted as a numeric variable using linear regression splines (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows differences in the trajectory of BMI/bBMI change.***
>
> Random intercepts and random slopes for days in the program are used as random effects (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows individual differences in the baseline and longitudinal change over the program.***  
> –> The random slope is applied only to the overall time variable, not to the piecewise time variables; otherwise, fitting fails with "LinAlgError: Singular matrix" because most individuals have too few measurements. Also, it is reasonable to assume that the random slope is consistent during the program.  
>
> ***Eliminate "Underweight" class in the baseline obesity class-stratified models.***  
> <– Otherwise, leads to "LinAlgError: Singular matrix" due to small sample population.  

In [None]:
#Fit the overall LMM and the baseline BMI class-stratified LMMs, and check the coefficients
tempD = {'Sex-independent '+yvar+' measurement':tsDF}
tempD1 = {}#Fitted result per '<model_sex> - <model_n>'
for model_sex, tempDF in tempD.items():
    print(model_sex)
    
    tempD2 = {'LMM1':'Overall LMM',
              'LMM2':'Normal class LMM',
              'LMM3':'Overweight class LMM',
              'LMM4':'Obese class LMM'}
    ##Input DF per model: the whole cohort for LMM1, one baseline BMI class for LMM2-4
    tempD3 = {'LMM1':tempDF}
    for model_n, bmi_class in zip(['LMM2', 'LMM3', 'LMM4'], ['Normal', 'Overweight', 'Obese']):
        tempD3[model_n] = tempDF.loc[tempDF['BaseBMI_class']==bmi_class]
    ##Fixed effects: time (linear regression spline) and covariates
    tempL = ['DaysZ', 'DaysP2Z', 'DaysP3Z',
             'C(Sex)', 'BaseAgeZ', 'C(Season)', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']
    fe_formula = 'log_'+yvar+' ~ '+' + '.join(tempL)
    for model_n, model_label in tempD2.items():
        print(' - '+model_label)
        #Fit LMM
        model = sm.MixedLM.from_formula(formula=fe_formula, data=tempD3[model_n],
                                        groups='public_client_id',
                                        re_formula='DaysZ')#Random intercepts for each group are included as default
        t_start = time.time()
        #fit_res = model.fit(method=['bfgs', 'lbfgs', 'cg', 'powell', 'nm'])#Back up for failure of convergence
        fit_res = model.fit(method=['powell'])#After checking convergence for all models, fix the method through the notebook
        t_elapsed = time.time() - t_start
        t_min, t_sec = divmod(t_elapsed, 60)
        print('    - Elapsed time for fitting LMM:', round(t_min), 'min', round(t_sec, 1), 'sec')
        
        tempD1[model_sex+' - '+model_n] = fit_res
        
        #Check the fitted models
        display(fit_res.summary())
        print('(Note that the estimates for fixed effects and the variances for random effects are shown in the single table.)')
        ##Visualize b-coefs and g-coefs
        tempDF2 = fit_res.conf_int(alpha=0.05)#Columns: lower and upper bounds
        tempDF1 = pd.DataFrame({'Coef':fit_res.params,
                                'Coef_ci_l':tempDF2.iloc[:, 0],
                                'Coef_ci_h':tempDF2.iloc[:, 1],
                                'Pval':fit_res.pvalues})
        tempDF1['Coef_ci'] = (tempDF1['Coef_ci_h'] - tempDF1['Coef_ci_l'])/2#Half width for the error bar
        tempDF1 = tempDF1.reset_index()
        sns.set(style='ticks', font='Arial', context='notebook')
        fig, ax = plt.subplots(figsize=(4, 0.25*len(tempDF1)))
        ax.errorbar(x=tempDF1['Coef'], y=tempDF1['index'], xerr=tempDF1['Coef_ci'],
                    fmt='ok', ecolor='k', capsize=5)
        ax.set_xlim(-0.1, 0.1)#Clip
        sns.despine()
        ax.set_xlabel(r'$\beta$'+' or '+r'$\gamma$'+' coefficient in LMM model\n(Mean with 95% CI)')
        ax.set_ylabel('')
        ax.axvline(x=0, linestyle='--', color='k')
        ##Highlight the rows with P < 0.05
        for row_i, pval in enumerate(tempDF1['Pval']):
            if pval<0.05:
                ax.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=yvar_color, alpha=0.2, zorder=0)
        ax.invert_yaxis()
        ax.margins(y=0.01, tight=True)
        plt.show()
        print('')
    print('')

LMM1 = tempD1['Sex-independent '+yvar+' measurement - LMM1']
LMM2 = tempD1['Sex-independent '+yvar+' measurement - LMM2']
LMM3 = tempD1['Sex-independent '+yvar+' measurement - LMM3']
LMM4 = tempD1['Sex-independent '+yvar+' measurement - LMM4']

### 4-4. Estimated transition of biological BMI

> The days in program of the fitted values were different between individuals.  
> –> In statsmodels, the MixedLMResults.predict() method uses only the fixed effects parameters for prediction.  
> ***–> To evaluate the variability in the in-sample population, the random effects component should be added manually.***

In [None]:
#Estimate BMI of each in-sample participant at common time points (0, 6, 12, and 18 months)
#with the fitted LMMs: fixed effects prediction + participant-specific random effects.
#NOTE: KeyIndex is shared by the 4 season rows of each participant/time point (non-unique index).
#      Hence, new columns are added by position; merging on the index would be many-to-many
#      and multiply the rows (4 -> 16 -> 256 rows per KeyIndex).
tempD1 = {'Sex-independent '+yvar+' measurement':tsDF}
tempD2 = {'Sex-independent '+yvar+' measurement':scaler}
tempD3 = {'Sex-independent '+yvar+' measurement':LMM1}
tempD4 = {'Normal':{'Sex-independent '+yvar+' measurement':LMM2},
          'Overweight':{'Sex-independent '+yvar+' measurement':LMM3},
          'Obese':{'Sex-independent '+yvar+' measurement':LMM4}}
tempD5 = {}
for model_sex in tempD1.keys():
    #Prepare DF to impute days_in_program but maintain in-sample population
    ##Rows: participant x time point x season
    tempDF = tempD1[model_sex]
    tempA1 = tempDF['public_client_id'].unique()
    tempA2 = np.arange(0, 365.25/12*month_threshold + 1, 365.25/12*month_threshold/nknots)
    tempL = ['Spring', 'Summer', 'Autumn', 'Winter']
    tempDF = pd.DataFrame(data={'public_client_id':np.repeat(tempA1, len(tempA2)*len(tempL)),
                                'days_in_program':np.tile(np.repeat(tempA2, len(tempL)), len(tempA1)),
                                'Season':np.tile(tempL, len(tempA1)*len(tempA2))})
    ##Add the covariates unique to the in-sample individuals
    tempDF1 = tempD1[model_sex]
    tempL = ['public_client_id', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'BaseBMI_class']
    tempDF1 = tempDF1[tempL]
    tempDF1 = tempDF1.drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##Add dummy variables for the linear regression spline for time
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempA = np.asarray(tempDF['days_in_program'], dtype=float)
    ###Fail explicitly rather than storing an error string in a numeric column
    if not (tempA <= tempL[-1]).all():
        raise ValueError('Error? Check time span setting.')
    ###Hinge terms: 0 before the knot, (days - knot) from the knot onward
    tempDF['DaysP2'] = np.maximum(tempA - tempL[0], 0.0)#2nd piecewise: from 6 months
    tempDF['DaysP3'] = np.maximum(tempA - tempL[1], 0.0)#3rd piecewise: from 12 months
    ##Add a dummy index to use when merging later
    tempDF['KeyIndex'] = tempDF['public_client_id'] + '_day' + tempDF['days_in_program'].astype(str)
    tempDF = tempDF.set_index('KeyIndex')
    ##Standardize numeric independent variables using the already fitted StandardScaler
    scaler = tempD2[model_sex]
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    tempL1 = ['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z']
    tempA = np.asarray(scaler.transform(tempDF[tempL]))
    for col_i, col_n in enumerate(tempL1):
        tempDF[col_n] = tempA[:, col_i]#By position (see NOTE at the top)
    
    #Model 1 prediction
    fit_res = tempD3[model_sex]
    ##Predicted component from the fixed effects
    tempS = fit_res.predict(tempDF)#Automatically recognize the column name in formula
    ##Create design matrix for the random effects (columns: intercept, slope for DaysZ)
    re_design = np.column_stack([np.ones(len(tempDF)), np.asarray(tempDF['DaysZ'], dtype=float)])
    tempD = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
    ##Predicted component from the random effects
    tempL = [np.dot(re_design[row_i], tempD[group_n])
             for (row_i, group_n) in enumerate(tempDF['public_client_id'].tolist())]
    ##Total prediction for in-sample individual, converted to original scale
    tempDF[yvar+'_predicted1'] = np.e**(np.asarray(tempS) + np.asarray(tempL))
    
    #Model 2-4 prediction
    for model_i, bmi_class in enumerate(tempD4.keys(), start=2):
        fit_res = tempD4[bmi_class][model_sex]
        ##Prepare the target class
        class_mask = np.asarray(tempDF['BaseBMI_class']==bmi_class)
        tempDF1 = tempDF.loc[class_mask]
        ##Predicted component from the fixed effects
        tempS = fit_res.predict(tempDF1)#Automatically recognize the column name in formula
        ##Create design matrix for the random effects (columns: intercept, slope for DaysZ)
        re_design = np.column_stack([np.ones(len(tempDF1)), np.asarray(tempDF1['DaysZ'], dtype=float)])
        tempD = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
        ##Predicted component from the random effects
        tempL = [np.dot(re_design[row_i], tempD[group_n])
                 for (row_i, group_n) in enumerate(tempDF1['public_client_id'].tolist())]
        ##Total prediction for in-sample individual, converted to original scale
        tempA = np.full(len(tempDF), np.nan)#NaN for out-of-class
        tempA[class_mask] = np.e**(np.asarray(tempS) + np.asarray(tempL))
        ##Add the predictions to the working DF
        tempDF[yvar+'_predicted'+str(model_i)] = tempA
    
    #Flatten effects of the dummy season variable
    tempDF = tempDF.groupby(by=['public_client_id', 'days_in_program'])
    tempDF = tempDF.agg({yvar+'_predicted1':'mean',
                         yvar+'_predicted2':'mean',
                         yvar+'_predicted3':'mean',
                         yvar+'_predicted4':'mean'})
    tempDF = tempDF.reset_index()
    ##Recover the baseline BMI class to use later
    tempDF1 = tempD1[model_sex]
    tempL = ['public_client_id', 'BaseBMI_class']
    tempDF1 = tempDF1[tempL].drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##KeyIndex is unique from here (one row per participant/time point)
    tempDF['KeyIndex'] = tempDF['public_client_id'] + '_day' + tempDF['days_in_program'].astype(str)
    tempDF = tempDF.set_index('KeyIndex')
    
    print(model_sex)
    display(tempDF)
    display(tempDF.describe(include='all'))
    
    tempD5[model_sex] = tempDF

measLMM = tempD5['Sex-independent '+yvar+' measurement']

In [None]:
#Check visually: overlay the LMM estimates on the observed values
tempD1 = {'Sex-independent '+yvar+' measurement':tsDF}#Observed time-series
tempD2 = {'Sex-independent '+yvar+' measurement':measLMM}#LMM estimates
x_max = 365.25/12*month_threshold#End of the analyzed period [days]
axis_kws = dict(xlim=(0, x_max), xticks=np.arange(0, x_max, 100),
                ylim=(17.5, 42.5), yticks=np.arange(20, 40.1, 5))#Shared by both plots
for model_sex, tempDF1 in tempD1.items():
    print(model_sex)
    tempDF2 = tempD2[model_sex]
    
    #Plot1: observed trajectory per participant + mean (95% CI) of the overall model estimates
    print(' - Model 1: Overall LMM')
    sns.set(style='ticks', font='Arial', context='notebook')
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.lineplot(data=tempDF1, x='days_in_program', y=yvar, units='public_client_id',
                 estimator=None, color='gray', lw=1, alpha=0.1, legend=None, ax=ax)
    sns.scatterplot(data=tempDF1, x='days_in_program', y=yvar,
                    color='gray', edgecolor='k', s=20, alpha=0.3, ax=ax)
    sns.lineplot(data=tempDF2, x='days_in_program', y=yvar+'_predicted1',
                 estimator='mean', ci=95, color=yvar_color, ax=ax)
    ax.set(**axis_kws)
    sns.despine()
    ax.set_xlabel('Days in program')
    ax.set_ylabel(yvar+' [kg/m'+r'$^2$'+']')
    plt.show()
    print('')
    
    #Plot2: observed values colored by the baseline BMI class + mean (95% CI) of each class model
    print(' - Model 2-4: Baseline BMI class-stratified LMM')
    class_colors = {'Normal':'green', 'Overweight':'orange', 'Obese':'red'}
    sns.set(style='ticks', font='Arial', context='notebook')
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.scatterplot(data=tempDF1[tempDF1['BaseBMI_class']!='Underweight'],
                    x='days_in_program', y=yvar,
                    hue='BaseBMI_class', hue_order=class_colors.keys(), palette=class_colors,
                    edgecolor='k', s=20, alpha=0.3, ax=ax)
    ##The class order of the dictionary corresponds to the model number (2, 3, 4)
    for model_i, (bmi_class, class_color) in enumerate(class_colors.items(), start=2):
        sns.lineplot(data=tempDF2[tempDF2['BaseBMI_class']==bmi_class],
                     x='days_in_program', y=yvar+'_predicted'+str(model_i),
                     color=class_color, estimator='mean', ci=95, ax=ax)
    ax.set(**axis_kws)
    sns.despine()
    ax.set_xlabel('Days in program')
    ax.set_ylabel(yvar+' [kg/m'+r'$^2$'+']')
    ax.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    print('')
    
    print('')

## 5. Comparison of estimates b/w BMI types

### 5-1. Overall cohort and baseline BMI class stratification

In [None]:
#For presentation: compare the estimated longitudinal changes [%] between the measured BMI
#and the omics-inferred BMIs, for the overall cohort and for each baseline BMI class
tempD = {'BMI':'k', 'MetBMI':'b', 'ProtBMI':'r', 'ChemBMI':'g'}#BMI type -> line color
tempL1 = [metLMM_BS, protLMM_BS, chemLMM_BS]
tempL2 = [metLMM_FM, protLMM_FM, chemLMM_FM]
tempD1 = {'Sex-mixed bBMI model':tempL1, 'Sex-stratified bBMI model':tempL2}
tempD2 = {'Sex-mixed bBMI model':'BothSex', 'Sex-stratified bBMI model':'FemaleMale'}#For file names
##Standard Matplotlib legend handles generated manually (shared by all figures)
##(The loop variable is not named "yvar" to keep the notebook-level yvar intact.)
legend_handles = [mlines.Line2D([], [], color=tempD[bmi_type], label=bmi_type, linewidth=3)
                  for bmi_type in tempD.keys()]

for model_sex in tempD1.keys():
    print(model_sex)
    
    #Merge prediction DFs
    tempDF = measLMM
    tempL = tempD1[model_sex]
    for tempDF1 in tempL:
        ##Drop the columns already in the working DF (IDs, days, and baseline BMI class)
        tempDF1 = tempDF1.loc[:, ~tempDF1.columns.isin(tempDF.columns.tolist())]
        tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='outer')
    
    #Calculate longitudinal change
    tempL = tempDF.loc[:, tempDF.columns.str.contains('_predicted')].columns.tolist()
    for col_n in tempL:
        tempDF1 = tempDF[['public_client_id', 'days_in_program', col_n]].dropna()
        #Retrieve the estimate at days = 0 for each participant
        tempDF2 = tempDF1.loc[tempDF1['days_in_program']==0.0]
        tempS = tempDF2.reset_index().set_index('public_client_id')[col_n]
        tempS.name = 'day0'
        #Merge
        tempDF1 = pd.merge(tempDF1, tempS, left_on='public_client_id', right_index=True, how='left')
        #Calculate longitudinal change [%]
        label = col_n.replace('_predicted', 'change_predicted')
        tempDF1[label] = (tempDF1[col_n] - tempDF1['day0']) / tempDF1['day0'] * 100
        #Add to the working DF
        tempDF = pd.merge(tempDF, tempDF1[label], left_index=True, right_index=True, how='left')
    
    #Model 1
    print(' - Model 1: Overall LMM')
    ##Prepare DF (long format; one row per participant, time point, and BMI type)
    tempL = tempDF.loc[:, tempDF.columns.str.contains('change_predicted1')].columns.tolist()
    tempDF1 = tempDF.reset_index().melt(var_name='LMM', value_name='PredictedChange', value_vars=tempL,
                                        id_vars=['public_client_id', 'days_in_program', 'BaseBMI_class'])
    tempDF1 = tempDF1.dropna()
    tempDF1['LMM'] = tempDF1['LMM'].str.replace('change_predicted1', '', regex=False)
    ##Check sample size
    tempL = []
    for bmi_type in tempD.keys():
        tempA = tempDF1['public_client_id'].loc[tempDF1['LMM']==bmi_type].unique()
        tempL.append(bmi_type+' (n = '+str(len(tempA))+')')
    print('  -> Confirm each sample size:', tempL)
    ##Visualization
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4.5, 3))
    p = sns.lineplot(data=tempDF1, x='days_in_program', y='PredictedChange',
                     hue='LMM', hue_order=tempD.keys(), palette=tempD, estimator='mean', ci=95)
    p.set(xlim=(0, 365.25/12*12), xticks=np.arange(0, 365.25/12*12+1, 90),
          ylim=(-7, 1), yticks=np.arange(-6, 0.1, 2))
    sns.despine()
    p.set_title('Overall cohort', {'fontsize':'large'})
    plt.xlabel('Days in program')
    plt.ylabel('Estimated change [%]')
    plt.axhline(y=0, **{'linestyle':'--', 'color':'k'}, zorder=0)
    plt.legend(handles=legend_handles, fontsize='medium', title='LMM estimate', title_fontsize='large',
               bbox_to_anchor=(0.5, 0), loc='upper center', ncol=2, borderaxespad=4,
               handlelength=1.5, handletextpad=0.5, columnspacing=1.0, frameon=True)
    ###Save
    fileDir = './ExportFigures/'
    ipynbName = '220806_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LMM-ver2_'
    fileName = 'overall-'+tempD2[model_sex]+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    
    #Model 2-4
    print(' - Model 2-4: Baseline BMI class-stratified LMM')
    bmi_classes = ['Normal', 'Overweight', 'Obese']
    ##Prepare DF (the pattern is a regular expression: model number 2, 3, or 4)
    tempL = tempDF.loc[:, tempDF.columns.str.contains('change_predicted[234]', regex=True)].columns.tolist()
    tempDF1 = tempDF.reset_index().melt(var_name='LMM', value_name='PredictedChange', value_vars=tempL,
                                        id_vars=['public_client_id', 'days_in_program', 'BaseBMI_class'])
    tempDF1 = tempDF1.dropna()
    ###regex=True must be explicit: the default of Series.str.replace is regex=False from pandas 2.0
    tempDF1['LMM'] = tempDF1['LMM'].str.replace('change_predicted[234]', '', regex=True)
    ##Check sample size
    for bmi_class in bmi_classes:
        print('   - '+bmi_class+' class based on the baseline BMI')
        tempDF2 = tempDF1.loc[tempDF1['BaseBMI_class']==bmi_class]
        tempL = []
        for bmi_type in tempD.keys():
            tempA = tempDF2['public_client_id'].loc[tempDF2['LMM']==bmi_type].unique()
            tempL.append(bmi_type+' (n = '+str(len(tempA))+')')
        print('     -> Confirm each sample size:', tempL)
    ##Visualization (one row per baseline BMI class)
    sns.set(style='ticks', font='Arial', context='talk')
    p = sns.relplot(kind='line', data=tempDF1, x='days_in_program', y='PredictedChange',
                    row='BaseBMI_class', row_order=bmi_classes,
                    hue='LMM', hue_order=tempD.keys(), palette=tempD, estimator='mean', ci=95,
                    height=3, aspect=4.5/3+0.125, legend=False)#Manual adjustment
    p.set(xlim=(0, 365.25/12*12), xticks=np.arange(0, 365.25/12*12+1, 90),
          ylim=(-7, 1), yticks=np.arange(-6, 0.1, 2))
    tempL = [bmi_class+' class' for bmi_class in bmi_classes]
    for ax, ax_title in zip(p.axes.ravel(), tempL):
        #Draw initial line
        ax.axhline(y=0, **{'linestyle':'--', 'color':'k'}, zorder=0)
        #Change facet label
        ax.set_title(ax_title, {'fontsize':'large'})
    ###Reset and generate common axis title (x label on the bottom row, y label on the middle row)
    for row_i, ax in enumerate(p.axes.ravel()):
        if row_i == 2:
            ax.set_xlabel('Days in program')
            ax.set_ylabel('')
        elif row_i == 1:
            ax.set_xlabel('')
            ax.set_ylabel('Estimated change [%]')
        else:
            ax.set_xlabel('')
            ax.set_ylabel('')
    plt.tight_layout()
    ###The legend is attached to the current (last drawn) axes
    plt.legend(handles=legend_handles, fontsize='medium', title='LMM estimate', title_fontsize='large',
               bbox_to_anchor=(0.5, 0), loc='upper center', ncol=2, borderaxespad=4,
               handlelength=1.5, handletextpad=0.5, columnspacing=1.0, frameon=True)
    ###Save
    fileDir = './ExportFigures/'
    ipynbName = '220806_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LMM-ver2_'
    fileName = 'baselineBMIclass-'+tempD2[model_sex]+'.tif'
    plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                      pil_kwargs={'compression':'tiff_lzw'})
    plt.show()
    
    print('')

## 6. Misclassification (vs. MetBMI class)

In [None]:
#Biological BMI whose baseline class is compared with the baseline BMI class in Section 6;
#used to build the input file name and the 'Base'+vs_bbmi+'_class' column name
vs_bbmi = 'MetBMI'

### 6-1. Prepare misclassification

In [None]:
#Prepare misclassification
##-> Label each participant 'Matched' when the baseline measured-BMI class equals the
##   baseline class of the reference biological BMI, and 'Mismatched' otherwise
base_class_col = 'Base'+vs_bbmi+'_class'
misclass_col = 'vs_Base'+vs_bbmi+'_class'
model_suffixes = {'Sex-mixed '+vs_bbmi+' model':'BothSex', 'Sex-stratified '+vs_bbmi+' model':'FemaleMale'}
misclass_by_model = {}
for model_sex, suffix in model_suffixes.items():
    #Import time-series biological BMI and keep one row (the first) per participant
    pred_path = ('./ExportData/'
                 + '220805_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LASSO_'
                 + vs_bbmi+'-'+suffix+'.tsv')
    classDF = (pd.read_csv(pred_path, sep='\t', dtype={'public_client_id': str})
               [['public_client_id', 'BaseBMI_class', base_class_col]]
               .drop_duplicates('public_client_id', keep='first')
               .set_index('public_client_id'))
    
    #Misclassification (element-wise comparison of the two class columns)
    classDF[misclass_col] = ['Matched' if bmi_class==vs_bbmi_class else 'Mismatched'
                             for bmi_class, vs_bbmi_class
                             in zip(classDF['BaseBMI_class'], classDF[base_class_col])]
    misclass_by_model[model_sex] = classDF
    
    #Check
    print(model_sex)
    display(classDF)
    ##Class sizes by measured BMI vs. by the reference biological BMI
    class_counts = pd.merge(classDF['BaseBMI_class'].value_counts(),
                            classDF[base_class_col].value_counts(),
                            left_index=True, right_index=True, how='outer')
    display(class_counts)
    ##Matched/Mismatched counts within each measured-BMI class
    display(classDF.groupby(by='BaseBMI_class')[misclass_col].value_counts())
    print('')

misclassDF_BS = misclass_by_model['Sex-mixed '+vs_bbmi+' model']
misclassDF_FM = misclass_by_model['Sex-stratified '+vs_bbmi+' model']

### 6-2. Metabolomics

> Repeat the above code, while adding the misclassification info.  

In [None]:
#Dependent variable of this subsection; used to build file names, column names ('log_'+yvar) and labels
yvar = 'MetBMI'
#Color of this variable in the diagnostic plots of this subsection
yvar_color = 'b'

#### 6-2-1. Prepare the time-series DF

In [None]:
#Load the longitudinal predictions of each model and attach the misclassification label
pred_suffixes = {'Sex-mixed '+yvar+' model':'BothSex', 'Sex-stratified '+yvar+' model':'FemaleMale'}
misclass_frames = {'Sex-mixed '+yvar+' model':misclassDF_BS, 'Sex-stratified '+yvar+' model':misclassDF_FM}
ts_by_model = {}
for model_sex, suffix in pred_suffixes.items():
    #Import time-series biological BMI
    pred_path = ('./ExportData/'
                 + '220805_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LASSO_'
                 + yvar+'-'+suffix+'.tsv')
    predDF = pd.read_csv(pred_path, sep='\t', dtype={'public_client_id': str}).set_index('KeyIndex')
    
    #Add the misclassification (per-participant label broadcast to every measurement)
    predDF = pd.merge(predDF, misclass_frames[model_sex]['vs_Base'+vs_bbmi+'_class'],
                      left_on='public_client_id', right_index=True, how='left')
    ts_by_model[model_sex] = predDF
    
    print(model_sex)
    display(predDF)
    print(' -> Unique ID:', len(predDF['public_client_id'].unique()))
    print('')

tsDF_BS = ts_by_model['Sex-mixed '+yvar+' model']
tsDF_FM = ts_by_model['Sex-stratified '+yvar+' model']

#### 6-2-2. Prepare DFs for LMM with the linear regression spline for time

In [None]:
#Build the LMM input table for each model: restrict to the target period and cohort,
#add the linear regression spline terms for time, and standardize the numeric covariates.
##Analysis settings (kept as notebook-level variables because later cells reuse them)
month_threshold = 18#Length of the target period [months]
nknots = 3#Number of equally spaced knots; the last knot is the end of the target period
max_days = 365.25/12*month_threshold
knot_days = [max_days/nknots * (knot+1) for knot in range(nknots)]#Days of knots

#Select the cohort for the longitudinal analysis
##-> Independent of the model, hence the files are read once instead of once per model
fileDir = '../210104_Biological-BMI-paper/ExportData/'
ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
repeated_ids = {}
for data_type in ['bmiDF', 'combiDF']:#Time-series BMI and time-series omics, respectively
    fileName = 'time-series-'+data_type+'-without-imputation_final-cohort.tsv'
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= max_days]
    tempS = tempDF1['public_client_id'].value_counts()
    ##Participants who have 2 or more measurements within the target period
    repeated_ids[data_type] = set(tempS.loc[tempS>=2].index.tolist())
##Participants who have 2 or more measurements in both BMI and omics
cohort_ids = list(repeated_ids['bmiDF'] & repeated_ids['combiDF'])

tempD = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
tempD1 = {}
tempD2 = {}
for model_sex in tempD.keys():
    print(model_sex)
    tempDF = tempD[model_sex]
    print('Time-series DF nrows before filtering:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select measurements for the longitudinal analysis
    tempDF = tempDF.loc[tempDF['days_in_program'] <= max_days]
    print('Time-series DF nrows after filtering with the taget period:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the cohort for the longitudinal analysis
    print('Participants who have 2 or more measurements in both BMI and omics:', len(cohort_ids))
    tempDF = tempDF.loc[tempDF['public_client_id'].isin(cohort_ids)]
    print('Time-series DF nrows after filtering with the number of measurements:',
          len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the participants who have all covariates (just in case)
    tempDF = tempDF.dropna()
    print('Time-series DF nrows after filtering with covariates:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Check measurement distribution
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['days_in_program'], color=yvar_color)
    sns.despine()
    for knot in [0, 6, 12, 18]:
        plt.axvline(x=365.25/12*knot, **{'linestyle':'--', 'color':'gray'})
    plt.axvspan(xmin=365.25/12*18, xmax=plt.gca().get_xlim()[1], facecolor='gray', alpha=0.2)
    plt.ylabel('Density')
    plt.xlabel('Days in program')
    plt.show()
    
    #Check dependent variable distribution
    print('Skewness of the dependent variable:', stats.skew(tempDF['log_'+yvar]))
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['log_'+yvar], color=yvar_color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(yvar+' (log-scale)')
    plt.show()
    
    #Add dummy variables for the linear regression spline for time
    ##DaysP2 = days elapsed after the 1st knot (0 before it); DaysP3 = days elapsed after the 2nd knot
    tempS = tempDF['days_in_program']
    if not (tempS <= knot_days[2]).all():
        ##Fail loudly instead of writing error strings into the numeric columns
        raise ValueError('days_in_program is missing or beyond the last knot. Check time span setting.')
    tempDF['DaysP2'] = np.maximum(tempS - knot_days[0], 0.0)
    tempDF['DaysP3'] = np.maximum(tempS - knot_days[1], 0.0)
    
    #Standardize numeric independent variables
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    scaler.fit(tempDF[tempL])
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[tempL]), index=tempDF.index,
                           columns=['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z'])
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')
    
    #Confirm examples
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    for variable in ['DaysZ', 'BaseAgeZ', 'PC1Z']:
        sns.distplot(tempDF[variable], label=variable)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(r'$Z$'+'-score')
    ##Knots on the Z-score scale of days_in_program (the 1st variable given to the scaler)
    tempS = (pd.Series(knot_days) - scaler.mean_[0]) / scaler.scale_[0]#Z-score transformation
    for knot in tempS.tolist():
        plt.axvline(x=knot, **{'linestyle':'--', 'color':'gray'})
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    
    #Sort to make 'Matched' the standard (bcoef = 0) in LMM (Matched < Mismatched)
    tempDF = tempDF.sort_values(by=['vs_Base'+vs_bbmi+'_class'], ascending=True)
    
    display(tempDF)
    print('')
    
    tempD1[model_sex] = tempDF
    tempD2[model_sex] = scaler#Use later again during predictions

#Update
tsDF_BS = tempD1['Sex-mixed '+yvar+' model']
scaler_BS = tempD2['Sex-mixed '+yvar+' model']
tsDF_FM = tempD1['Sex-stratified '+yvar+' model']
scaler_FM = tempD2['Sex-stratified '+yvar+' model']

#### 6-2-3. LMM with random intercepts and random slopes for time

> Days in the program is fitted as a numeric variable using linear regression splines (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows differences in the trajectory of BMI/bBMI change.***
>
> Random intercepts and random slopes for days in the program are used as random effects (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows individual differences in the baseline and longitudinal change over the program.***  
> –> Random slope is applied not for the piecewise variable of time but only for the overall variable of time; otherwise, leads to "LinAlgError: Singular matrix" due to small sample size per most individuals. Also, it is not so weird to assume the random slope is consistent during the program.  
>
> ***Eliminate "Underweight" class in the baseline obesity class-stratified models.***  
> <– Otherwise, leads to "LinAlgError: Singular matrix" due to small sample population.  

In [None]:
#Fit the baseline BMI class-stratified LMMs (Normal and Obese classes) for each model
##Fixed effects: spline terms for time, covariates, the misclassification label,
##               and the interaction of the label with each spline term
misclass_term = 'C(vs_Base'+vs_bbmi+'_class)'
fe_formula = ('log_'+yvar+' ~ DaysZ + DaysP2Z + DaysP3Z'
              ' + C(Sex) + BaseAgeZ + C(Season) + PC1Z + PC2Z + PC3Z + PC4Z + PC5Z'
              ' + '+misclass_term+' + DaysZ:'+misclass_term
              +' + DaysP2Z:'+misclass_term+' + DaysP3Z:'+misclass_term)
class_by_model_n = {'LMM5':'Normal', 'LMM6':'Obese'}
ts_frames = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
fitted_lmms = {}
for model_sex, tsDF in ts_frames.items():
    print(model_sex)
    for model_n, bmi_class in class_by_model_n.items():
        print(' - '+bmi_class+' class LMM')
        #Fit LMM on the participants of the target baseline BMI class
        classDF = tsDF.loc[tsDF['BaseBMI_class']==bmi_class]
        model = sm.MixedLM.from_formula(formula=fe_formula, data=classDF,
                                        groups='public_client_id',
                                        re_formula='DaysZ')#Random intercepts for each group are included as default
        t_start = time.time()
        ##Back up for failure of convergence: method=['bfgs', 'lbfgs', 'cg', 'powell', 'nm']
        ##-> After checking convergence for all models, the method is fixed through the notebook
        fit_res = model.fit(method=['powell'])
        elapsed_min, elapsed_sec = divmod(time.time() - t_start, 60)
        print('    - Elapsed time for fitting LMM:', round(elapsed_min), 'min', round(elapsed_sec, 1), 'sec')
        fitted_lmms[model_sex+' - '+model_n] = fit_res
        
        #Check the fitted models
        display(fit_res.summary())
        print('(Note that the estimates for fixed effects and the variances for random effects are shown in the single table.)')
        ##Visualize b-coefs and g-coefs
        ciDF = fit_res.conf_int(alpha=0.05)
        coefDF = pd.DataFrame({'Coef':fit_res.params,
                               'Coef_ci_l':ciDF.iloc[:, 0],
                               'Coef_ci_h':ciDF.iloc[:, 1],
                               'Pval':fit_res.pvalues})
        coefDF['Coef_ci'] = (coefDF['Coef_ci_h'] - coefDF['Coef_ci_l'])/2#Half width of the CI
        coefDF = coefDF.reset_index()
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(4, 0.25*len(coefDF)))
        plt.errorbar(x=coefDF['Coef'], y=coefDF['index'], xerr=coefDF['Coef_ci'],
                     fmt='ok', ecolor='k', capsize=5)
        plt.xlim(-0.1, 0.1)#Clip
        sns.despine()
        plt.xlabel(r'$\beta$'+' or '+r'$\gamma$'+' coefficient in LMM model\n(Mean with 95% CI)')
        plt.ylabel('')
        plt.axvline(x=0, **{'linestyle':'--', 'color':'k'})
        ##Shade the rows with P < 0.05
        for row_i, pval in enumerate(coefDF['Pval']):
            if pval<0.05:
                plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=yvar_color, alpha=0.2, zorder=0)
        plt.gca().invert_yaxis()
        plt.margins(y=0.01, tight=True)
        plt.show()
        print('')
    print('')

LMM5_BS = fitted_lmms['Sex-mixed '+yvar+' model - LMM5']
LMM6_BS = fitted_lmms['Sex-mixed '+yvar+' model - LMM6']
LMM5_FM = fitted_lmms['Sex-stratified '+yvar+' model - LMM5']
LMM6_FM = fitted_lmms['Sex-stratified '+yvar+' model - LMM6']

#### 6-2-4. Estimated transition of biological BMI

> The days in program of the fitted values were different between individuals.  
> –> In statsmodels, the MixedLMResults.predict() method uses only the fixed effects parameters for prediction.  
> ***–> To evaluate the variability in the in-sample population, the random effects component should be added manually.***

In [None]:
#Estimate the trajectory of each in-sample participant on a common grid of days
#(fixed effects + participant-specific random effects), then average over Season.
##Note: month_threshold and nknots are the settings defined in the preparation cell (6-2-2)
tempD1 = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
tempD2 = {'Sex-mixed '+yvar+' model':scaler_BS, 'Sex-stratified '+yvar+' model':scaler_FM}
tempD3 = {'Sex-mixed '+yvar+' model':LMM5_BS, 'Sex-stratified '+yvar+' model':LMM5_FM}
tempD4 = {'Sex-mixed '+yvar+' model':LMM6_BS, 'Sex-stratified '+yvar+' model':LMM6_FM}
tempD3 = {'Normal':tempD3, 'Obese':tempD4}
tempD4 = {}
for model_sex in tempD1.keys():
    #Prepare DF to impute days_in_program but maintain in-sample population
    ##One row per participant x grid day (every knot) x Season
    tempDF = tempD1[model_sex]
    tempA1 = tempDF['public_client_id'].unique()
    tempA2 = np.arange(0, 365.25/12*month_threshold + 1, 365.25/12*month_threshold/nknots)
    tempL = ['Spring', 'Summer', 'Autumn', 'Winter']
    tempDF = pd.DataFrame(data={'public_client_id':np.repeat(tempA1, len(tempA2)*len(tempL)),
                                'days_in_program':np.tile(np.repeat(tempA2, len(tempL)), len(tempA1)),
                                'Season':np.tile(tempL, len(tempA1)*len(tempA2))})
    ##Add the covariates unique to the in-sample individuals
    tempDF1 = tempD1[model_sex]
    tempL = ['public_client_id', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
             'BaseBMI_class', 'vs_Base'+vs_bbmi+'_class']
    tempDF1 = tempDF1[tempL]
    tempDF1 = tempDF1.drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##Add dummy variables for the linear regression spline for time
    ##-> DaysP2 = days elapsed after the 1st knot (0 before it); DaysP3 = days elapsed after the 2nd knot
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempS = tempDF['days_in_program']
    if not (tempS <= tempL[2]).all():
        ###Fail loudly instead of writing error strings into the numeric columns
        raise ValueError('days_in_program is missing or beyond the last knot. Check time span setting.')
    tempDF['DaysP2'] = np.maximum(tempS - tempL[0], 0.0)
    tempDF['DaysP3'] = np.maximum(tempS - tempL[1], 0.0)
    ##Add a dummy index to use when merging later
    ##-> Season is part of the key: without it, the 4 Season rows of a participant-day share one label
    ##   and every index-on-index merge below becomes many-to-many (4 -> 16 -> 256 rows per label)
    tempDF['KeyIndex'] = (tempDF['public_client_id'] + '_day' + tempDF['days_in_program'].astype(str)
                          + '_' + tempDF['Season'])
    tempDF = tempDF.set_index('KeyIndex')
    ##Standardize numeric independent variables using the already fitted StandardScaler
    scaler = tempD2[model_sex]
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[tempL]), index=tempDF.index,
                           columns=['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z'])
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')
    
    #Model 5-6 prediction
    for model_i, bmi_class in enumerate(tempD3.keys(), start=5):
        tempD = tempD3[bmi_class]
        fit_res = tempD[model_sex]
        ##Prepare the target class (copy: a prediction column is added below)
        tempDF1 = tempDF.loc[tempDF['BaseBMI_class']==bmi_class].copy()
        ##Predicted component from the fixed effects
        tempS = fit_res.predict(tempDF1)#Automatically recognize the column name in formula
        ##Create design matrix for the random effects (columns: intercept, slope for DaysZ)
        tempA = np.column_stack([np.ones(len(tempDF1)), tempDF1['DaysZ'].to_numpy()])
        tempD = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
        ##Predicted component from the random effects
        tempL = [np.dot(tempA[row_i], tempD[group_n])
                 for (row_i, group_n) in enumerate(tempDF1['public_client_id'].tolist())]
        ##Total prediction for in-sample individual
        tempS = tempS + tempL
        ##Convert to original scale
        tempDF1[yvar+'_predicted'+str(model_i)] = np.e**tempS
        ##Add the predictions to the working DF
        tempDF = pd.merge(tempDF, tempDF1[yvar+'_predicted'+str(model_i)],
                          left_index=True, right_index=True, how='outer')#NaN for out-of-class
    
    #Flatten effects of the dummy season variable
    tempDF = tempDF.groupby(by=['public_client_id', 'days_in_program'])
    tempDF = tempDF.agg({yvar+'_predicted5':'mean',
                         yvar+'_predicted6':'mean'})
    tempDF = tempDF.reset_index()
    ##Recover the dropped covariates and save the prediction DF to use later
    tempDF1 = tempD1[model_sex]
    tempL = tempDF1.loc[:, tempDF1.columns.str.contains('BMI_class')].columns.tolist()
    tempL = [col_n for sublist in [['public_client_id', 'Sex'], tempL] for col_n in sublist]
    tempDF1 = tempDF1[tempL].drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##One row per participant-day from here, so this key is unique
    tempDF['KeyIndex'] = tempDF['public_client_id'] + '_day' + tempDF['days_in_program'].astype(str)
    tempDF = tempDF.set_index('KeyIndex')
    
    print(model_sex)
    display(tempDF)
    display(tempDF.describe(include='all'))
    
    tempD4[model_sex] = tempDF

metLMM_BS = tempD4['Sex-mixed '+yvar+' model']
metLMM_FM = tempD4['Sex-stratified '+yvar+' model']

In [None]:
#Check visually
##-> Observed measurements (points) against the LMM estimates (mean lines with 95% CI),
##   split by the Matched/Mismatched label, for each baseline BMI class-stratified model
observed_frames = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
predicted_frames = {'Sex-mixed '+yvar+' model':metLMM_BS, 'Sex-stratified '+yvar+' model':metLMM_FM}
class_by_model = {'Model 5':'Normal', 'Model 6':'Obese'}
match_palette = {'Matched':'tab:blue', 'Mismatched':'tab:orange'}
hue_col = 'vs_Base'+vs_bbmi+'_class'
x_max = 365.25/12*month_threshold#End of the target period [days]
for model_sex, obsDF in observed_frames.items():
    print(model_sex)
    predDF = predicted_frames[model_sex]
    
    #Plot
    for model_i, (model_n, bmi_class) in enumerate(class_by_model.items(), start=5):
        print(' - '+model_n+': Baseline '+bmi_class+' class-stratified LMM')
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(5, 3))
        sns.scatterplot(data=obsDF[obsDF['BaseBMI_class']==bmi_class],
                        x='days_in_program', y=yvar,
                        hue=hue_col, hue_order=match_palette.keys(), palette=match_palette,
                        edgecolor='k', s=20, alpha=0.3)
        ax = sns.lineplot(data=predDF[predDF['BaseBMI_class']==bmi_class],
                          x='days_in_program', y=yvar+'_predicted'+str(model_i),
                          hue=hue_col, hue_order=match_palette.keys(), palette=match_palette,
                          estimator='mean', ci=95)
        ax.set(xlim=(0, x_max), xticks=np.arange(0, x_max, 100),
               ylim=(17.5, 42.5), yticks=np.arange(20, 40.1, 5))
        sns.despine()
        plt.xlabel('Days in program')
        plt.ylabel(yvar+' [kg/m'+r'$^2$'+']')
        plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
        plt.show()
        print('')
    
    print('')

### 6-3. Measured BMI

> Repeat the above code, while adding the misclassification info. Because the misclassification info depends on whether the bBMI model is sex-mixed or sex-stratified, the analysis needs to be divided by case.  

In [None]:
#Dependent variable of this subsection; used to build dictionary keys, column names ('log_'+yvar) and labels
yvar = 'BMI'
#Color of this variable in the diagnostic plots of this subsection
yvar_color = 'k'

#### 6-3-1. Prepare the time-series DF

In [None]:
#Load the time-series measured BMI and attach the misclassification table of each bBMI model
misclass_frames = {'Sex-mixed '+yvar+' model':misclassDF_BS, 'Sex-stratified '+yvar+' model':misclassDF_FM}
bmi_path = ('../210104_Biological-BMI-paper/ExportData/'
            + '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
            + 'time-series-bmiDF-without-imputation_final-cohort.tsv')
ts_by_model = {}
for model_sex, misclassDF in misclass_frames.items():
    #Import time-series BMI (read again per model so that each model gets its own DF)
    ##Change column name at the same time
    bmiDF = (pd.read_csv(bmi_path, sep='\t', dtype={'public_client_id': str})
             .set_index('KeyIndex')
             .rename(columns={'BMI_CALC':'BMI'}))
    
    #Recover baseline BMI on the original scale from its log-scale column
    bmiDF['BaseBMI'] = np.e**bmiDF['log_BaseBMI']
    bmiDF = bmiDF.drop(columns=['log_BaseBMI'])
    ##-> The obesity class is not re-calculated here; it comes with the misclassification DF below
    
    #Add the misclassification (all of its columns, matched on participant ID)
    bmiDF = pd.merge(bmiDF, misclassDF, left_on='public_client_id', right_index=True, how='left')
    ts_by_model[model_sex] = bmiDF
    
    print(model_sex)
    display(bmiDF)
    print(' -> Unique ID:', len(bmiDF['public_client_id'].unique()))
    print('')

tsDF_BS = ts_by_model['Sex-mixed '+yvar+' model']
tsDF_FM = ts_by_model['Sex-stratified '+yvar+' model']

#### 6-3-2. Prepare DFs for LMM with the linear regression spline for time

In [None]:
tempD = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
tempD1 = {}
tempD2 = {}
for model_sex in tempD.keys():
    print(model_sex)
    tempDF = tempD[model_sex]
    print('Time-series DF nrows before filtering:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select measurements for the longitudinal analysis
    month_threshold = 18
    tempDF = tempDF.loc[tempDF['days_in_program'] <= 365.25/12*month_threshold]
    print('Time-series DF nrows after filtering with the taget period:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the cohort for the longitudinal analysis
    ##Select participants who have 2 or more measuremnts in BMI
    fileDir = '../210104_Biological-BMI-paper/ExportData/'
    ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
    fileName = 'time-series-bmiDF-without-imputation_final-cohort.tsv'
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= 365.25/12*month_threshold]
    tempS = tempDF1['public_client_id'].value_counts()
    tempL1 = tempS.loc[tempS>=2].index.tolist()
    ##Select participants who have 2 or more measuremnts in omics
    fileDir = '../210104_Biological-BMI-paper/ExportData/'
    ipynbName = '210104_Biological-BMI-paper_data-cleaning-BMI-omics_'
    fileName = 'time-series-combiDF-without-imputation_final-cohort.tsv'
    tempDF1 = pd.read_csv(fileDir+ipynbName+fileName, sep='\t', dtype={'public_client_id': str})
    tempDF1 = tempDF1.set_index('KeyIndex')
    tempDF1 = tempDF1.loc[tempDF1['days_in_program'] <= 365.25/12*month_threshold]
    tempS = tempDF1['public_client_id'].value_counts()
    tempL2 = tempS.loc[tempS>=2].index.tolist()
    ##Select participants who have 2 or more measuremnts in both BMI and omics
    tempL = list(set(tempL1) & set(tempL2))
    print('Participants who have 2 or more measurements in both BMI and omics:', len(tempL))
    tempDF = tempDF.loc[tempDF['public_client_id'].isin(tempL)]
    print('Time-series DF nrows after filtering with the number of measurements:',
          len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Select the participants who have all covariates (just in case)
    tempDF = tempDF.dropna()
    print('Time-series DF nrows after filtering with covariates:', len(tempDF))
    print(' -> Unique ID:', len(tempDF['public_client_id'].unique()))
    
    #Check measurement distribution
    sns.set(style='ticks', font='Arial', context='talk')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['days_in_program'], color=yvar_color)
    sns.despine()
    for knot in [0, 6, 12, 18]:
        plt.axvline(x=365.25/12*knot, **{'linestyle':'--', 'color':'gray'})
    plt.axvspan(xmin=365.25/12*18, xmax=plt.gca().get_xlim()[1], facecolor='gray', alpha=0.2)
    plt.ylabel('Density')
    plt.xlabel('Days in program')
    plt.show()
    
    #Check dependent variable distribution
    print('Skewness of the dependent variable:', stats.skew(tempDF['log_'+yvar]))
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    sns.distplot(tempDF['log_'+yvar], color=yvar_color)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(yvar+' (log-scale)')
    plt.show()
    
    #Add dummy variables for the linear regression spline for time
    nknots = 3
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempL1 = []#2nd piece
    tempL2 = []#3rd piece
    for days in tempDF['days_in_program'].tolist():
        if days < tempL[0]:#[0 month, 6 months)
            tempL1.append(0.0)
            tempL2.append(0.0)
        elif days < tempL[1]:#[6 months, 12 months)
            tempL1.append(days - tempL[0])
            tempL2.append(0.0)
        elif days <= tempL[2]:#[12 months, 18 months]
            tempL1.append(days - tempL[0])
            tempL2.append(days - tempL[1])
        else:
            tempL1.append('Error? Check time span setting.')
            tempL2.append('Error? Check time span setting.')
    tempDF['DaysP2'] = tempL1
    tempDF['DaysP3'] = tempL2
    
    #Standardize numeric independent variables
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    scaler.fit(tempDF[tempL])
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[tempL]), index=tempDF.index,
                           columns=['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z'])
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner')
    
    #Confirm examples
    sns.set(style='ticks', font='Arial', context='notebook')
    plt.figure(figsize=(4, 3))
    for variable in ['DaysZ', 'BaseAgeZ', 'PC1Z']:
        sns.distplot(tempDF[variable], label=variable)
    sns.despine()
    plt.ylabel('Density')
    plt.xlabel(r'$Z$'+'-score')
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempS = (pd.Series(tempL) - scaler.mean_[0]) / scaler.scale_[0]#Z-score transformation
    for knot in tempS.tolist():
        plt.axvline(x=knot, **{'linestyle':'--', 'color':'gray'})
    plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
    plt.show()
    
    #Sort to make 'Matched' the standard (bcoef = 0) in LMM (Matched < Mismatched)
    tempDF = tempDF.sort_values(by=['vs_Base'+vs_bbmi+'_class'], ascending=True)
    
    display(tempDF)
    print('')
    
    tempD1[model_sex] = tempDF
    tempD2[model_sex] = scaler#Use later again during predictions

#Update: expose the per-model working DF and its fitted scaler under fixed names
key_mixed = 'Sex-mixed '+yvar+' model'
key_stratified = 'Sex-stratified '+yvar+' model'
tsDF_BS, scaler_BS = tempD1[key_mixed], tempD2[key_mixed]
tsDF_FM, scaler_FM = tempD1[key_stratified], tempD2[key_stratified]

#### 6-3-3. LMM with random intercepts and random slopes for time

> Days in the program is fitted as a numeric variable using linear regression splines (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows differences in the trajectory of BMI/bBMI change.***
>
> Random intercepts and random slopes for days in the program are used as random effects (cf. Zubair, N. et al. Sci. Rep. 2019).  
> ***–> Allows individual differences in the baseline and longitudinal change over the program.***  
> –> The random slope is applied only to the overall time variable, not to the piecewise time variables; otherwise, fitting fails with "LinAlgError: Singular matrix" because most individuals have too few observations. Also, it is reasonable to assume that each individual's random slope is constant throughout the program.  
>
> ***Eliminate "Underweight" class in the baseline obesity class-stratified models.***  
> <– Otherwise, fitting fails with "LinAlgError: Singular matrix" because this class contains too few participants.  

In [None]:
#Working DF per sex-handling variant, and the baseline BMI class that each LMM is fitted on
ts_by_variant = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
lmm_labels = {'LMM5':'Normal class LMM',
              'LMM6':'Obese class LMM'}
lmm_classes = {'LMM5':'Normal',
               'LMM6':'Obese'}

#Fixed effects: spline terms for time, covariates, and the misclassification term with its time interactions
mis_term = 'C(vs_Base'+vs_bbmi+'_class)'
fe_terms = ['DaysZ', 'DaysP2Z', 'DaysP3Z',
            'C(Sex)', 'BaseAgeZ', 'C(Season)', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z',
            mis_term, 'DaysZ:'+mis_term,
            'DaysP2Z:'+mis_term, 'DaysP3Z:'+mis_term]
fe_formula = 'log_'+yvar+' ~ '+' + '.join(fe_terms)

fitted_lmms = {}
for model_sex, ts_df in ts_by_variant.items():
    print(model_sex)
    for model_n, lmm_label in lmm_labels.items():
        print(' - '+lmm_label)
        #Restrict to the participants of the target baseline BMI class
        class_df = ts_df.loc[ts_df['BaseBMI_class']==lmm_classes[model_n]]
        #Fit LMM (random intercepts for each group are included as default; 'DaysZ' adds the random slope)
        model = sm.MixedLM.from_formula(formula=fe_formula, data=class_df,
                                        groups='public_client_id',
                                        re_formula='DaysZ')
        t_start = time.time()
        #fit_res = model.fit(method=['bfgs', 'lbfgs', 'cg', 'powell', 'nm'])#Back up for failure of convergence
        fit_res = model.fit(method=['powell'])#After checking convergence for all models, fix the method through the notebook
        elapsed_min, elapsed_sec = divmod(time.time() - t_start, 60)
        print('    - Elapsed time for fitting LMM:', round(elapsed_min), 'min', round(elapsed_sec, 1), 'sec')
        
        fitted_lmms[model_sex+' - '+model_n] = fit_res
        
        #Check the fitted models
        display(fit_res.summary())
        print('(Note that the estimates for fixed effects and the variances for random effects are shown in the single table.)')
        ##Collect estimates, 95% CI bounds, and P-values into one table (one row per parameter)
        ci_df = fit_res.conf_int(alpha=0.05)
        coef_df = pd.DataFrame({'Coef':fit_res.params,
                                'Coef_ci_l':ci_df.iloc[:, 0],
                                'Coef_ci_h':ci_df.iloc[:, 1],
                                'Pval':fit_res.pvalues})
        coef_df['Coef_ci'] = (coef_df['Coef_ci_h'] - coef_df['Coef_ci_l'])/2#Half width, used as the error bar
        coef_df = coef_df.reset_index()
        ##Visualize b-coefs and g-coefs
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(4, 0.25*len(coef_df)))
        plt.errorbar(x=coef_df['Coef'], y=coef_df['index'], xerr=coef_df['Coef_ci'],
                     fmt='ok', ecolor='k', capsize=5)
        plt.xlim(-0.1, 0.1)#Clip
        sns.despine()
        plt.xlabel(r'$\beta$ or $\gamma$'+' coefficient in LMM model\n(Mean with 95% CI)')
        plt.ylabel('')
        plt.axvline(x=0, linestyle='--', color='k')
        ##Shade the rows whose P-value is below 0.05
        for row_i, pval in enumerate(coef_df['Pval'].tolist()):
            if pval<0.05:
                plt.axhspan(ymin=row_i-0.5, ymax=row_i+0.5, facecolor=yvar_color, alpha=0.2, zorder=0)
        plt.gca().invert_yaxis()
        plt.margins(y=0.01, tight=True)
        plt.show()
        print('')
    print('')

#Keep the fitted results under fixed names for the prediction step
LMM5_BS = fitted_lmms['Sex-mixed '+yvar+' model - LMM5']
LMM6_BS = fitted_lmms['Sex-mixed '+yvar+' model - LMM6']
LMM5_FM = fitted_lmms['Sex-stratified '+yvar+' model - LMM5']
LMM6_FM = fitted_lmms['Sex-stratified '+yvar+' model - LMM6']

#### 6-3-4. Estimated transition of biological BMI

> The days in program of the fitted values were different between individuals.  
> –> In statsmodels, the MixedLMResults.predict() method uses only the fixed-effects parameters for prediction.  
> ***–> To evaluate the variability in the in-sample population, the random-effects component should be added manually.***

In [None]:
#Inputs per sex-handling variant: working DF, fitted StandardScaler, and fitted LMMs keyed by baseline BMI class
tempD1 = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
tempD2 = {'Sex-mixed '+yvar+' model':scaler_BS, 'Sex-stratified '+yvar+' model':scaler_FM}
tempD3 = {'Sex-mixed '+yvar+' model':LMM5_BS, 'Sex-stratified '+yvar+' model':LMM5_FM}
tempD4 = {'Sex-mixed '+yvar+' model':LMM6_BS, 'Sex-stratified '+yvar+' model':LMM6_FM}
tempD3 = {'Normal':tempD3, 'Obese':tempD4}
tempD4 = {}
for model_sex in tempD1.keys():
    #Prepare DF to impute days_in_program but maintain in-sample population
    ##One row per (participant, time point, season); time points are day 0 and every knot
    tempDF = tempD1[model_sex]
    tempA1 = tempDF['public_client_id'].unique()
    tempA2 = np.arange(0, 365.25/12*month_threshold + 1, 365.25/12*month_threshold/nknots)
    tempL = ['Spring', 'Summer', 'Autumn', 'Winter']
    tempDF = pd.DataFrame(data={'public_client_id':np.repeat(tempA1, len(tempA2)*len(tempL)),
                                'days_in_program':np.tile(np.repeat(tempA2, len(tempL)), len(tempA1)),
                                'Season':np.tile(tempL, len(tempA1)*len(tempA2))})
    ##Add the covariates unique to the in-sample individuals
    tempDF1 = tempD1[model_sex]
    tempL = ['public_client_id', 'Sex', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
             'BaseBMI_class', 'vs_Base'+vs_bbmi+'_class']
    tempDF1 = tempDF1[tempL]
    tempDF1 = tempDF1.drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##Add dummy variables for the linear regression spline for time
    ##-> DaysP2 = days elapsed since the 1st knot, DaysP3 = days elapsed since the 2nd knot (0 before the knot)
    tempL = [365.25/12 * month_threshold/nknots * (knot+1) for knot in range(nknots)]#Days of knots
    tempA = tempDF['days_in_program'].to_numpy()
    if (tempA > tempL[2]).any():
        #Fail here with a clear message rather than storing a string in a numeric column
        raise ValueError('Error? Check time span setting.')
    tempDF['DaysP2'] = np.clip(tempA - tempL[0], 0.0, None)#2nd piecewise
    tempDF['DaysP3'] = np.clip(tempA - tempL[1], 0.0, None)#3rd piecewise
    ##NOTE: Keep the default (unique) RangeIndex until the season dimension is aggregated out.
    ##-> An index of participant + day is shared by the four season rows, so the index-on-index
    ##   merges below would become many-to-many (4 -> 16 -> 256 rows per participant-day) and
    ##   would pair each season's prediction with every other season's row.
    ##Standardize numeric independent variables using the already fitted StandardScaler
    scaler = tempD2[model_sex]
    tempL = ['days_in_program', 'DaysP2', 'DaysP3', 'BaseAge', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    tempDF1 = pd.DataFrame(scaler.transform(tempDF[tempL]), index=tempDF.index,
                           columns=['DaysZ', 'DaysP2Z', 'DaysP3Z', 'BaseAgeZ', 'PC1Z', 'PC2Z', 'PC3Z', 'PC4Z', 'PC5Z'])
    tempDF = pd.merge(tempDF, tempDF1, left_index=True, right_index=True, how='inner',
                      validate='one_to_one')
    
    #Model 5-6 prediction
    for model_i, bmi_class in enumerate(tempD3.keys(), start=5):
        tempD = tempD3[bmi_class]
        fit_res = tempD[model_sex]
        ##Prepare the target class (copy, because a prediction column is added below)
        tempDF1 = tempDF.loc[tempDF['BaseBMI_class']==bmi_class].copy()
        ##Predicted component from the fixed effects
        tempS = fit_res.predict(tempDF1)#Automatically recognize the column name in formula
        ##Predicted component from the random effects: [1, DaysZ] . [random intercept, random slope]
        tempD = fit_res.random_effects#Dictionary of key=group label and value=pd.Series
        tempL = [np.dot([1.0, days_z], tempD[group_n])
                 for (group_n, days_z) in zip(tempDF1['public_client_id'].tolist(),
                                              tempDF1['DaysZ'].tolist())]
        ##Total prediction for in-sample individual
        tempS = tempS + tempL
        ##Convert to original scale (the LMM was fitted on the natural-log scale)
        tempDF1[yvar+'_predicted'+str(model_i)] = np.e**tempS
        ##Add the predictions to the working DF
        tempDF = pd.merge(tempDF, tempDF1[yvar+'_predicted'+str(model_i)],
                          left_index=True, right_index=True, how='outer',
                          validate='one_to_one')#NaN for out-of-class
    
    #Flatten effects of the dummy season variable (mean over the four seasons)
    tempDF = tempDF.groupby(by=['public_client_id', 'days_in_program'])
    tempDF = tempDF.agg({yvar+'_predicted5':'mean',
                         yvar+'_predicted6':'mean'})
    tempDF = tempDF.reset_index()
    ##Recover the dropped covariates and save the prediction DF to use later
    tempDF1 = tempD1[model_sex]
    tempL = tempDF1.loc[:, tempDF1.columns.str.contains('BMI_class')].columns.tolist()
    tempL = [col_n for sublist in [['public_client_id', 'Sex'], tempL] for col_n in sublist]
    tempDF1 = tempDF1[tempL].drop_duplicates('public_client_id', keep='first')
    tempDF = pd.merge(tempDF, tempDF1, on='public_client_id', how='left')
    ##Add a key index (unique per participant and day) to use when merging later
    tempDF['KeyIndex'] = tempDF['public_client_id'] + '_day' + tempDF['days_in_program'].astype(str)
    tempDF = tempDF.set_index('KeyIndex')
    
    print(model_sex)
    display(tempDF)
    display(tempDF.describe(include='all'))
    
    tempD4[model_sex] = tempDF

measLMM_BS = tempD4['Sex-mixed '+yvar+' model']
measLMM_FM = tempD4['Sex-stratified '+yvar+' model']

In [None]:
#Check visually: observed values (points) against the LMM-estimated trajectories (lines)
observed_by_variant = {'Sex-mixed '+yvar+' model':tsDF_BS, 'Sex-stratified '+yvar+' model':tsDF_FM}
predicted_by_variant = {'Sex-mixed '+yvar+' model':measLMM_BS, 'Sex-stratified '+yvar+' model':measLMM_FM}
class_by_model = {'Model 5':'Normal', 'Model 6':'Obese'}
mis_palette = {'Matched':'tab:blue', 'Mismatched':'tab:orange'}
mis_column = 'vs_Base'+vs_bbmi+'_class'
last_day = 365.25/12*month_threshold
for model_sex, observed_df in observed_by_variant.items():
    print(model_sex)
    predicted_df = predicted_by_variant[model_sex]
    
    #Plot
    for model_i, (model_n, bmi_class) in enumerate(class_by_model.items(), start=5):
        print(' - '+model_n+': Baseline '+bmi_class+' class-stratified LMM')
        sns.set(style='ticks', font='Arial', context='notebook')
        plt.figure(figsize=(5, 3))
        ##Observed measurements of the target baseline class
        sns.scatterplot(data=observed_df.loc[observed_df['BaseBMI_class']==bmi_class],
                        x='days_in_program', y=yvar,
                        hue=mis_column, hue_order=mis_palette.keys(), palette=mis_palette,
                        edgecolor='k', s=20, alpha=0.3)
        ##Mean (with 95% CI) of the per-participant predictions from the corresponding model
        p = sns.lineplot(data=predicted_df.loc[predicted_df['BaseBMI_class']==bmi_class],
                         x='days_in_program', y=yvar+'_predicted'+str(model_i),
                         hue=mis_column, hue_order=mis_palette.keys(), palette=mis_palette,
                         estimator='mean', ci=95)
        p.set(xlim=(0, last_day), xticks=np.arange(0, last_day, 100),
              ylim=(17.5, 42.5), yticks=np.arange(20, 40.1, 5))
        sns.despine()
        plt.xlabel('Days in program')
        plt.ylabel(yvar+' [kg/m'+r'$^2$'+']')
        plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', borderaxespad=1)
        plt.show()
        print('')
    
    print('')

### 6-4. Comparison of estimates

In [None]:
#For presentation
##Estimates to compare (key = label used in the prediction column names)
tempD1 = {'BMI':'k', 'MetBMI':'b'}
##Line color per misclassification group
tempD2 = {'Matched':'tab:blue', 'Mismatched':'tab:orange'}
##Prediction DFs per sex-handling variant: [measured BMI, MetBMI]
tempL1 = [measLMM_BS, metLMM_BS]
tempL2 = [measLMM_FM, metLMM_FM]
tempD3 = {'Sex-mixed bBMI model':tempL1, 'Sex-stratified bBMI model':tempL2}
tempD4 = {'Model 5':'Normal', 'Model 6':'Obese'}
##Tag used in the exported file name
tempD5 = {'Sex-mixed bBMI model':'BothSex', 'Sex-stratified bBMI model':'FemaleMale'}

for model_sex in tempD3.keys():
    print(model_sex)
    
    #Merge prediction DFs (only the columns missing in the first DF are taken from the second)
    tempL = tempD3[model_sex]
    tempDF1 = tempL[0]
    tempDF2 = tempL[1]
    tempDF2 = tempDF2.loc[:, ~tempDF2.columns.isin(tempDF1.columns.tolist())]
    tempDF = pd.merge(tempDF1, tempDF2, left_index=True, right_index=True, how='outer')
    
    #Calculate longitudinal change
    tempL = tempDF.loc[:, tempDF.columns.str.contains('_predicted')].columns.tolist()
    for col_n in tempL:
        tempDF1 = tempDF[['public_client_id', 'days_in_program', col_n]].dropna()
        #Retrieve the estimate at days = 0 for each participant
        tempDF2 = tempDF1.loc[tempDF1['days_in_program']==0.0]
        tempS = tempDF2.reset_index().set_index('public_client_id')[col_n]
        tempS.name = 'day0'
        #Merge
        tempDF1 = pd.merge(tempDF1, tempS, left_on='public_client_id', right_index=True, how='left')
        #Calculate longitudinal change [%]
        label = col_n.replace('_predicted', 'change_predicted')
        tempDF1[label] = (tempDF1[col_n] - tempDF1['day0']) / tempDF1['day0'] * 100
        #Add to the working DF
        tempDF = pd.merge(tempDF, tempDF1[label], left_index=True, right_index=True, how='left')
    
    #Model 5-6
    for model_i, model_n in enumerate(tempD4.keys(), start=5):
        bmi_class = tempD4[model_n]
        print(' - '+model_n+': Baseline '+bmi_class+' class-stratified LMM')
        #Prepare DF (long format: one row per participant, day, and estimate)
        tempL = tempDF.loc[:, tempDF.columns.str.contains('change_predicted'+str(model_i))].columns.tolist()
        tempDF1 = tempDF.reset_index().melt(var_name='LMM', value_name='PredictedChange', value_vars=tempL,
                                            id_vars=['public_client_id', 'days_in_program',
                                                     'BaseBMI_class', 'vs_Base'+vs_bbmi+'_class'])
        tempDF1 = tempDF1.dropna()
        ##Strip the column-name suffix so that 'LMM' holds only the estimate label
        tempDF1['LMM'] = tempDF1['LMM'].str.replace('change_predicted'+str(model_i), '', regex=False)
        #Check sample size
        ##NOTE: The loop variable must not be named 'yvar'; that would overwrite the notebook-level
        ##      'yvar' that the earlier cells use to build dictionary keys and column names.
        for estimate_n in tempD1.keys():
            tempDF2 = tempDF1.loc[tempDF1['LMM']==estimate_n]
            print('   - '+estimate_n+': n =', len(tempDF2['public_client_id'].unique()))
            tempL = []
            for misclass in tempD2.keys():
                tempA = tempDF2['public_client_id'].loc[tempDF2['vs_Base'+vs_bbmi+'_class']==misclass].unique()
                tempL.append(misclass+' (n = '+str(len(tempA))+')')
            print('     -> Confirm each sample size:', tempL)
        #Visualization
        sns.set(style='ticks', font='Arial', context='talk')
        p = sns.relplot(kind='line', data=tempDF1, x='days_in_program', y='PredictedChange',
                        row='LMM', row_order=tempD1.keys(),
                        hue='vs_Base'+vs_bbmi+'_class', hue_order=tempD2.keys(), palette=tempD2,
                        estimator='mean', ci=95, height=3.15, aspect=4.5/3-0.025, legend=False)#Manual adjustment
        ##Both classes share the same axis ranges (the first 12 months only)
        if bmi_class in ('Normal', 'Obese'):
            p.set(xlim=(0, 365.25/12*12), xticks=np.arange(0, 365.25/12*12+1, 90),
                  ylim=(-8.5, 1), yticks=np.arange(-8, 0.1, 2))
        tempL = ['LMM estimate: '+estimate for estimate in tempD1.keys()]
        for ax, ax_title in zip(p.axes.ravel(), tempL):
            #Draw initial line
            ax.axhline(y=0, linestyle='--', color='k', zorder=0)
            #Draw span b/w knots
            #for knot in range(nknots-1):
            #    if knot%2 == 1:
            #        ax.axvspan(xmin=365.25/12*month_threshold/nknots*knot,
            #                   xmax=365.25/12*month_threshold/nknots*(knot+1),
            #                   facecolor='gray', alpha=0.2, zorder=-1)
            #Change facet label
            ax.set_title(ax_title, {'fontsize':'large'})
        ##Reset and generate common axis title (x label only on the row with index 1)
        for row_i, ax in enumerate(p.axes.ravel()):
            if row_i == 1:
                ax.set_xlabel('Days in program')
                ax.set_ylabel('')
            else:
                ax.set_xlabel('')
                ax.set_ylabel('')
        p.fig.text(x=0, y=0.5, s='Estimated change [%]', fontsize='medium',
                   verticalalignment='center', horizontalalignment='left', rotation='vertical')
        plt.tight_layout()
        ##Generate standard Matplotlib legend manually
        tempL = []
        for misclass in tempD2.keys():
            tempL.append(mlines.Line2D([], [], color=tempD2[misclass], label=misclass, linewidth=3))
        plt.legend(handles=tempL, fontsize='medium',
                   title='Baseline BMI class\n(vs. '+vs_bbmi+' class)', title_fontsize='large',
                   bbox_to_anchor=(0.5, 0), loc='upper center', ncol=2, borderaxespad=4,
                   handlelength=1.5, handletextpad=0.5, columnspacing=1.0, frameon=True)
        ##Save (NOTE: presumably the export directory has to exist beforehand; confirm)
        fileDir = './ExportFigures/'
        ipynbName = '220806_Multiomics-BMI-NatMed1stRevision_BMI-longitudinal-LMM-ver2_'
        fileName = 'baseline'+bmi_class+'class(vs'+vs_bbmi+')-'+tempD5[model_sex]+'.tif'
        plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04,
                          pil_kwargs={'compression':'tiff_lzw'})
        plt.show()
        print('')
    print('')

# — End of this notebook —