# Arivale individual metabolite abundance analyses

Inputs: Arivale_metabolomics_metadata.csv; Arivale_preprocecessed_indiv_metab_analysis_data.csv  
Outputs: Data organized into Figure 2; Supplementary Files 1 & 2; Supplementary Table 3

The lines that save results (`to_csv`) are commented out; uncomment them to write the output files  
'Healthy' and 'Unhealthy' are renamed as 'Bio_Young' and 'Bio_Old' in figures and data files

In [1]:
import pandas as pd
from IPython.display import display
# get the library
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sp
import seaborn as sns
from matplotlib import pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.sandbox.stats.multicomp
from statsmodels.genmod.families import family, links
from analytics.util.analytics_logger import GetAnalyticsLogger
import logging
logger = GetAnalyticsLogger()
logger.setLevel(logging.INFO)

In [2]:
# Metabolomics metadata table (not referenced again in the cells shown here).
met_met = pd.read_csv(
    '/notebooks/0. APOE-Multiomics/Data_Files/Arivale_metabolomics_metadata.csv'
)

In [3]:
# Analysis table: the analyte columns plus the covariates used by the GLMs below.
df = pd.read_csv(
    '/notebooks/0. APOE-Multiomics/Data_Files/Arivale_preprocecessed_indiv_metab_analysis_data.csv'
)

In [4]:
# Analyte names are every column except the first 13 and the last 7.
# NOTE(review): the 13 / -7 offsets depend on the exact column order of the
# input CSV (presumably ID/covariate columns on either side) — confirm if the file changes.
analytes_list = list(df.columns[13:-7])

In [5]:
# Assign CA tertiles: three equal-frequency bins of `age`, labelled low to high.
df['CA_tertile'] = pd.qcut(
    df['age'],
    q=3,
    labels=['bottom', 'middle', 'top'],
)

# GLMs of individual analytes on APOE status

In [6]:
def glm_analytes_apoe(analyte_list, dat):
    """Regress each analyte on APOE status and FDR-adjust the APOE p-values.

    For every column name in ``analyte_list`` a Gaussian GLM with identity
    link is fitted::

        analyte ~ APOE_Status + age + sex + BMI_CALC + meds_cholesterol + PC1 + PC2

    Rows with a missing analyte value are removed before fitting; rows with
    missing covariates are dropped by the model (``missing='drop'``). An
    analyte whose model raises is logged through the module-level ``logger``
    and skipped instead of aborting the whole run.

    Parameters
    ----------
    analyte_list : iterable of str
        Column names in ``dat``; each is used as the response of one model.
    dat : pandas.DataFrame
        Must contain every analyte column plus ``age``, ``sex``, ``BMI_CALC``,
        ``comb_del_age``, ``APOE_Status``, ``meds_cholesterol``, ``PC1`` and
        ``PC2``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted analyte with ``col1`` (analyte name),
        ``n`` (number of fitted values), ``converged``, one column per model
        coefficient, one ``<term>_p`` column per p-value, and the
        Benjamini-Hochberg adjusted ``E2_pval_adj`` and ``E4_pval_adj``.
        Rows are sorted ascending by ``E4_pval_adj``.

    Raises
    ------
    KeyError
        If ``dat`` lacks a required column, or no fitted model contains the
        E2 / E4 APOE term.
    ValueError
        If no analyte model could be fitted.
    """
    apoe_term = 'C(APOE_Status, Treatment(reference=1))'
    # NOTE(review): reference=1 picks the reference level by position, not by
    # name. The E2/E4 terms in the results imply a third level is the
    # reference; a subset lacking one APOE level would shift it — confirm.
    ols_model = 'module ~ C(APOE_Status, Treatment(reference=1)) + age + C(sex) + BMI_CALC + meds_cholesterol + PC1 + PC2' # + Education'

    # 'comb_del_age' is selected as before although the formula does not use it.
    covariates = ['age', 'sex', 'BMI_CALC', 'comb_del_age', 'APOE_Status',
                  'meds_cholesterol', 'PC1', 'PC2']  # 'meds_blood_sugar', 'meds_blood_pressure', 'PC3', 'PC4'

    # Gaussian family with identity link; built once because it does not
    # depend on the analyte.
    family_type = family.Gaussian()
    family_type.link = links.identity()

    rows = []
    skipped = 0  # was never initialised, so the first failed fit crashed the run

    for analyte in analyte_list:
        sub = (
            dat[[analyte, *covariates]]
            .dropna(subset=[analyte])
            .rename(columns={analyte: 'module'})
        )

        try:
            fitted_model = smf.glm(ols_model, data=sub, family=family_type, missing='drop').fit(maxiter=2000)
        except Exception as e:
            # Deliberate best-effort: record the failure and move on.
            logger.info('Failed module {} with error {}'.format(analyte, str(e)))
            skipped += 1
            continue

        # Key each value by its term name so rows stay aligned even if one
        # model ends up with a different set of terms than another.
        row = {
            'col1': analyte,
            'n': len(fitted_model.fittedvalues),
            'converged': fitted_model.converged,
        }
        row.update(fitted_model.params.to_dict())
        row.update({str(term) + '_p': p for term, p in fitted_model.pvalues.items()})
        rows.append(row)

    if skipped:
        logger.info('Skipped {} of {} analytes'.format(skipped, len(rows) + skipped))

    if not rows:
        raise ValueError('No analyte model could be fitted; nothing to summarise.')

    res = pd.DataFrame(rows)

    np.seterr(all='warn')
    # Benjamini-Hochberg correction across analytes, separately for each APOE
    # contrast. The sort after each step is kept in the original order (E2,
    # then E4), so the final ordering is by E4_pval_adj as before.
    for level, adj_col in (('E2', 'E2_pval_adj'), ('E4', 'E4_pval_adj')):
        p_col = '{}[T.{}]_p'.format(apoe_term, level)
        tested = res[p_col].notnull()
        res[adj_col] = np.nan
        if tested.any():
            (_, adj_pval, _, _) = statsmodels.sandbox.stats.multicomp.multipletests(
                res.loc[tested, p_col], alpha=0.05, method='fdr_bh')
            res.loc[tested, adj_col] = adj_pval
        res = res.sort_values([adj_col], ascending=True)

    return res

In [7]:
# APOE models on all participants except those labelled 'Not Considered'.
apoe_regress = glm_analytes_apoe(
    analytes_list,
    df.loc[df.APOE_Status != 'Not Considered'],
)

In [8]:
apoe_regress

Unnamed: 0,col1,n,converged,Intercept,"C(APOE_Status, Treatment(reference=1))[T.E2]","C(APOE_Status, Treatment(reference=1))[T.E4]",C(sex)[T.M],meds_cholesterol[T.Yes],age,BMI_CALC,...,"C(APOE_Status, Treatment(reference=1))[T.E2]_p","C(APOE_Status, Treatment(reference=1))[T.E4]_p",C(sex)[T.M]_p,meds_cholesterol[T.Yes]_p,age_p,BMI_CALC_p,PC1_p,PC2_p,E2_pval_adj,E4_pval_adj
853,oleoyl-linoleoyl-glycerol (18:1/18:2) [1],1900,True,-1.330949,0.208088,0.156428,0.361420,0.053366,0.004173,0.033108,...,0.003785,0.003220,3.305648e-15,4.433087e-01,4.473502e-02,8.247448e-22,2.616034e-04,0.000007,0.135646,0.514003
883,linoleoyl-arachidonoyl-glycerol (18:2/20:4) [1]*,1900,True,-1.291201,0.312449,0.150306,0.235085,0.354161,-0.002489,0.042869,...,0.000007,0.003445,1.186827e-07,1.463035e-07,2.160906e-01,9.784475e-38,4.646325e-02,0.009280,0.001576,0.514003
187,oleoyl-arachidonoyl-glycerol (18:1/20:4) [1]*,1900,True,-1.285031,0.303696,0.147611,0.241346,0.450185,0.003927,0.031020,...,0.000017,0.004723,8.925378e-08,4.934868e-11,5.487482e-02,6.319797e-20,6.240386e-02,0.107562,0.003129,0.514003
438,linoleoyl-linolenoyl-glycerol (18:2/18:3) [2]*,1900,True,-1.043684,0.105942,0.153395,0.232383,0.079216,0.000709,0.030677,...,0.145563,0.004345,5.696667e-07,2.612494e-01,7.361927e-01,1.653768e-18,4.954279e-02,0.006929,0.696624,0.514003
430,X - 17676,1900,True,-1.017227,0.010062,0.152939,0.111002,-0.017733,0.013312,0.010177,...,0.886579,0.003345,1.369686e-02,7.952364e-01,6.911767e-11,2.653565e-03,3.941463e-32,0.672505,0.959438,0.514003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"5alpha-pregnan-3beta,20alpha-diol monosulfate (2)",1900,True,1.713638,0.022573,-0.000407,-0.468392,-0.091049,-0.021740,-0.017307,...,0.747340,0.993733,1.158443e-25,1.797765e-01,7.794637e-27,2.672057e-07,1.869145e-03,0.529415,0.932356,0.995254
522,phosphoethanolamine,1900,True,0.831458,0.145188,0.000400,-0.143489,-0.036691,-0.015285,-0.001783,...,0.048413,0.994129,2.243985e-03,6.066053e-01,6.843496e-13,6.135528e-01,4.430832e-01,0.993223,0.510327,0.995254
808,gamma-glutamylthreonine,1900,True,-0.440375,0.095230,0.000679,0.220801,0.010297,0.005355,0.003020,...,0.198144,0.990091,2.949292e-06,8.857747e-01,1.237561e-02,3.952319e-01,9.106600e-05,0.057213,0.768559,0.995254
132,2-arachidonoyl-GPC (20:4)*,1900,True,0.314198,0.104034,-0.001305,0.313201,0.277178,0.002015,-0.020975,...,0.150117,0.980514,1.141082e-11,7.547755e-05,3.351994e-01,1.503611e-09,3.309403e-10,0.000438,0.696624,0.995254


In [9]:
# apoe_regress.to_csv('250217_Arivale_Indiv.csv')

### Sex-stratified analyses

In [10]:
# APOE models restricted to rows with sex == 'M'.
apoe_regress_M = glm_analytes_apoe(
    analytes_list,
    df.loc[(df.APOE_Status != 'Not Considered') & (df.sex == 'M')],
)

In [11]:
# apoe_regress_M.to_csv('250217_Arivale_Indiv_MALE.csv')

In [12]:
# APOE models restricted to rows with sex == 'F'.
apoe_regress_F = glm_analytes_apoe(
    analytes_list,
    df.loc[(df.APOE_Status != 'Not Considered') & (df.sex == 'F')],
)

In [13]:
# apoe_regress_F.to_csv('250217_Arivale_Indiv_FEMALE.csv')

### CA (age) tertile-stratified analyses

In [14]:
# APOE models within the bottom age tertile.
apoe_ca_bottom = glm_analytes_apoe(
    analytes_list,
    df.loc[(df.APOE_Status != 'Not Considered') & (df.CA_tertile == 'bottom')],
)

In [15]:
# apoe_ca_bottom.to_csv('250303_apoe_bottom_CA_tert.csv')

In [16]:
# APOE models within the middle age tertile.
apoe_ca_mid = glm_analytes_apoe(
    analytes_list,
    df.loc[(df.APOE_Status != 'Not Considered') & (df.CA_tertile == 'middle')],
)

In [17]:
# apoe_ca_mid.to_csv('250303_apoe_mid_CA_tert.csv')

In [18]:
# APOE models within the top age tertile.
apoe_ca_top = glm_analytes_apoe(
    analytes_list,
    df.loc[(df.APOE_Status != 'Not Considered') & (df.CA_tertile == 'top')],
)

In [19]:
# apoe_ca_top.to_csv('250303_apoe_top_CA_tert.csv')

# GLMs of individual analytes on delta-age status (`Model_Health_Indiv`)

In [20]:
def glm_analytes_health(analyte_list, dat):
    """Regress each analyte on ``Model_Health_Indiv`` and FDR-adjust its p-values.

    For every column name in ``analyte_list`` a Gaussian GLM with identity
    link is fitted::

        analyte ~ Model_Health_Indiv + age + sex + BMI_CALC + meds_cholesterol + PC1 + PC2

    Rows with a missing analyte value are removed before fitting; rows with
    missing covariates are dropped by the model (``missing='drop'``). An
    analyte whose model raises is logged through the module-level ``logger``
    and skipped instead of aborting the whole run.

    Parameters
    ----------
    analyte_list : iterable of str
        Column names in ``dat``; each is used as the response of one model.
    dat : pandas.DataFrame
        Must contain every analyte column plus ``age``, ``sex``, ``BMI_CALC``,
        ``Model_Health_Indiv``, ``APOE_Status``, ``meds_cholesterol``, ``PC1``
        and ``PC2``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted analyte with ``col1`` (analyte name),
        ``n`` (number of fitted values), ``converged``, one column per model
        coefficient, one ``<term>_p`` column per p-value, and the
        Benjamini-Hochberg adjusted ``Health_pval_adj`` and
        ``Unhealth_pval_adj``. Rows are sorted ascending by
        ``Unhealth_pval_adj``.

    Raises
    ------
    KeyError
        If ``dat`` lacks a required column, or no fitted model contains the
        Healthy / Unhealthy term.
    ValueError
        If no analyte model could be fitted.
    """
    health_term = 'C(Model_Health_Indiv, Treatment(reference=1))'
    # NOTE(review): reference=1 picks the reference level by position, not by
    # name. The Healthy/Unhealthy terms in the results imply a third level is
    # the reference; a subset lacking one level would shift it — confirm.
    ols_model = 'module ~ C(Model_Health_Indiv, Treatment(reference=1)) + age + C(sex) + BMI_CALC + meds_cholesterol + PC1 + PC2' # + Education'

    # 'APOE_Status' is selected as before although the formula does not use it.
    covariates = ['age', 'sex', 'BMI_CALC', 'Model_Health_Indiv', 'APOE_Status',
                  'meds_cholesterol', 'PC1', 'PC2']  # 'meds_blood_sugar', 'meds_blood_pressure', 'PC3', 'PC4'

    # Gaussian family with identity link; built once because it does not
    # depend on the analyte.
    family_type = family.Gaussian()
    family_type.link = links.identity()

    rows = []
    skipped = 0  # was never initialised, so the first failed fit crashed the run

    for analyte in analyte_list:
        sub = (
            dat[[analyte, *covariates]]
            .dropna(subset=[analyte])
            .rename(columns={analyte: 'module'})
        )

        try:
            fitted_model = smf.glm(ols_model, data=sub, family=family_type, missing='drop').fit(maxiter=2000)
        except Exception as e:
            # Deliberate best-effort: record the failure and move on.
            logger.info('Failed module {} with error {}'.format(analyte, str(e)))
            skipped += 1
            continue

        # Key each value by its term name so rows stay aligned even if one
        # model ends up with a different set of terms than another.
        row = {
            'col1': analyte,
            'n': len(fitted_model.fittedvalues),
            'converged': fitted_model.converged,
        }
        row.update(fitted_model.params.to_dict())
        row.update({str(term) + '_p': p for term, p in fitted_model.pvalues.items()})
        rows.append(row)

    if skipped:
        logger.info('Skipped {} of {} analytes'.format(skipped, len(rows) + skipped))

    if not rows:
        raise ValueError('No analyte model could be fitted; nothing to summarise.')

    res = pd.DataFrame(rows)

    np.seterr(all='warn')
    # Benjamini-Hochberg correction across analytes, separately for each
    # contrast. The sort after each step is kept in the original order
    # (Healthy, then Unhealthy), so the final ordering is by Unhealth_pval_adj.
    for level, adj_col in (('Healthy', 'Health_pval_adj'), ('Unhealthy', 'Unhealth_pval_adj')):
        p_col = '{}[T.{}]_p'.format(health_term, level)
        tested = res[p_col].notnull()
        res[adj_col] = np.nan
        if tested.any():
            (_, adj_pval, _, _) = statsmodels.sandbox.stats.multicomp.multipletests(
                res.loc[tested, p_col], alpha=0.05, method='fdr_bh')
            res.loc[tested, adj_col] = adj_pval
        res = res.sort_values([adj_col], ascending=True)

    return res

In [21]:
# Delta-age status models on the full table.
health_regress = glm_analytes_health(analyte_list=analytes_list, dat=df)

In [22]:
# health_regress.to_csv('250217_Arivale_Indiv_delage.csv')

In [23]:
health_regress

Unnamed: 0,col1,n,converged,Intercept,"C(Model_Health_Indiv, Treatment(reference=1))[T.Healthy]","C(Model_Health_Indiv, Treatment(reference=1))[T.Unhealthy]",C(sex)[T.M],meds_cholesterol[T.Yes],age,BMI_CALC,...,"C(Model_Health_Indiv, Treatment(reference=1))[T.Healthy]_p","C(Model_Health_Indiv, Treatment(reference=1))[T.Unhealthy]_p",C(sex)[T.M]_p,meds_cholesterol[T.Yes]_p,age_p,BMI_CALC_p,PC1_p,PC2_p,Health_pval_adj,Unhealth_pval_adj
513,"1,5-anhydroglucitol (1,5-AG)",1345,True,0.094830,0.055127,-0.414232,0.375206,-0.228885,-0.011008,0.014261,...,4.666617e-01,6.296873e-10,1.367850e-12,3.624153e-03,4.740442e-06,5.381716e-04,1.181645e-01,0.496102,0.746659,5.641998e-07
490,urea,1345,True,-1.077457,-0.362290,0.365527,0.601131,0.084171,0.021212,-0.006041,...,2.485296e-07,4.017847e-09,1.792769e-34,2.486330e-01,1.926724e-21,1.138797e-01,1.508258e-02,0.096478,0.000223,1.799996e-06
687,glucose,1345,True,-1.883151,-0.162458,0.308195,0.206218,0.212880,0.010990,0.043453,...,2.365248e-02,1.221067e-06,3.978981e-05,4.317329e-03,1.444195e-06,9.643442e-29,2.308836e-02,0.955657,0.203935,3.646920e-04
745,1-(1-enyl-stearoyl)-2-linoleoyl-GPE (P-18:0/18...,1345,True,0.501809,-0.288403,0.318762,0.014491,-0.366239,0.008416,-0.030233,...,1.864194e-04,3.033769e-06,7.882515e-01,4.934473e-06,5.974752e-04,6.039518e-13,3.141189e-07,0.959951,0.014492,6.795642e-04
252,X - 13835,1345,True,-0.649417,-0.245410,0.292233,0.554942,0.027714,0.007006,0.004258,...,1.134296e-03,1.180193e-05,6.361358e-26,7.234777e-01,3.441796e-03,2.993017e-01,4.616828e-03,0.259468,0.039090,1.299553e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,oleoyl-arachidonoyl-glycerol (18:1/20:4) [1]*,1345,True,-1.210070,0.136991,0.000771,0.226676,0.438050,0.001997,0.035144,...,7.470115e-02,9.909518e-01,2.457123e-05,4.116754e-08,4.133921e-01,4.322456e-17,1.257248e-01,0.101157,0.327090,9.953955e-01
40,1-(1-enyl-oleoyl)-GPE (P-18:1)*,1345,True,0.222903,-0.121563,0.000566,0.030965,-0.217938,-0.001300,-0.007568,...,1.098818e-01,9.932880e-01,5.602032e-01,5.800254e-03,5.904616e-01,6.734501e-02,3.192841e-02,0.778296,0.387696,9.966249e-01
191,octadecenedioate (C18:1-DC)*,1345,True,0.733826,0.031094,-0.000332,0.086863,-0.104437,0.004306,-0.034243,...,6.939730e-01,9.962086e-01,1.158703e-01,2.033518e-01,8.627748e-02,1.666081e-15,1.611336e-01,0.625471,0.861799,9.973335e-01
406,choline,1345,True,-1.763822,-0.032779,-0.000315,0.352726,0.177354,0.018985,0.025452,...,6.628056e-01,9.962204e-01,1.918363e-11,2.315240e-02,1.852612e-15,4.883462e-10,7.114257e-01,0.251608,0.854547,9.973335e-01


### Sex-stratified analyses

In [24]:
# Delta-age status models restricted to rows with sex == 'M'.
health_regress_M = glm_analytes_health(
    analytes_list,
    df.loc[df.sex == 'M'],
)

In [25]:
# health_regress_M.to_csv('250217_Arivale_Indiv_delage_MALE.csv')

In [26]:
# Delta-age status models restricted to rows with sex == 'F'.
health_regress_F = glm_analytes_health(
    analytes_list,
    df.loc[df.sex == 'F'],
)

In [27]:
# health_regress_F.to_csv('250217_Arivale_Indiv_delage_FEMALE.csv')

### CA (age) tertile-stratified analyses

In [28]:
# Delta-age status models within the bottom age tertile.
health_regress0 = glm_analytes_health(
    analytes_list,
    df.loc[df.CA_tertile == 'bottom'],
)

In [29]:
# health_regress0.to_csv('250303_verify_bottom_CA_tert.csv')

In [30]:
# Delta-age status models within the middle age tertile.
health_regress1 = glm_analytes_health(
    analytes_list,
    df.loc[df.CA_tertile == 'middle'],
)

In [31]:
# health_regress1.to_csv('250303_verify_middle_CA_tert.csv')

In [32]:
# Delta-age status models within the top age tertile.
health_regress2 = glm_analytes_health(
    analytes_list,
    df.loc[df.CA_tertile == 'top'],
)

In [33]:
# health_regress2.to_csv('250303_verify_top_CA_tert.csv')