# TwinsUK individual metabolite abundance analyses

Inputs: E1199_15122022_1_TwinsUK_Metab_Abundance.csv  
Outputs: Data organized into Figure 4; Supplementary File 4

Lines that save outputs (`to_csv`) are commented out.

In [1]:
import pandas as pd
from IPython.display import display
# get the library
%matplotlib inline
import matplotlib.pyplot as plt
# in case I am doing graphs already
import numpy as np
import scipy.stats as sp
import seaborn as sns
from matplotlib import pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.sandbox.stats.multicomp
from statsmodels.genmod.families import family, links
from analytics.util.analytics_logger import GetAnalyticsLogger
import logging
# Notebook-wide logger. Level is set to INFO because glm_analytes_apoe reports
# per-metabolite fit failures through logger.info.
# NOTE(review): assumes GetAnalyticsLogger returns a standard logging.Logger -- confirm.
logger = GetAnalyticsLogger()
logger.setLevel(logging.INFO)

In [2]:
# Load the TwinsUK metabolite abundance table and drop the "Unnamed: 0" column
# (presumably a row index written by an earlier to_csv -- TODO confirm).
df_to_use = pd.read_csv('/notebooks/0. APOE-Multiomics/Data_Files/E1199_15122022_1_TwinsUK_Metab_Abundance.csv').drop(columns=["Unnamed: 0"])

In [3]:
# Every column from position 13 onward is treated as a metabolite.
# NOTE(review): assumes the first 13 columns are sample metadata/covariates -- confirm
# against the CSV header if the input file layout changes.
metab_list = df_to_use.columns[13:].tolist()

In [4]:
# Standardize the metabolite columns with scipy.stats.zscore, omitting NaNs from
# the mean/SD computation. This overwrites the raw abundances in df_to_use in place.
df_to_use[metab_list] = sp.zscore(df_to_use[metab_list], nan_policy='omit')

In [5]:
# Work on a copy so later cells do not modify the standardized table.
df_analysis = df_to_use.copy()

# GLM of analytes

In [6]:
def _fdr_adjust_and_sort(df, pval_col, adj_col):
    """Add a Benjamini-Hochberg adjusted copy of ``pval_col`` as ``adj_col`` and sort by it."""
    # Rows without a p-value keep NaN in the adjusted column.
    df[adj_col] = np.nan

    if pval_col not in df.columns:
        # The term was not estimated in any fit (e.g. the genotype is absent from
        # this subset); leave the adjusted column empty instead of raising KeyError.
        logger.info('Column {} not present; {} left as NaN'.format(pval_col, adj_col))
        return df

    tested = df[pval_col].notna()
    if tested.any():
        _, adj_pval, _, _ = statsmodels.sandbox.stats.multicomp.multipletests(
            df.loc[tested, pval_col], alpha=0.05, method='fdr_bh')
        df.loc[tested, adj_col] = adj_pval

    return df.sort_values([adj_col], ascending=True)


def glm_analytes_apoe(metab_list, dat):
    """Fit one Gaussian GLM per metabolite against APOE status and covariates.

    For every column name in ``metab_list`` the model

        module ~ C(APOE_Status, Treatment(reference=1)) + Age + C(Sex) + BMI
                 + Statin_User + MetBatch

    is fitted on the rows of ``dat`` where that metabolite is not NaN (rows with
    missing covariates are dropped by ``missing='drop'``).

    Parameters
    ----------
    metab_list : iterable of str
        Metabolite column names in ``dat`` to use as the response, one at a time.
    dat : pandas.DataFrame
        Must contain each metabolite column plus 'Age', 'Sex', 'BMI',
        'APOE_Status', 'Statin_User' and 'MetBatch'.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted metabolite with columns 'col1'
        (metabolite name), 'n' (number of fitted values), 'converged', one column
        per model coefficient, one '<coefficient>_p' column per p-value, and the
        Benjamini-Hochberg adjusted APOE p-values 'E2_pval_adj' / 'E4_pval_adj'.
        Rows are sorted by 'E2_pval_adj' and then re-sorted by 'E4_pval_adj'.

    Raises
    ------
    ValueError
        If no metabolite could be fitted (empty ``metab_list`` or every fit failed).
    KeyError
        If a metabolite or covariate column is missing from ``dat``.

    Notes
    -----
    A metabolite whose fit raises is logged and skipped; it does not abort the run.
    """
    covariates = ['Age', 'Sex', 'BMI', 'APOE_Status', 'Statin_User', 'MetBatch']
    apoe_term = 'C(APOE_Status, Treatment(reference=1))'
    # NOTE(review): reference=1 appears to pick the reference level by position;
    # the displayed [T.E2]/[T.E4] columns imply E3 is the reference when all three
    # genotypes are present. In a subset lacking a genotype the reference may
    # shift -- confirm against the patsy Treatment documentation.
    formula = 'module ~ ' + apoe_term + ' + Age + C(Sex) + BMI + Statin_User + MetBatch'

    results = []
    skipped = 0  # must be initialised here; it is incremented in the except branch

    for col1 in metab_list:
        sub = (dat[[col1] + covariates]
               .dropna(subset=[col1])
               .rename(columns={col1: 'module'}))

        try:
            # Gaussian family uses the identity link by default, so no explicit
            # link override is needed.
            fitted_model = smf.glm(formula, data=sub, family=family.Gaussian(),
                                   missing='drop').fit(maxiter=2000)
        except Exception as e:
            logger.info('Failed module {} with error {}'.format(col1, str(e)))
            skipped += 1
            continue

        # Key every value by its coefficient name so rows align by name even if
        # one fit ends up with a different set of design columns than another.
        row = {'col1': col1,
               'n': len(fitted_model.fittedvalues),
               'converged': fitted_model.converged}
        row.update(fitted_model.params.to_dict())
        row.update({str(name) + '_p': pval for name, pval in fitted_model.pvalues.items()})
        results.append(row)

    if skipped:
        logger.info('Skipped {} metabolite(s) whose model could not be fitted'.format(skipped))

    if not results:
        raise ValueError('glm_analytes_apoe: no metabolite could be fitted')

    df = pd.DataFrame(results)

    np.seterr(all='warn')
    # Adjust E2 first, then E4, sorting after each step; the final order is by
    # E4_pval_adj.
    for allele in ('E2', 'E4'):
        df = _fdr_adjust_and_sort(df,
                                  '{}[T.{}]_p'.format(apoe_term, allele),
                                  '{}_pval_adj'.format(allele))

    return df

In [7]:
# Full-cohort models: exclude participants whose APOE status is 'Not Considered'.
apoe_regress = glm_analytes_apoe(
    metab_list,
    df_analysis.loc[df_analysis['APOE_Status'].ne('Not Considered')],
)

In [8]:
# Display the full-cohort results (glm_analytes_apoe returns them sorted by E4_pval_adj).
apoe_regress

Unnamed: 0,col1,n,converged,Intercept,"C(APOE_Status, Treatment(reference=1))[T.E2]","C(APOE_Status, Treatment(reference=1))[T.E4]",C(Sex)[T.M],Statin_User[T.Yes],MetBatch[T.b2],MetBatch[T.b3],...,C(Sex)[T.M]_p,Statin_User[T.Yes]_p,MetBatch[T.b2]_p,MetBatch[T.b3]_p,MetBatch[T.b4]_p,MetBatch[T.b5]_p,Age_p,BMI_p,E2_pval_adj,E4_pval_adj
67,1-margaroyl-GPC (17:0),1656,True,-0.584891,-0.005457,0.165287,0.052663,0.015831,0.136707,0.018540,...,0.676128,0.906711,0.063394,0.800842,0.056212,0.818089,5.754193e-22,3.196638e-11,0.981830,0.300847
368,glycylphenylalanine,1656,True,-0.942430,0.046301,0.159027,-0.352317,-0.115157,-0.048625,0.106430,...,0.007053,0.411187,0.524402,0.162705,0.684708,0.031719,1.530133e-02,5.501338e-04,0.863358,0.300847
182,3-carboxy-4-methyl-5-propyl-2-furanpropanoate ...,1656,True,-1.536890,0.120558,0.155367,0.099976,0.270997,0.095628,0.221917,...,0.429767,0.045799,0.196047,0.002646,0.538985,0.624355,1.021357e-25,3.141606e-01,0.550651,0.300847
513,prolylproline,1656,True,-0.965791,0.074119,0.182194,-0.272186,-0.179239,-0.065436,0.057339,...,0.036830,0.199538,0.390244,0.450658,0.160881,0.441172,3.418189e-01,3.162584e-08,0.722863,0.300847
112,1-palmitoylglycerol (16:0),1656,True,-2.232033,0.011442,0.172588,0.145076,0.150123,-0.155452,-0.023420,...,0.253529,0.270213,0.036220,0.751882,0.227334,0.289942,2.075086e-17,2.663965e-11,0.958537,0.300847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,X - 24309,1656,True,-0.697453,-0.047948,-0.000589,-0.081536,0.015435,-0.032445,-0.084770,...,0.533982,0.912515,0.671813,0.267402,0.467563,0.090122,6.347427e-01,1.429694e-06,0.859219,0.999413
596,X - 10458,1656,True,-0.983037,0.041046,0.000371,-0.031539,0.107287,-0.144850,0.021773,...,0.808813,0.442455,0.057134,0.774501,0.556413,0.000033,1.508854e-04,8.330599e-03,0.871549,0.999413
301,cysteine sulfinic acid,1656,True,-0.889922,0.123525,-0.000509,0.028881,0.072805,-0.135022,-0.056026,...,0.826480,0.606077,0.079357,0.465758,0.328182,0.154767,1.323748e-01,3.318385e-06,0.561549,0.999413
634,X - 12216,1656,True,-0.175441,0.016715,0.000064,-0.096376,-0.024815,0.069304,0.008800,...,0.461473,0.859555,0.364646,0.908178,0.027158,0.511889,1.377292e-06,1.793956e-05,0.947911,0.999413


In [9]:
# apoe_regress.to_csv('250217_TwinsUK_Indiv.csv', index=True)

### Sex stratification

In [10]:
# Male-only models (Sex == 'M'), again excluding 'Not Considered' APOE status.
apoe_regress_M = glm_analytes_apoe(
    metab_list,
    df_analysis.loc[
        df_analysis['APOE_Status'].ne('Not Considered') & df_analysis['Sex'].eq('M')
    ],
)

In [11]:
# apoe_regress_M.to_csv('250217_TwinsUK_Indiv_MALE.csv', index=True)

In [12]:
# Female-only models (Sex == 'F'), again excluding 'Not Considered' APOE status.
apoe_regress_F = glm_analytes_apoe(
    metab_list,
    df_analysis.loc[
        df_analysis['APOE_Status'].ne('Not Considered') & df_analysis['Sex'].eq('F')
    ],
)

In [13]:
# apoe_regress_F.to_csv('250217_TwinsUK_Indiv_FEMALE.csv', index=True)

### CA tertile stratification

In [14]:
# Separate copy for the tertile analysis so the added CA_tertile column does not
# appear in df_analysis.
df_analysis_tert = df_analysis.copy()

In [15]:
# Split Age into three equal-frequency bins. The cut points are computed over
# every row of df_analysis_tert, i.e. before the 'Not Considered' APOE rows are
# filtered out for model fitting.
df_analysis_tert['CA_tertile'] = pd.qcut(
    df_analysis_tert['Age'],
    3,
    labels=['bottom', 'middle', 'top'],
)

In [16]:
# Sanity check: same tertile split without labels, to show the Age interval of
# each bin and how many rows fall into it.
pd.qcut(df_analysis_tert['Age'], q=3).value_counts()

(32.849000000000004, 47.72]    568
(55.05, 73.69]                 565
(47.72, 55.05]                 563
Name: Age, dtype: int64

In [17]:
# Bottom age tertile, excluding 'Not Considered' APOE status.
apoe_regress0 = glm_analytes_apoe(
    metab_list,
    df_analysis_tert.loc[
        df_analysis_tert['CA_tertile'].eq('bottom')
        & df_analysis_tert['APOE_Status'].ne('Not Considered')
    ],
)

In [18]:
# Middle age tertile, excluding 'Not Considered' APOE status.
apoe_regress1 = glm_analytes_apoe(
    metab_list,
    df_analysis_tert.loc[
        df_analysis_tert['CA_tertile'].eq('middle')
        & df_analysis_tert['APOE_Status'].ne('Not Considered')
    ],
)

In [19]:
# Top age tertile, excluding 'Not Considered' APOE status.
apoe_regress2 = glm_analytes_apoe(
    metab_list,
    df_analysis_tert.loc[
        df_analysis_tert['CA_tertile'].eq('top')
        & df_analysis_tert['APOE_Status'].ne('Not Considered')
    ],
)

In [20]:
# apoe_regress0.to_csv('250310_TwinsUK_bottomCAtert.csv')

In [21]:
# apoe_regress1.to_csv('250310_TwinsUK_midCAtert.csv')

In [22]:
# apoe_regress2.to_csv('250310_TwinsUK_topAtert.csv')