# Investigating the impact of brain atrophy on survival in Alzheimer's disease

&emsp;
This notebook is divided into the following topics:
1) Predict survival probability using the Kaplan-Meier estimate for controls, patients with LMCI or AD 
2) Investigate features that impact the survival using the Cox Proportional-Hazards model
3) Investigate the cut-off of regional atrophy that determines low versus high survival in patients with AD
&emsp;

### 1) Predict survival probability using the Kaplan-Meier estimate for controls, patients with LMCI or AD 

In [None]:
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import os, sys
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from datetime import datetime
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, ConstantKernel as C
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the ADNIMERGE table and the withdrawal-reason table, then inner-join
# them on RID so only rows whose RID occurs in both tables are kept.
ADNIMERGE0 = pd.read_csv('ADNIMERGE.csv', low_memory=False)
WD_Reason = pd.read_csv('WD_Reason_TREATDIS.csv', low_memory=False)
ADNIMERGE1 = ADNIMERGE0.merge(WD_Reason, on='RID', how='inner')
ADNIMERGE1['AGE'].hist()

In [None]:
# Distribution of AGE over all rows of the merged table
ADNIMERGE1['AGE'].hist()

In [None]:
# Distinct participant IDs (PTID) found in the merged table
ADNIMERGE_subjs_list = ADNIMERGE1['PTID'].unique()

In [None]:
#For each subject, return the row index of the visit with the largest Years_bl (that subject's last entry)
def get_max_idx(subjs_list,data):
    """Return, for each subject, the row label of the visit with the largest Years_bl.

    Parameters
    ----------
    subjs_list : iterable
        Subject identifiers, compared against the values of ``data['PTID']``.
    data : pandas.DataFrame
        Must contain the columns ``'PTID'`` and ``'Years_bl'``.

    Returns
    -------
    list
        One index label of ``data`` per entry of ``subjs_list``, in the same
        order. On ties the first row holding the maximum is used.

    Raises
    ------
    ValueError
        If a requested subject has no rows in ``data``.
    """
    # One grouped pass instead of re-filtering the whole frame per subject.
    # Grouping on the raw PTID values (no string coercion) also makes
    # non-string identifiers work.
    last_visit_idx = data.groupby('PTID')['Years_bl'].idxmax()

    missing = [subj for subj in subjs_list if subj not in last_visit_idx.index]
    if missing:
        raise ValueError(f'No rows found in data for subject(s): {missing}')

    return [last_visit_idx.loc[subj] for subj in subjs_list]

In [None]:
# Row label of every subject's visit with the largest Years_bl
ADNIMERGE_max_idx_bl_years = get_max_idx(subjs_list=ADNIMERGE_subjs_list, data=ADNIMERGE1)

In [None]:
# Flag each subject's last recorded visit (largest Years_bl) with 'Suvival' = 1;
# every other row gets 0.
# NOTE: the column is spelled 'Suvival' and the rest of the notebook uses that spelling.
ADNIMERGE1['Suvival'] = 0
ADNIMERGE1.loc[ADNIMERGE_max_idx_bl_years, 'Suvival'] = 1


In [None]:
# Create a 'death' column: 1 on a subject's last visit when WDREASON equals '2', else 0.
# NOTE(review): the comparison is against the *string* '2'; if WDREASON was parsed
# as a numeric column nothing will match - confirm the dtype of WDREASON.
is_last_visit = ADNIMERGE1['Suvival'] == 1
has_wdreason_2 = ADNIMERGE1['WDREASON'] == '2'
ADNIMERGE_dead_idx = ADNIMERGE1.index[is_last_visit & has_wdreason_2].tolist()

ADNIMERGE1['death'] = 0
ADNIMERGE1.loc[ADNIMERGE_dead_idx, 'death'] = 1

In [None]:
# Row labels flagged as deaths (last visit of a subject with WDREASON == '2')
ADNIMERGE_dead_idx

In [None]:
# Split the merged table into one subset per DX_bl group.
CN_data = ADNIMERGE1[ADNIMERGE1['DX_bl'].isin(['CN'])]
LMCI_data = ADNIMERGE1[ADNIMERGE1['DX_bl'].isin(['LMCI'])]
AD_data = ADNIMERGE1[ADNIMERGE1['DX_bl'].isin(['AD'])]

# A notebook cell only renders its last expression, so the intermediate
# `.shape` lines were silently discarded; show all three shapes together.
{'CN': CN_data.shape, 'LMCI': LMCI_data.shape, 'AD': AD_data.shape}

In [None]:
# Diagnosis-group label (DX_bl) for every row of the merged table
ADNIMERGE1['DX_bl']

In [None]:
# Look for participants whose PTID occurs both in the CN subset and in the
# LMCI / AD subsets.
# NOTE(review): all three subsets are split on DX_bl. If DX_bl is constant per
# participant these overlaps are always empty and say nothing about conversion
# over time - confirm whether a per-visit diagnosis column was intended.
CN_ptids = CN_data['PTID']

CN_to_LMCI = LMCI_data[LMCI_data['PTID'].isin(CN_ptids)]
print(CN_to_LMCI.shape)

CN_to_AD = AD_data[AD_data['PTID'].isin(CN_ptids)]
CN_to_AD.shape

In [None]:
# Number of rows flagged / not flagged as a subject's last visit.
# (The merged frame is named ADNIMERGE1; `ADNIMERGE` is never defined.)
ADNIMERGE1['Suvival'].value_counts()

In [None]:
# Number of rows flagged / not flagged as a recorded death.
# (The merged frame is named ADNIMERGE1; `ADNIMERGE` is never defined.)
ADNIMERGE1['death'].value_counts()

In [None]:
# Kaplan-Meier curves per DX_bl group with 'Suvival' as the event flag,
# all drawn on the same axes.
# NOTE(review): each subset holds every visit row of its subjects, so each
# visit enters the fit as its own observation (event only on the row where
# 'Suvival' == 1) - confirm this is intended rather than one row per subject.
km = KaplanMeierFitter()

km.fit(durations=CN_data['Years_bl'], event_observed=CN_data['Suvival'], label='CN survival')
a1 = km.plot()

km.fit(durations=LMCI_data['Years_bl'], event_observed=LMCI_data['Suvival'], label='LMCI survival')
a2 = km.plot(ax=a1)

km.fit(durations=AD_data['Years_bl'], event_observed=AD_data['Suvival'], label='AD survival')
km.plot(ax=a2)

In [None]:
# Kaplan-Meier curves per DX_bl group with 'death' as the event flag,
# all drawn on the same axes.
km = KaplanMeierFitter()

km.fit(durations=CN_data['Years_bl'], event_observed=CN_data['death'], label='CN time till death')
a1 = km.plot()

km.fit(durations=LMCI_data['Years_bl'], event_observed=LMCI_data['death'], label='LMCI time till death')
a2 = km.plot(ax=a1)

km.fit(durations=AD_data['Years_bl'], event_observed=AD_data['death'], label='AD time till death')
km.plot(ax=a2)

In [None]:
# Dedicated estimator for the CN group; evaluate the fitted curve at a
# handful of time points (in Years_bl units).
km1 = KaplanMeierFitter()
km1.fit(durations=CN_data['Years_bl'], event_observed=CN_data['Suvival'], label='CN survival')
km1.predict([0, 3, 6, 10, 15, 20])

In [None]:
# Dedicated estimator for the AD group, evaluated at the same time points
km2 = KaplanMeierFitter()
km2.fit(durations=AD_data['Years_bl'], event_observed=AD_data['Suvival'], label='AD survival')
km2.predict([0, 3, 6, 10, 15, 20])

### 2) Investigate features that impact the survival using the Cox Proportional-Hazards model

In [None]:
# Cox proportional-hazards model for the AD group with 'death' as the event.
# CoxPHFitter is already imported in the notebook's import cell.
data_cox_hz = AD_data[['APOE4','Ventricles','Hippocampus','WholeBrain','Entorhinal','Fusiform','MidTemp','Years_bl','Suvival','AGE','ICV','death']]
data_cox_hz = data_cox_hz.dropna()

cph = CoxPHFitter()
# Every column other than the duration and event columns is used as a
# covariate. 'Suvival' is the other outcome flag ('death' is only ever set on
# rows where 'Suvival' == 1), so it must not be fed in as a predictor.
# data_cox_hz itself keeps the column because the next cell inspects it.
cph.fit(data_cox_hz.drop(columns=['Suvival']), duration_col='Years_bl', event_col='death')
cph.print_summary()

In [None]:
#In summary, check which features impact the survival from the p-values

In [None]:
# How many complete-case AD rows are / are not flagged as a last visit
data_cox_hz['Suvival'].value_counts()

In [None]:
# Penalized Cox proportional-hazards model for the AD group with 'Suvival'
# (last-visit flag) as the event.
data_cox_hz = AD_data[['APOE4','Ventricles','Hippocampus','WholeBrain','Entorhinal','Fusiform','MidTemp','Years_bl','Suvival','AGE','ICV','death']]
data_cox_hz = data_cox_hz.dropna()

cph = CoxPHFitter(penalizer=0.1)
# Every column other than the duration and event columns is used as a
# covariate. 'death' is derived from 'Suvival' (it is 1 only on rows where
# 'Suvival' == 1), so it is removed from the design matrix.
cph.fit(data_cox_hz.drop(columns=['death']), duration_col='Years_bl', event_col='Suvival')
cph.print_summary()

### 3) Investigate the cut-off of regional atrophy that determines low versus high survival in patients with AD  

In [None]:
#Define function to regress covariates from regions of interest 
from sklearn import linear_model

def regressOut(y, X, use_fit=None):
    """Remove the linear effect of covariates ``X`` from ``y``.

    A linear regression of ``y`` on ``X`` is fitted on the rows selected by
    ``use_fit`` and then applied to every row. The residuals are returned,
    shifted by the model's prediction at the mean covariate values of the
    fitting rows, so the adjusted values stay on the original scale.

    Parameters
    ----------
    y : pandas.Series
        Values to adjust, one per row of ``X``.
    X : pandas.DataFrame
        Covariates to regress out.
    use_fit : boolean array-like or None, optional
        Mask (Series, list or ndarray) with one entry per row of ``X`` that
        selects the rows used to fit the model. ``None`` uses every row.

    Returns
    -------
    pandas.Series
        Adjusted values (residual + offset), indexed like ``y``.

    Raises
    ------
    ValueError
        If ``use_fit`` does not have exactly one entry per row of ``X``.
    """
    n_rows = X.shape[0]
    if use_fit is None:
        fit_mask = np.ones(n_rows, dtype=bool)
    else:
        # Accept any boolean array-like, not only objects exposing `.values`.
        fit_mask = np.asarray(use_fit, dtype=bool)
        if fit_mask.shape != (n_rows,):
            raise ValueError(
                f'use_fit must have one entry per row of X ({n_rows}), got shape {fit_mask.shape}'
            )

    # Same boolean mask for X and y so both always select the same rows.
    X_fit = X.loc[fit_mask, :]
    x_mean = X_fit.mean()

    lm = linear_model.LinearRegression()
    lm.fit(X_fit.values, y.loc[fit_mask])

    # Residual of every row (fitting rows and the rest) against the fitted model
    res = y - lm.predict(X.values)

    # Prediction at the mean covariates of the fitting rows
    offset = lm.predict(x_mean.values.reshape(1, -1))

    return res + offset


In [None]:
# Keep only the columns needed for the cut-off analysis and drop every row
# that has a missing value in any of them.
analysis_columns = ['Years_bl','Suvival','AGE','ICV','Ventricles','Hippocampus','WholeBrain','Entorhinal','Fusiform','MidTemp','death','DX_bl']
ADNIMERGE2 = ADNIMERGE1[analysis_columns].dropna()

In [None]:
# Covariates to regress out, and the mask of rows (DX_bl == 'CN') on which
# the regression is fitted.
covars = ['AGE', 'ICV']
Xlin = ADNIMERGE2[covars]
use_fit = ADNIMERGE2['DX_bl'] == 'CN'

In [None]:
#Investigate for hippocampus, other ROIs can be added as well
tfeat = ['Hippocampus']

# Take an explicit copy: adding columns to a bare column-selection of
# ADNIMERGE2 writes into a slice and triggers pandas' SettingWithCopyWarning.
ADNIMERGE2_Reg = ADNIMERGE2[['Years_bl','Suvival','AGE','ICV','DX_bl','death']].copy()
for fff in tfeat:
    # Covariate-adjusted ROI value (covariates in Xlin, fitted on use_fit rows)
    ADNIMERGE2_Reg[fff] = regressOut(ADNIMERGE2[fff], Xlin, use_fit)

In [None]:
# Inspect the table that now holds the covariate-adjusted ROI column(s)
ADNIMERGE2_Reg

In [None]:
# Per-group subsets of the covariate-adjusted table. Copies are taken because
# later cells add a column to these frames.
AD_data_Reg = ADNIMERGE2_Reg[ADNIMERGE2_Reg['DX_bl'].isin(['AD'])].copy()
LMCI_data_Reg = ADNIMERGE2_Reg[ADNIMERGE2_Reg['DX_bl'].isin(['LMCI'])].copy()
CN_data_Reg = ADNIMERGE2_Reg[ADNIMERGE2_Reg['DX_bl'].isin(['CN'])].copy()

cph = CoxPHFitter()
# Every column other than duration/event is used as a covariate, so remove:
#   - 'DX_bl'   : a string label that is constant ('AD') within this subset
#   - 'Suvival' : the other outcome flag, from which 'death' is derived
#   - 'Z_hippo' : a linear rescaling of 'Hippocampus'; only present if the
#                 z-score cell has already been run
cox_input = AD_data_Reg.drop(columns=['DX_bl', 'Suvival', 'Z_hippo'], errors='ignore')
cph.fit(cox_input, duration_col='Years_bl', event_col='death')
cph.print_summary()

In [None]:
cph = CoxPHFitter()
# Every column other than duration/event is used as a covariate, so remove:
#   - 'DX_bl'   : a string label that is constant ('AD') within this subset
#   - 'death'   : the other outcome flag, derived from 'Suvival'
#   - 'Z_hippo' : a linear rescaling of 'Hippocampus'; only present if the
#                 z-score cell has already been run
cox_input = AD_data_Reg.drop(columns=['DX_bl', 'death', 'Z_hippo'], errors='ignore')
cph.fit(cox_input, duration_col='Years_bl', event_col='Suvival')
cph.print_summary()

In [None]:
# Z-score the adjusted hippocampal value of AD and LMCI rows against the
# mean / standard deviation of the CN rows.
mean = CN_data_Reg['Hippocampus'].mean()
sd = CN_data_Reg['Hippocampus'].std()

# `assign` returns a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning) and keeps the cell safe to re-run.
AD_data_Reg = AD_data_Reg.assign(Z_hippo=(AD_data_Reg['Hippocampus'] - mean) / sd)
LMCI_data_Reg = LMCI_data_Reg.assign(Z_hippo=(LMCI_data_Reg['Hippocampus'] - mean) / sd)

In [None]:
# After regression: adjusted hippocampal value vs. AGE in the AD subset,
# together with their correlation coefficient.
plt.scatter(x=AD_data_Reg['Hippocampus'], y=AD_data_Reg['AGE'])
print(AD_data_Reg['AGE'].corr(AD_data_Reg['Hippocampus']))
plt.show()

In [None]:
# Before regression: raw hippocampal value vs. AGE over all rows of the
# merged table, together with their correlation coefficient.
plt.scatter(x=ADNIMERGE1['Hippocampus'], y=ADNIMERGE1['AGE'])
print(ADNIMERGE1['AGE'].corr(ADNIMERGE1['Hippocampus']))
plt.show()

In [None]:
#Use z-score cut-off -1.63 to group patients with low or normal hippocampal vols
AD_data_ZGPlow_hippo = AD_data_Reg.loc[AD_data_Reg['Z_hippo'] < -1.63]
AD_data_ZGPnormal_hippo = AD_data_Reg.loc[AD_data_Reg['Z_hippo'] >= -1.63]
# Same group under the `..._ZGPhigh_hippo` name used for the LMCI groups and
# by other AD cells; without it those cells hit a NameError on a fresh
# top-to-bottom run.
AD_data_ZGPhigh_hippo = AD_data_ZGPnormal_hippo

km.fit(AD_data_ZGPlow_hippo['Years_bl'], AD_data_ZGPlow_hippo['death'], label='AD low hippo')
a1 = km.plot()

# fit the model for 2nd cohort
km.fit(AD_data_ZGPnormal_hippo['Years_bl'], AD_data_ZGPnormal_hippo['death'], label='AD normal hippo')
km.plot(ax=a1)

In [None]:
from lifelines.statistics import logrank_test

# Log-rank test between the low and normal hippocampal-volume AD groups, with
# 'Suvival' as the event flag. The second group is AD_data_ZGPnormal_hippo,
# the name defined by the fixed cut-off split (-1.63).
results = logrank_test(
    AD_data_ZGPlow_hippo['Years_bl'],
    AD_data_ZGPnormal_hippo['Years_bl'],
    event_observed_A=AD_data_ZGPlow_hippo['Suvival'],
    event_observed_B=AD_data_ZGPnormal_hippo['Suvival'],
)
results.print_summary()

In [None]:
# Split the LMCI rows at the fixed z-score cut-off of -1.63 and draw the
# Kaplan-Meier curve of each group ('death' as event) on shared axes.
LMCI_data_ZGPlow_hippo = LMCI_data_Reg.loc[LMCI_data_Reg['Z_hippo'] < -1.63]
LMCI_data_ZGPhigh_hippo = LMCI_data_Reg.loc[LMCI_data_Reg['Z_hippo'] >= -1.63]

km.fit(durations=LMCI_data_ZGPlow_hippo['Years_bl'],
       event_observed=LMCI_data_ZGPlow_hippo['death'],
       label='LMCI low hippo')
a1 = km.plot()

# second group, drawn on the same axes
km.fit(durations=LMCI_data_ZGPhigh_hippo['Years_bl'],
       event_observed=LMCI_data_ZGPhigh_hippo['death'],
       label='LMCI normal hippo')
km.plot(ax=a1)

In [None]:
# Log-rank test between the low and high hippocampal-volume LMCI groups,
# with 'Suvival' as the event flag.
results = logrank_test(
    LMCI_data_ZGPlow_hippo['Years_bl'],
    LMCI_data_ZGPhigh_hippo['Years_bl'],
    event_observed_A=LMCI_data_ZGPlow_hippo['Suvival'],
    event_observed_B=LMCI_data_ZGPhigh_hippo['Suvival'],
)
results.print_summary()

In [None]:
# Log-rank test between the low and normal hippocampal-volume AD groups, with
# 'death' as the event flag. The event column is named 'death' (there is no
# 'dead' column), and the second group is AD_data_ZGPnormal_hippo, the name
# defined by the fixed cut-off split (-1.63).
results = logrank_test(
    AD_data_ZGPlow_hippo['Years_bl'],
    AD_data_ZGPnormal_hippo['Years_bl'],
    event_observed_A=AD_data_ZGPlow_hippo['death'],
    event_observed_B=AD_data_ZGPnormal_hippo['death'],
)
results.print_summary()

In [None]:
def get_best_cutoff(data_ZGP, surv_col, z_col='Z_hippo', start=0, stop=-3, step=0.1):
    """Scan z-score cut-offs and return the one with the smallest log-rank p-value.

    For every cut-off on a descending grid (``start``, ``start - step``, ...,
    strictly greater than ``stop``) the rows are split into ``z < cutoff`` and
    ``z >= cutoff`` and the two groups are compared with a log-rank test.

    Parameters
    ----------
    data_ZGP : pandas.DataFrame
        Must contain ``'Years_bl'``, ``surv_col`` and ``z_col``.
    surv_col : str
        Name of the event-indicator column passed to the log-rank test.
    z_col : str, optional
        Column holding the z-scores to threshold (default ``'Z_hippo'``).
    start, stop : float, optional
        First cut-off tested and exclusive lower bound of the grid.
    step : float, optional
        Positive spacing between consecutive cut-offs.

    Returns
    -------
    tuple
        ``(p_value, cutoff)``, both rounded to 3 decimals. If no cut-off
        yields a p-value below 0.9, ``(0.9, start)`` is returned.

    Raises
    ------
    ValueError
        If ``step`` is not positive.

    Notes
    -----
    NOTE(review): picking the cut-off that minimises the p-value means the
    returned p-value is not corrected for the number of cut-offs tried.
    """
    if step <= 0:
        raise ValueError('step must be positive')

    p_val = 0.9            # only cut-offs beating this threshold are recorded
    final_cutoff = start

    n_steps = 0
    cutoff = start
    while cutoff > stop:
        data_ZGPlow_hippo = data_ZGP.loc[data_ZGP[z_col] < cutoff]
        data_ZGPhigh_hippo = data_ZGP.loc[data_ZGP[z_col] >= cutoff]

        # A log-rank comparison needs observations on both sides of the split.
        if not data_ZGPlow_hippo.empty and not data_ZGPhigh_hippo.empty:
            results = logrank_test(
                data_ZGPlow_hippo['Years_bl'],
                data_ZGPhigh_hippo['Years_bl'],
                event_observed_A=data_ZGPlow_hippo[surv_col],
                event_observed_B=data_ZGPhigh_hippo[surv_col],
            )
            current_p_val = results.p_value
            if current_p_val < p_val:
                p_val = current_p_val
                final_cutoff = cutoff

        # Derive each cut-off from the step count instead of subtracting
        # repeatedly, so floating-point error does not accumulate and decide
        # whether the last grid point is visited.
        n_steps += 1
        cutoff = round(start - n_steps * step, 10)

    return round(p_val, 3), round(final_cutoff, 3)

In [None]:
# Best z-score cut-off for the AD group with 'death' as the event flag
AD_p_val, AD_final_cutoff = get_best_cutoff(AD_data_Reg, surv_col='death')
(AD_p_val, AD_final_cutoff)

In [None]:
# Split the AD rows at the selected cut-off (AD_final_cutoff) and draw the
# Kaplan-Meier curve of each group ('death' as event) on shared axes.
AD_data_ZGPlow_hippo = AD_data_Reg.loc[AD_data_Reg['Z_hippo'] < AD_final_cutoff]
AD_data_ZGPhigh_hippo = AD_data_Reg.loc[AD_data_Reg['Z_hippo'] >= AD_final_cutoff]

km.fit(durations=AD_data_ZGPlow_hippo['Years_bl'],
       event_observed=AD_data_ZGPlow_hippo['death'],
       label='AD low hippo')
a1 = km.plot()

# second group, drawn on the same axes
km.fit(durations=AD_data_ZGPhigh_hippo['Years_bl'],
       event_observed=AD_data_ZGPhigh_hippo['death'],
       label='AD normal hippo')
km.plot(ax=a1)

In [None]:
# Repeat the cut-off search for the AD group with 'Suvival' as the event
# flag, then plot the Kaplan-Meier curves of the two resulting groups.
AD_p_val, AD_final_cutoff = get_best_cutoff(AD_data_Reg, surv_col='Suvival')
print(AD_p_val, AD_final_cutoff)

AD_data_ZGPlow_hippo = AD_data_Reg.loc[AD_data_Reg['Z_hippo'] < AD_final_cutoff]
AD_data_ZGPhigh_hippo = AD_data_Reg.loc[AD_data_Reg['Z_hippo'] >= AD_final_cutoff]

km.fit(durations=AD_data_ZGPlow_hippo['Years_bl'],
       event_observed=AD_data_ZGPlow_hippo['Suvival'],
       label='AD low hippo')
a1 = km.plot()

# second group, drawn on the same axes
km.fit(durations=AD_data_ZGPhigh_hippo['Years_bl'],
       event_observed=AD_data_ZGPhigh_hippo['Suvival'],
       label='AD normal hippo')
km.plot(ax=a1)

In [None]:
# Best z-score cut-off for the LMCI group with 'death' as the event flag
LMCI_p_val, LMCI_final_cutoff = get_best_cutoff(LMCI_data_Reg, surv_col='death')
(LMCI_p_val, LMCI_final_cutoff)

In [None]:
# Split the LMCI rows at the selected cut-off (LMCI_final_cutoff) and draw
# the Kaplan-Meier curve of each group ('death' as event) on shared axes.
LMCI_data_ZGPlow_hippo = LMCI_data_Reg.loc[LMCI_data_Reg['Z_hippo'] < LMCI_final_cutoff]
LMCI_data_ZGPhigh_hippo = LMCI_data_Reg.loc[LMCI_data_Reg['Z_hippo'] >= LMCI_final_cutoff]

km.fit(durations=LMCI_data_ZGPlow_hippo['Years_bl'],
       event_observed=LMCI_data_ZGPlow_hippo['death'],
       label='LMCI low hippo')
a1 = km.plot()

# second group, drawn on the same axes
km.fit(durations=LMCI_data_ZGPhigh_hippo['Years_bl'],
       event_observed=LMCI_data_ZGPhigh_hippo['death'],
       label='LMCI normal hippo')
km.plot(ax=a1)

In [None]:
# Log-rank test between the low and high hippocampal-volume LMCI groups,
# with 'death' as the event flag.
results = logrank_test(
    LMCI_data_ZGPlow_hippo['Years_bl'],
    LMCI_data_ZGPhigh_hippo['Years_bl'],
    event_observed_A=LMCI_data_ZGPlow_hippo['death'],
    event_observed_B=LMCI_data_ZGPhigh_hippo['death'],
)
results.print_summary()