In [1]:
import pandas as pd
import numpy as np

#from datetime import datetime # is this needed?
from functools import reduce
from pathlib import Path

In [2]:
# paths to files
# A notebook has no `__file__`, so the data folder is located relative to the
# kernel's working directory: <parent of the working directory>/data.
# (The previous Path("__file__") was a quoted string literal, not the module
# attribute, and resolved to this same folder.)
DATA_DIR = Path.cwd().resolve().parent / 'data'

adni_data = DATA_DIR / 'adni_spreadsheet.csv'
tau_data = DATA_DIR / 'UCBERKELEYAV1451_04_26_22.csv'
other_biomarker_data = DATA_DIR / 'ADNIMERGE.csv'

In [None]:
# Build the baseline table (one row per participant, indexed by RID).
# NOTE: get_baselines is defined in a cell further down this notebook; that
# cell has to be executed before this one.
baseline_df = get_baselines(adni_data)

In [839]:
# load both biomarker tables (tau file, then the ADNIMERGE file) with the shared loader
tau_df, other_biomarkers_df = map(load_biomarker_df, (tau_data, other_biomarker_data))

In [None]:
# One frame per biomarker, holding the result closest to each participant's
# baseline session (values outside the baseline window are blanked out).

# tau comes from its own table ...
tau = get_biomarker(tau_df, baseline_df, 'META_TEMPORAL_SUVR')

# ... every other biomarker is a column of the ADNIMERGE table
abeta, ptau, av45, fbb = (
    get_biomarker(other_biomarkers_df, baseline_df, column)
    for column in ('ABETA', 'PTAU', 'AV45', 'FBB')
)

In [888]:
# Frames to combine: the baseline table first (it is the left-most frame of
# the merge), followed by one frame per biomarker.
data_frames = [
    baseline_df,
    tau,
    abeta,
    ptau,
    av45,
    fbb,
]

In [889]:
# merge dataframes
#
# Every frame is indexed by RID, so they are combined with an exact-match
# index join that keeps all rows of the left (baseline) frame.
# pd.merge_asof is NOT used here: it is a nearest-key merge, so an RID missing
# from a right-hand frame would silently receive the values of the closest
# smaller RID, i.e. another participant's biomarkers.
master_biomarkers = reduce(
    lambda left, right: left.join(right, how='left'),
    data_frames,
)
master_biomarkers.head()

Unnamed: 0_level_0,Subject ID,Phase,Sex,Research Group,Visit,Study Date,Age,session,date_lower,date_upper,META_TEMPORAL_SUVR,META_TEMPORAL_SUVR_EXAMDATE,ABETA,ABETA_EXAMDATE,PTAU,PTAU_EXAMDATE,AV45,AV45_EXAMDATE,FBB,FBB_EXAMDATE
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
21,011_S_0021,ADNI 3,F,CN,ADNI3 Initial Visit-Cont Pt,1/25/2018,84.9,2018-01-25,2017-01-25,2019-01-25,1.27,2018-02-02,,NaT,,NaT,1.0376,2017-11-27,,NaT
31,023_S_0031,ADNI 3,F,CN,ADNI3 Initial Visit-Cont Pt,4/17/2018,90.3,2018-04-17,2017-04-17,2019-04-17,1.1098,2018-04-24,,2016-01-26,,2016-01-26,1.5034,2018-04-17,,NaT
56,067_S_0056,ADNI 3,F,CN,ADNI3 Year 1 Visit,1/10/2019,82.8,2019-01-10,2018-01-10,2020-01-10,1.2115,2019-01-10,,2010-12-10,,2010-12-10,0.9688,2019-12-03,,NaT
59,067_S_0059,ADNI 3,F,CN,ADNI3 Initial Visit-Cont Pt,12/20/2017,83.0,2017-12-20,2016-12-20,2018-12-20,1.2017,2017-12-12,,2010-12-10,,2010-12-10,0.9898,2017-12-12,,NaT
69,100_S_0069,ADNI 2,M,CN,ADNI2 Year 2 Visit,1/28/2014,81.1,2014-01-28,2013-01-28,2015-01-28,,2018-04-03,,NaT,,NaT,0.9592,2014-01-28,,NaT


In [890]:
# save the merged table as CSV in the data folder
master_biomarkers_csv = Path("__file__").resolve().parents[1] / 'data' / 'master_biomarkers.csv'
master_biomarkers.to_csv(master_biomarkers_csv)

In [882]:
def get_biomarker(biomarker_df, baseline_df, biomarker):
    '''
    Return the `biomarker` result closest to each participant's baseline.

    Two steps are chained: match_baselines_biomarker picks the result nearest
    to the baseline session, then check_visit_window blanks out results that
    fall outside the baseline window. The output of the second step is
    returned unchanged.
    '''
    return check_visit_window(
        match_baselines_biomarker(biomarker_df, baseline_df, biomarker),
        baseline_df,
        biomarker,
    )

In [4]:
def get_baselines(file, window_months=12):
    '''
    Build the baseline table: one row per participant, indexed by RID.

    Controls (Research Group 'CN') may use any visit as baseline; patients may
    only use the visits listed in `bl_visits`. For every participant the visit
    with the lowest Age is kept, and the acceptable window for biomarker data
    (session -/+ `window_months`) is stored in 'date_lower' / 'date_upper'.

    Parameters
    ----------
    file : path or file-like
        CSV with the phenotypic data. Its first column is read as the index
        and then replaced by RID.
    window_months : int, default 12
        Half-width of the acceptable biomarker window, in calendar months.

    Returns
    -------
    pandas.DataFrame
        Columns 'Subject ID', 'Phase', 'Sex', 'Research Group', 'Visit',
        'Study Date', 'Age', 'session', 'date_lower', 'date_upper'; int64
        index named 'RID', sorted ascending.
    '''
    # load baseline phenotypic data, keeping only the variables of interest
    keep_cols = ['Subject ID', 'Phase', 'Sex', 'Research Group', 'Visit', 'Study Date', 'Age']
    pheno = pd.read_csv(file, index_col=0, header=0).filter(keep_cols, axis=1)

    # convert 'Study Date' to 'session' in datetime format, to match other spreadsheets
    pheno['session'] = pd.to_datetime(pheno['Study Date'])

    # the RID is the third '_'-separated field of the subject id; int() drops the
    # leading zeros (and, unlike lstrip("0"), also copes with an all-zero field)
    rids = [int(subject_id.split('_')[2]) for subject_id in pheno['Subject ID']]
    pheno.index = pd.Index(rids, dtype='int64', name='RID')

    # separate patients and controls, because (in theory) we can use any control visit
    # as baseline, but for patients we want their actual baseline data
    patient_diagnoses = ['AD', 'EMCI', 'LMCI', 'MCI', 'SMC']

    # visits accepted as baseline for patients, i.e. actual baseline +/-3 months,
    # excluding any initial visits where the patient continued from a previous phase
    bl_visits = ['ADNI Screening', 'ADNI2 Month 3 MRI-New Pt', 'ADNI2 Screening MRI-New Pt',
                 'ADNIGO Month 3 MRI', 'ADNIGO Screening MRI']

    is_control = pheno['Research Group'] == 'CN'
    is_patient_baseline = (
        pheno['Research Group'].isin(patient_diagnoses) & pheno['Visit'].isin(bl_visits)
    )
    candidates = pd.concat([pheno[is_control], pheno[is_patient_baseline]])

    # earliest visit (lowest Age) per participant, then ordered by RID.
    # Nothing is modified in place and the result is an explicit copy, so the
    # column assignments below never write into a slice of another frame.
    baseline_df = (
        candidates
        .sort_values(['Subject ID', 'Age'])
        .drop_duplicates(subset=['Subject ID'], keep='first')
        .sort_index()
        .copy()
    )

    # window of acceptable biomarker dates around the baseline session
    window = pd.DateOffset(months=window_months)
    baseline_df['date_lower'] = baseline_df['session'] - window
    baseline_df['date_upper'] = baseline_df['session'] + window

    return baseline_df

In [639]:
def load_biomarker_df(biomarker_data):
    '''
    Read a biomarker CSV and return it ordered by RID and exam date.

    The first CSV column becomes the index, 'EXAMDATE' is parsed to datetime,
    and the index values are also copied into a 'RID' column (later functions
    filter on that column).
    NOTE: sorting by 'RID' relies on the first CSV column being named RID.
    '''
    raw = pd.read_csv(biomarker_data, index_col=0, header=0, low_memory=False)

    # exam dates as datetimes so they sort and compare chronologically
    raw['EXAMDATE'] = pd.to_datetime(raw['EXAMDATE'])

    # order by participant, then by date
    ordered = raw.sort_values(by=['RID', 'EXAMDATE'])

    # expose the index as an ordinary column as well
    ordered['RID'] = ordered.index

    return ordered

In [824]:
def match_baselines_biomarker(biomarker_df, baseline_df, biomarker):
    '''
    For every participant present in both frames, find the `biomarker` result
    whose exam date is closest to the participant's baseline session.

    Parameters
    ----------
    biomarker_df : pandas.DataFrame
        Indexed by RID, with columns 'RID', 'EXAMDATE' (datetime) and `biomarker`.
    baseline_df : pandas.DataFrame
        Indexed by RID (one row per participant), with a 'session' datetime column.
    biomarker : str
        Name of the biomarker column to extract.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'RID', with columns `biomarker` and `biomarker + '_EXAMDATE'`.
        Participants without any non-missing result are left out.
    '''
    date_col = biomarker + '_EXAMDATE'

    # ids common to the biomarker and baseline dfs (each id once, even though
    # the biomarker index repeats an id for every visit)
    common_ids = biomarker_df.index.intersection(baseline_df.index).unique()

    # only visits that have both a result and a date can be matched; sorting by
    # date makes the tie-break below deterministic
    measured = (
        biomarker_df
        .dropna(subset=[biomarker, 'EXAMDATE'])
        .sort_values('EXAMDATE', kind='mergesort')
    )

    records = []
    for rid in common_ids:
        visits = measured[measured['RID'] == rid]  # all usable results of this participant
        if visits.empty:
            continue

        session = baseline_df.loc[rid, 'session']  # participant's baseline date

        # absolute distance of every exam date to the baseline; computed
        # directly instead of Index.get_loc(..., method='nearest'), which was
        # removed in pandas 2.0 and required unique dates
        gaps = (visits['EXAMDATE'] - session).abs().to_numpy()
        # on an exact tie the later exam date wins
        nearest_pos = np.flatnonzero(gaps == gaps.min())[-1]

        records.append((
            rid,
            visits[biomarker].iloc[nearest_pos],
            visits['EXAMDATE'].iloc[nearest_pos],
        ))

    # build the frame once instead of growing it row by row
    df = pd.DataFrame(records, columns=['RID', biomarker, date_col])
    # keeps the date column datetime-typed even when there are no records
    df[date_col] = pd.to_datetime(df[date_col])

    return df.set_index('RID')

In [880]:
def check_visit_window(biomarker_df, baseline_df, biomarker):

    '''
    Join closest biomarkers to baseline info, check if the result was collected on a date within the baseline
    window, and if not replace with NaN. Drop the baseline columns and return biomarker info again, ready to merge.

    Parameters
    ----------
    biomarker_df : pandas.DataFrame
        Indexed by RID, with columns `biomarker` and `biomarker + '_EXAMDATE'`.
    baseline_df : pandas.DataFrame
        Indexed by RID, with datetime columns 'date_lower' and 'date_upper'.
    biomarker : str
        Name of the biomarker column.

    Returns
    -------
    pandas.DataFrame
        One row per row of `baseline_df` (same index), holding only the columns
        that came from `biomarker_df`. The exam date is kept even when the
        result itself is blanked out.
    '''
    date_col = biomarker + '_EXAMDATE'

    # create new df, merging biomarker data into baseline df
    baseline_bio = baseline_df.join(biomarker_df)

    # exam date strictly between the lower and upper acceptable dates
    # (a missing date compares False, so the result is blanked out too)
    in_window = (
        (baseline_bio[date_col] > baseline_bio['date_lower'])
        & (baseline_bio[date_col] < baseline_bio['date_upper'])
    )

    # fill values collected outside date range with NaN. A single column
    # assignment is used instead of the chained `df[col][mask] = nan`, which
    # writes into a temporary object and does nothing under copy-on-write.
    baseline_bio[biomarker] = baseline_bio[biomarker].where(in_window)

    # drop whatever columns the baseline frame contributed
    return baseline_bio.drop(columns=baseline_df.columns)

In [198]:
def old_match_baseline_tau(baseline_df, tau_df):

    '''
    Legacy tau-only matcher, kept for reference.

    For every participant of `tau_df` that also appears in `baseline_df`, take
    the META_TEMPORAL_SUVR value whose exam date is closest to the baseline
    session, join it onto the baseline df, and blank out values collected
    outside the (date_lower, date_upper) window.

    Parameters
    ----------
    baseline_df : pandas.DataFrame
        Indexed by RID, with datetime columns 'session', 'date_lower', 'date_upper'.
    tau_df : pandas.DataFrame
        Index level named 'RID', with columns 'EXAMDATE' (datetime) and
        'META_TEMPORAL_SUVR'.

    Returns
    -------
    pandas.DataFrame
        `baseline_df` plus the columns 'META_TEMPORAL_SUVR' and 'tau_EXAMDATE'.
    '''

    records = []
    for rid, visits in tau_df.groupby(level='RID'):
        # a direct membership test replaces the scan over every baseline row
        if rid not in baseline_df.index:
            continue

        visits = visits.dropna(subset=['EXAMDATE']).sort_values('EXAMDATE', kind='mergesort')
        if visits.empty:
            continue

        session = baseline_df.loc[rid, 'session']

        # distance of every exam date to the baseline; replaces
        # Index.get_loc(..., method='nearest'), removed in pandas 2.0
        gaps = (visits['EXAMDATE'] - session).abs().to_numpy()
        # on an exact tie the later exam date wins
        nearest_pos = np.flatnonzero(gaps == gaps.min())[-1]

        records.append((
            rid,
            visits['META_TEMPORAL_SUVR'].iloc[nearest_pos],
            visits['EXAMDATE'].iloc[nearest_pos],
        ))

    # one frame for all participants (also valid when nothing matched)
    master_tau = pd.DataFrame(records, columns=['RID', 'META_TEMPORAL_SUVR', 'EXAMDATE'])
    master_tau['EXAMDATE'] = pd.to_datetime(master_tau['EXAMDATE'])
    master_tau = master_tau.set_index('RID')

    # create new df and merge tau data into baseline df
    baseline_tau = baseline_df.join(master_tau)

    # exam date strictly between lower and upper acceptable dates
    in_window = (
        (baseline_tau['EXAMDATE'] > baseline_tau['date_lower'])
        & (baseline_tau['EXAMDATE'] < baseline_tau['date_upper'])
    )

    # fill values collected outside date range with NaN (single column
    # assignment instead of chained indexing)
    baseline_tau['META_TEMPORAL_SUVR'] = baseline_tau['META_TEMPORAL_SUVR'].where(in_window)

    # rename EXAMDATE column - decide later whether keeping this, but useful for sanity check
    return baseline_tau.rename(columns={"EXAMDATE": "tau_EXAMDATE"})
    
    

In [None]:
def old_get_tau(file):
    '''
    Legacy tau loader, kept for reference.

    Reads the tau CSV given by `file`, keeps 'EXAMDATE' and
    'META_TEMPORAL_SUVR', parses the exam date, orders the rows by RID and
    exam date, and copies the index into a 'RID' column.
    NOTE: sorting by 'RID' relies on the first CSV column being named RID.
    '''
    # load tau data (from the `file` argument; the previous version ignored it
    # and read the notebook-level `tau_data` path instead)
    tau_df = pd.read_csv(file, index_col=0, header=0, low_memory=False)

    # keep only relevant columns
    tau_df = tau_df.filter(['EXAMDATE', 'META_TEMPORAL_SUVR'], axis=1)

    # convert to datetime
    tau_df['EXAMDATE'] = pd.to_datetime(tau_df['EXAMDATE'])

    # sort tau df by index and date
    tau_df = tau_df.sort_values(by=['RID', 'EXAMDATE'])

    # create column from index (useful for later functions)
    tau_df['RID'] = tau_df.index

    return tau_df

In [823]:
def old_match_baselines_biomarker(biomarker_df, baseline_df, biomarker):
    '''
    Legacy nearest-visit matcher, kept for reference.

    For every participant present in both frames, return the `biomarker`
    value of the visit whose exam date is closest to the baseline session.
    Visits with a missing result are NOT skipped, so the returned value may be
    NaN.

    Parameters
    ----------
    biomarker_df : pandas.DataFrame
        Indexed by RID, with columns 'RID', 'EXAMDATE' (datetime) and `biomarker`.
    baseline_df : pandas.DataFrame
        Indexed by RID (one row per participant), with a 'session' datetime column.
    biomarker : str
        Name of the biomarker column to extract.

    Returns
    -------
    pandas.DataFrame
        Columns 'RID', `biomarker`, `biomarker + '_EXAMDATE'`; one row per
        matched participant.
    '''
    date_col = biomarker + '_EXAMDATE'

    # ids common to the biomarker and baseline dfs, each once
    common_ids = biomarker_df.index.intersection(baseline_df.index).unique()

    # keep only the columns used below. The previous version indexed with a
    # tuple key (a KeyError on ordinary columns) and asked for a
    # '<biomarker>_EXAMDATE' column that does not exist at this stage.
    visits_all = (
        biomarker_df[['RID', 'EXAMDATE', biomarker]]
        .dropna(subset=['EXAMDATE'])
        .sort_values('EXAMDATE', kind='mergesort')
    )

    records = []
    for rid in common_ids:
        visits = visits_all[visits_all['RID'] == rid]  # all visits of this participant
        if visits.empty:
            continue

        session = baseline_df.loc[rid, 'session']  # participant's baseline date

        # distance of every exam date to the baseline; replaces
        # Index.get_loc(..., method='nearest'), removed in pandas 2.0
        gaps = (visits['EXAMDATE'] - session).abs().to_numpy()
        # on an exact tie the later exam date wins
        nearest_pos = np.flatnonzero(gaps == gaps.min())[-1]

        records.append((
            rid,
            visits[biomarker].iloc[nearest_pos],
            visits['EXAMDATE'].iloc[nearest_pos],
        ))

    df = pd.DataFrame(records, columns=['RID', biomarker, date_col])
    df[date_col] = pd.to_datetime(df[date_col])

    return df

In [None]:
def old_merge_biomarkers(biomarker_df, baseline_df, biomarker):
    '''
    Legacy window check, kept for reference.

    Join the closest biomarker results onto the baseline df and replace results
    collected outside the (date_lower, date_upper) window with NaN. All
    baseline columns are kept in the returned frame.

    Parameters
    ----------
    biomarker_df : pandas.DataFrame
        Indexed by RID, with columns `biomarker` and `biomarker + '_EXAMDATE'`.
    baseline_df : pandas.DataFrame
        Indexed by RID, with datetime columns 'date_lower' and 'date_upper'.
    biomarker : str
        Name of the biomarker column.
    '''
    date_col = biomarker + '_EXAMDATE'

    # create new df, merging biomarker data into baseline df
    baseline_bio = baseline_df.join(biomarker_df)

    # exam date strictly between lower and upper acceptable dates
    in_window = (
        (baseline_bio[date_col] > baseline_bio['date_lower'])
        & (baseline_bio[date_col] < baseline_bio['date_upper'])
    )

    # fill values collected outside date range with NaN. A single column
    # assignment replaces the chained `df[col][mask] = nan`, which writes into
    # a temporary object and does nothing under copy-on-write.
    baseline_bio[biomarker] = baseline_bio[biomarker].where(in_window)

    return baseline_bio