In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

In [2]:
def biomarker_negative_controls(df):
    
    '''
    Take a df with biomarker results for participants labelled as controls, and return a df excluding those
    who have a positive result on any one of five markers (or no biomarker information).
    
    To get more info on which biomarkers are positive, uncomment last section. But this is only to give an 
    idea, since some may be positive on more than one marker.
    '''
    
    controls_all = df.loc[df['Research Group'] == 'CN']
    
    # drop participants who don't have any biomarker data
    controls = controls_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_TEMPORAL_SUVR'], how='all')
    
    controls_biomarker_neg = (controls[(controls['FBB'] < fbb_lower) | 
             (controls['AV45'] < av45_lower) | 
             (controls['ABETA'] > ab42_upper) | 
            (controls['PTAU'] < ptau_lower) | 
            (controls['META_TEMPORAL_SUVR'] < tau_lower)])
    
    no_markers = len(controls_all)-len(controls)
    n_controls = len(controls)
    n_controls_negative = len(controls_biomarker_neg)
    n_dropped = n_controls - n_controls_negative
    
    print ('Of {} controls, {} dropped due to no biomarker data and {} due to positive/borderline biomarkers. {} remaining.'.format(
        n_controls,
        no_markers, 
        n_dropped,
        n_controls_negative))
    
    '''
    controls_biomarker_pos = pd.concat([controls,controls_biomarker_neg]).drop_duplicates(keep=False)
    print ('{} positive/borderline for Abeta42.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['ABETA'] < ab42_upper)])))
    print ('{} positive/borderline for Ptau.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['PTAU'] > ptau_lower)])))
    print ('{} positive/borderline for PET amyloid (FBB tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['FBB'] > fbb_lower)])))
    print ('{} positive/borderline for PET amyloid (AV45 tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['AV45'] > av45_lower)])))
    print ('{} positive/borderline for PET tau (meta temporal suvr).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['META_TEMPORAL_SUVR'] > tau_lower)])))
    '''
    return (controls_biomarker_neg)


In [3]:
def biomarker_positive_patients(df, diagnoses):
    
    '''
    Take a df with biomarker results for participants labelled as a patient (specifying which), 
    and return a df including only those who have a positive result on any one of five markers. 
    Exclude any with no biomarker information.
    
    To get more info on which biomarkers are negative, uncomment last section. But this is only to give an 
    idea, since some may be negative on more than one marker.
    '''
    
    patients_all = df.loc[df['Research Group'].isin(diagnoses)]
    
    # drop participants who don't have any biomarker data
    patients = patients_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_TEMPORAL_SUVR'], how='all')
    
    patients_biomarker_pos = (patients[(patients['FBB'] > fbb_upper) | 
             (patients['AV45'] > av45_upper) | 
             (patients['ABETA'] < ab42_lower) | 
             (patients['PTAU'] > ptau_upper) |
            (patients['META_TEMPORAL_SUVR'] > tau_upper)])
    
    no_markers = len(patients_all)-len(patients)
    n_patients = len(patients)
    n_patients_positive = len(patients_biomarker_pos)
    n_dropped = n_patients - n_patients_positive
    
    print ('Of {} {}, {} dropped due to no biomarker data and {} due to negative/borderline biomarkers. {} remaining.'.format(
        n_patients,
        diagnoses,
        no_markers,
       n_dropped,
      n_patients_positive))
    
    '''
    patients_neg = pd.concat([patients,patients_biomarker_pos]).drop_duplicates(keep=False)
    
    print ('{} negative/borderline for Abeta42.'.format(len(patients_neg[(patients_neg['ABETA'] > ab42_lower)])))
    print ('{} negative/borderline for Ptau.'.format(len(patients_neg[(patients_neg['PTAU'] < ptau_upper)])))
    print ('{} negative/borderline for PET amyloid (FBB tracer).'.format(len(patients_neg[(patients_neg['FBB'] < fbb_upper)])))
    print ('{} negative/borderline for PET amyloid (AV45 tracer).'.format(len(patients_neg[(patients_neg['AV45'] < av45_upper)])))
    print ('{} negative/borderline for PET tau (meta temporal suvr).'.format(len(patients_neg[(patients_neg['META_TEMPORAL_SUVR'] < tau_upper)])))
    '''
    return (patients_biomarker_pos)

In [4]:
# paths to files
biomarker_data = Path("__file__").resolve().parents[1] / 'data' / 'master_biomarkers.csv'

In [5]:
# load baseline data
df = pd.read_csv(biomarker_data, index_col=0, header=0)

In [6]:
# replace some biomarker data which has been coded as > or <
df = df.replace({'ABETA': {'>1700': 1701, '<200': 199}})
df = df.replace({'PTAU': {'<8': 7, '>120':121}})

df[['ABETA', 'PTAU']] = df[['ABETA', 'PTAU']].apply(pd.to_numeric)

In [7]:
# set thresholds and calculate 5% margin

# CSF biomarkers
ab42_threshold = 980
ab42_margin = (5*ab42_threshold)/100
ab42_upper = ab42_threshold+ab42_margin
ab42_lower = ab42_threshold-ab42_margin

ptau_threshold = 23
ptau_margin = (5*ptau_threshold)/100
ptau_upper = ptau_threshold+ptau_margin
ptau_lower = ptau_threshold-ptau_margin

# PET biomarkers
fbb_threshold = 1.08
fbb_margin = (5*fbb_threshold)/100
fbb_upper = fbb_threshold+fbb_margin
fbb_lower = fbb_threshold-fbb_margin

av45_threshold = 1.08
av45_margin = (5*av45_threshold)/100
av45_upper = av45_threshold+av45_margin
av45_lower = av45_threshold-av45_margin

tau_threshold = 1.37
tau_margin = (5*tau_threshold)/100
tau_upper = tau_threshold+tau_margin
tau_lower = tau_threshold-tau_margin

In [8]:
# run functions to get negative controls and positive patients
controls_bio_neg = biomarker_negative_controls(df)
ad_bio_positive = biomarker_positive_patients(df, ['AD'])
mci_bio_positive = biomarker_positive_patients(df, ['EMCI','LMCI'])

Of 316 controls, 46 dropped due to no biomarker data and 40 due to positive/borderline biomarkers. 276 remaining.
Of 51 ['AD'], 7 dropped due to no biomarker data and 4 due to negative/borderline biomarkers. 47 remaining.
Of 82 ['EMCI', 'LMCI'], 12 dropped due to no biomarker data and 22 due to negative/borderline biomarkers. 60 remaining.


In [11]:
master_df = pd.concat([controls_bio_neg, ad_bio_positive, mci_bio_positive])

In [12]:
master_df.to_csv(Path("__file__").resolve().parents[1] / 'data' / 'final_biomarker_spreadsheet.csv')