In [1]:
import pandas as pd
import numpy as np
import importlib
import re
import med_utils
importlib.reload(med_utils)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
NACC_df = pd.read_csv("../raw_datasets/NACC/investigator_ftldlbd_nacc66.csv", low_memory=False)

In [3]:
NACC_df_clean = med_utils.clean_dataframe_advanced(NACC_df, missing_values=[-4, -4.4, 8, 9, 88, 99, 97, 98, 96, 95, 888, 999, 8888, 9999, 88.8, 888.8, 99.9, 777], threshold=1, drop_rows_threshold=None, verbose=True)
NACC_df_clean['NACCID'] = NACC_df_clean['NACCID'].str.replace('NACC', '').astype(int)
NACC_df_clean["NACC_VISDATE"]=pd.to_datetime(dict(year=NACC_df_clean["VISITYR"],month=NACC_df_clean["VISITMO"],day=NACC_df_clean["VISITDAY"]),errors="coerce").dt.strftime("%Y-%m-%d")
print("Missing NACCAGE records:", NACC_df_clean["NACCAGE"].isna().sum())
NACC_df_clean = NACC_df_clean.dropna(subset=['NACCAGE'])
print("\n ============================================= \n")
NACC_df_clean = med_utils.add_age_column_NACC(NACC_df_clean)


Original shape: (192088, 1936)
Missing values replaced: [-4, -4.4, 8, 9, 88, 99, 97, 98, 96, 95, 888, 999, 8888, 9999, 88.8, 888.8, 99.9, 777]
Columns dropped: 0 ([])
Final shape: (192088, 1936)
Missing NACCAGE records: 5563


Valid visit dates: 144068
Valid birth dates: 153852
Valid calculated ages: 119033
Sample AGE values: [70.66, 71.73, 66.95, 63.76, 77.78, 81.39, 86.37, 87.47, nan, 76.49]
NACCAGE column found - processing age comparisons and imputations
Missing NACCAGE values imputed with calculated AGE: 0
AGE values replaced with NACCAGE (difference < 1 year): 118605


In [4]:
# duplicates = NACC_df_clean.duplicated(subset=["NACCID", "NACCAGE"], keep=False)
# print("Number of duplicated [ID, AGE] records:", duplicates.sum())
# dupes = NACC_df_clean[NACC_df_clean.duplicated(subset=["NACCID", "NACCAGE"], keep=False)][['NACCID', "VISITYR", "VISITMO", "VISITDAY", 'BIRTHYR', 'BIRTHMO', "NACC_VISDATE", "NACCAGE"]]
# dupes

In [5]:
print(f"Initial rows: {len(NACC_df_clean)}")
mask_dupes = NACC_df_clean.duplicated(subset=["NACCID", "NACCAGE"], keep=False)
print(f"Duplicate [NACCID, NACCAGE] rows to drop: {mask_dupes.sum()}")
NACC_df_clean = NACC_df_clean.loc[~mask_dupes].copy()
print(f"After dropping duplicates: {len(NACC_df_clean)}")

# drop rows with NACCAGE or NACCUDSD (status) missing
mask_missing = NACC_df_clean["NACCAGE"].isna() | NACC_df_clean["NACCUDSD"].isna()
print(f"Rows with missing NACCAGE or NACCUDSD to drop: {mask_missing.sum()}")
NACC_df_clean = NACC_df_clean.loc[~mask_missing].copy()
print(f"After dropping missings: {len(NACC_df_clean)}")


Initial rows: 186525
Duplicate [NACCID, NACCAGE] rows to drop: 1752
After dropping duplicates: 184773
Rows with missing NACCAGE or NACCUDSD to drop: 0
After dropping missings: 184773


In [6]:
# add for the  visit_no
NACC_df_clean = NACC_df_clean.sort_values(['NACCID','NACCAGE'])
NACC_df_clean['NACC_VISCODE'] = NACC_df_clean.groupby('NACCID').cumcount()+1

In [7]:
medication_columns = [col for col in NACC_df_clean.columns if col.startswith("DRUG")]
nacc_df_selected = med_utils.rename_and_select_columns(NACC_df_clean, "../raw_datasets/naming_convension.xlsx", "NACC", "New_Name", extra_cols=medication_columns)

In [8]:
# Status -> 1: HC; 3:MCI; 4:AD
# 1: Male; 2: female;
print("The dimension of the selected NACC dataframe is:", nacc_df_selected.shape, "with", nacc_df_selected['id'].nunique(), "unique participants.")
nacc_df_selected['status'] = nacc_df_selected['status'].map({1:'HC' , 3:'MCI', 4:'AD'})
nacc_df_selected = nacc_df_selected[nacc_df_selected['status'].isin(['HC', 'MCI', 'AD'])]
nacc_df_selected['sex'] = nacc_df_selected['sex'].map({1:0 , 2:1})
# 1= e3, e3; 2= e3, e4; 3= e3, e2; 4= e4, e4; 5= e4, e2; 6= e2, e2, 9 =missing
mapping_APOE4 = {
    2: 1,  # e3/e4
    4: 1,  # e4/e4
    5: 1,  # e4/e2
    1: 0,  # e3/e3
    3: 0,  # e3/e2
    6: 0,  # e2/e2
    9: np.nan  # missing
}
nacc_df_selected["APOE4"] = nacc_df_selected["APOE4"].map(mapping_APOE4)
print("The dimension of the selected NACC dataframe after drop unrelated with AD is:", nacc_df_selected.shape, "with", nacc_df_selected['id'].nunique(), "unique participants.")
nacc_df_selected = med_utils.clean_status_and_demos(nacc_df_selected)
nacc_df_selected = nacc_df_selected.dropna(subset=['status'])
nacc_df_selected = nacc_df_selected[~(nacc_df_selected['MMSE'].isna() & nacc_df_selected['CDR'].isna())]
nacc_df_selected = nacc_df_selected.reset_index(drop=True)
print("The dimension of the selected NACC dataframe is:", nacc_df_selected.shape, "with", nacc_df_selected['id'].nunique(), "unique participants.")
med_df=nacc_df_selected[medication_columns]
selected_columns = nacc_df_selected.columns.tolist()

The dimension of the selected NACC dataframe is: (184773, 50) with 51486 unique participants.
The dimension of the selected NACC dataframe after drop unrelated with AD is: (176560, 50) with 50431 unique participants.
Filled 0 missing status records; columns forward/back filled: ['sex', 'edu', 'APOE4']
The dimension of the selected NACC dataframe is: (174587, 50) with 50184 unique participants.


### angiotensin-converting enzyme inhibitors (ACEi)

In [9]:
acei_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='ACEi')
acei_names = list(set(acei_files['Drug_name'].dropna()).union(acei_files['Brand_name'].dropna()))

In [10]:
acei_matched_values = med_utils.get_matched_values(med_df, acei_names)

In [11]:
acei_matched_values

['FOSINOPRIL',
 'HYDROCHLOROTHIAZIDE-QUINAPRIL',
 'TRANDOLAPRIL-VERAPAMIL',
 'QUINAPRIL',
 'ENALAPRIL-FELODIPINE',
 'RAMIPRIL',
 'ENALAPRIL',
 'AMLODIPINE-BENAZEPRIL',
 'TRANDOLAPRIL',
 'PERINDOPRIL',
 'BENAZEPRIL',
 'DILTIAZEM-ENALAPRIL',
 'FOSINOPRIL-HYDROCHLOROTHIAZIDE',
 'LISINOPRIL',
 'HYDROCHLOROTHIAZIDE-LISINOPRIL',
 'BENAZEPRIL-HYDROCHLOROTHIAZIDE',
 'ENALAPRIL-HYDROCHLOROTHIAZIDE',
 'CAPTOPRIL-HYDROCHLOROTHIAZIDE',
 'MOEXIPRIL',
 'CAPTOPRIL',
 'HYDROCHLOROTHIAZIDE-MOEXIPRIL']

In [12]:
# values_to_drop_aceinames = {}
# acei_matched_values = [x for x in acei_matched_values if x not in values_to_drop_aceinames]

In [13]:
acei_all = med_utils.medication_all_timepoints(
    nacc_df_selected, acei_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# acei_all.to_csv("../preprocessed_data/NACC/acei_all_results_NACC_missing.csv",index=False)

### angiotensin receptor blockers (ARBs)

In [14]:
arb_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='ARB')
arb_names = list(set(arb_files['Drug_name'].dropna()).union(arb_files['Brand_name'].dropna()))

In [15]:
arb_matched_values = med_utils.get_matched_values(med_df, arb_names)

In [16]:
arb_matched_values

['AMLODIPINE-OLMESARTAN',
 'HYDROCHLOROTHIAZIDE-LOSARTAN',
 'CANDESARTAN-HYDROCHLOROTHIAZIDE',
 'CANDESARTAN',
 'AZILSARTAN',
 'AMLODIPINE-TELMISARTAN',
 'AMLODIPINE-VALSARTAN',
 'OLMESARTAN',
 'AZILSARTAN-CHLORTHALIDONE',
 'HYDROCHLOROTHIAZIDE-IRBESARTAN',
 'SACUBITRIL-VALSARTAN',
 'LOSARTAN',
 'HYDROCHLOROTHIAZIDE-OLMESARTAN',
 'HYDROCHLOROTHIAZIDE-TELMISARTAN',
 'AMLODIPINE/HYDROCHLOROTHIAZIDE/OLMESARTAN',
 'EPROSARTAN-HYDROCHLOROTHIAZIDE',
 'VALSARTAN',
 'EPROSARTAN',
 'HYDROCHLOROTHIAZIDE-VALSARTAN',
 'ALISKIREN-VALSARTAN',
 'AMLODIPINE/HYDROCHLOROTHIAZIDE/VALSARTAN',
 'IRBESARTAN',
 'TELMISARTAN']

In [17]:
# values_to_drop_arbnames = {}
# arb_matched_values = [x for x in arb_matched_values if x not in values_to_drop_arbnames]

In [18]:
arb_all = med_utils.medication_all_timepoints(
    nacc_df_selected, arb_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# arb_all.to_csv("../preprocessed_data/NACC/arb_all_results_NACC_missing.csv",index=False)

### Beta Blocker

In [19]:
beta_blockers_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='beta_blockers')
beta_blockers_names = list(set(beta_blockers_files['Drug_name'].dropna()).union(beta_blockers_files['Brand_name'].dropna()))

In [20]:
beta_blockers_matched_values = med_utils.get_matched_values(med_df, beta_blockers_names)

In [21]:
beta_blockers_matched_values

['DORZOLAMIDE-TIMOLOL OPHTHALMIC',
 'LATANOPROST-TIMOLOL OPHTHALMIC',
 'HYDROCHLOROTHIAZIDE-TIMOLOL',
 'BENDROFLUMETHIAZIDE-NADOLOL',
 'BETAXOLOL OPHTHALMIC',
 'LABETALOL',
 'BETAXOLOL',
 'ATENOLOL',
 'CARVEDILOL',
 'NEBIVOLOL',
 'HYDROCHLOROTHIAZIDE-PROPRANOLOL',
 'BRIMONIDINE-TIMOLOL OPHTHALMIC',
 'SOTALOL',
 'BISOPROLOL',
 'NADOLOL',
 'LEVOBETAXOLOL OPHTHALMIC',
 'BYSTOLIC',
 'PINDOLOL',
 'BRIMONIDINE/DORZOLAMIDE/TIMOLOL OPHTH',
 'DORZOLAMIDE/LATANOPROST/TIMOLOL OPHTHALMIC',
 'BISOPROLOL-HYDROCHLOROTHIAZIDE',
 'HYDROCHLOROTHIAZIDE-METOPROLOL',
 'TIMOLOL',
 'ACEBUTOLOL',
 'PROPRANOLOL',
 'TIMOLOL OPHTHALMIC',
 'METOPROLOL']

In [22]:
# values_to_drop_BetaBlknames = {}
# beta_blockers_matched_values = [x for x in beta_blockers_matched_values if x not in values_to_drop_BetaBlknames]

In [23]:
beta_blockers_all = med_utils.medication_all_timepoints(
    nacc_df_selected, beta_blockers_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# beta_blockers_all.to_csv("../preprocessed_data/NACC/beta_blockers_all_results_NACC_missing.csv",index=False)

### Calcium Channel Blockers

In [24]:
Ca_channel_blockers_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='Ca_channel_blockers')
Ca_channel_blockers_names = list(set(Ca_channel_blockers_files['Drug_name'].dropna()).union(Ca_channel_blockers_files['Brand_name'].dropna()))

In [25]:
Ca_channel_blockers_matched_values = med_utils.get_matched_values(med_df, Ca_channel_blockers_names)

In [26]:
Ca_channel_blockers_matched_values

['NICARDIPINE',
 'AMLODIPINE-OLMESARTAN',
 'AMLODIPINE-TELMISARTAN',
 'AMLODIPINE',
 'TRANDOLAPRIL-VERAPAMIL',
 'AMLODIPINE-VALSARTAN',
 'ENALAPRIL-FELODIPINE',
 'ISRADIPINE',
 'AMLODIPINE-BENAZEPRIL',
 'VERAPAMIL',
 'AMLODIPINE/HYDROCHLOROTHIAZIDE/OLMESARTAN',
 'AMLODIPINE-ATORVASTATIN',
 'BEPRIDIL',
 'NIFEDIPINE',
 'FELODIPINE',
 'AMLODIPINE/HYDROCHLOROTHIAZIDE/VALSARTAN',
 'ALISKIREN-AMLODIPINE',
 'NISOLDIPINE',
 'DILTIAZEM']

In [27]:
# values_to_drop_ccbnames = {}
# Ca_channel_blockers_matched_values = [x for x in Ca_channel_blockers_matched_values if x not in values_to_drop_ccbnames]

In [28]:
Ca_channel_blockers_all = med_utils.medication_all_timepoints(
    nacc_df_selected, Ca_channel_blockers_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# Ca_channel_blockers_all.to_csv("../preprocessed_data/NACC/Ca_channel_blockers_all_results_NACC_missing.csv",index=False)

### Diuretics

In [29]:
diuretics_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='diuretics')
diuretics_names = list(set(diuretics_files['Drug_name'].dropna()).union(diuretics_files['Brand_name'].dropna()))

In [30]:
diuretics_matched_values = med_utils.get_matched_values(med_df, diuretics_names)

In [31]:
diuretics_matched_values

['CHLORTHALIDONE',
 'INDAPAMIDE',
 'BUMETANIDE',
 'HYDROCHLOROTHIAZIDE-TIMOLOL',
 'CHLOROTHIAZIDE-METHYLDOPA',
 'FUROSEMIDE',
 'HYDROCHLOROTHIAZIDE-RESERPINE',
 'HYDROCHLOROTHIAZIDE-LOSARTAN',
 'CANDESARTAN-HYDROCHLOROTHIAZIDE',
 'BENDROFLUMETHIAZIDE-NADOLOL',
 'HYDRALAZINE/HYDROCHLOROTHIAZIDE/RESERPINE',
 'HYDROCHLOROTHIAZIDE-QUINAPRIL',
 'MANNITOL',
 'HYDROCHLOROTHIAZIDE-PROPRANOLOL',
 'HYDROCHLOROTHIAZIDE-TRIAMTERENE',
 'TORSEMIDE',
 'METHAZOLAMIDE',
 'POLYTHIAZIDE-RESERPINE',
 'NADOLOL',
 'CHLOROTHIAZIDE-RESERPINE',
 'AZILSARTAN-CHLORTHALIDONE',
 'BENDROFLUMETHIAZIDE',
 'HYDROCHLOROTHIAZIDE-IRBESARTAN',
 'ACETAZOLAMIDE',
 'HYDROCHLOROTHIAZIDE-OLMESARTAN',
 'HYDROCHLOROTHIAZIDE-TELMISARTAN',
 'HYDROCHLOROTHIAZIDE-SPIRONOLACTONE',
 'AMLODIPINE/HYDROCHLOROTHIAZIDE/OLMESARTAN',
 'EPROSARTAN-HYDROCHLOROTHIAZIDE',
 'HYDROCHLOROTHIAZIDE-METHYLDOPA',
 'METOLAZONE',
 'FOSINOPRIL-HYDROCHLOROTHIAZIDE',
 'BISOPROLOL-HYDROCHLOROTHIAZIDE',
 'SPIRONOLACTONE',
 'HYDROCHLOROTHIAZIDE-METOPROLOL',
 '

In [32]:
# values_to_drop_diureticsnames = {}
# diuretics_matched_values = [x for x in diuretics_matched_values if x not in values_to_drop_diureticsnames]

In [33]:
diuretics_all = med_utils.medication_all_timepoints(
    nacc_df_selected, diuretics_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# diuretics_all.to_csv("../preprocessed_data/NACC/diuretics_all_results_NACC_missing.csv",index=False)

### statins 

In [34]:
statin_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='statin')
statin_names = list(set(statin_files['Drug_name'].dropna()).union(statin_files['Brand_name'].dropna()))

In [35]:
statin_matched_values = med_utils.get_matched_values(med_df, statin_names)

In [36]:
statin_matched_values

['LOVASTATIN-NIACIN',
 'SIMVASTATIN',
 'NIACIN-SIMVASTATIN',
 'LOVASTATIN',
 'CERIVASTATIN',
 'AMLODIPINE-ATORVASTATIN',
 'ASPIRIN-PRAVASTATIN',
 'PRAVASTATIN',
 'SIMVASTATIN-SITAGLIPTIN',
 'ATORVASTATIN',
 'FLUVASTATIN',
 'ATORVASTATIN-EZETIMIBE',
 'PITAVASTATIN',
 'ROSUVASTATIN',
 'EZETIMIBE-SIMVASTATIN']

In [37]:
# values_to_drop_statinnames = {}
# statin_matched_values = [x for x in statin_matched_values if x not in values_to_drop_statinnames]

In [38]:
statin_all = med_utils.medication_all_timepoints(
    nacc_df_selected, statin_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# statin_all.to_csv("../preprocessed_data/NACC/statin_all_results_NACC_missing.csv",index=False)

### metformin

In [39]:
metformin_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='metformin')
metformin_names = list(set(metformin_files['Drug_name'].dropna()).union(metformin_files['Brand_name'].dropna()))

In [40]:
metformin_matched_values = med_utils.get_matched_values(med_df, metformin_names)

In [41]:
metformin_matched_values

['EMPAGLIFLOZIN/LINAGLIPTIN/METFORMIN',
 'METFORMIN-REPAGLINIDE',
 'METFORMIN-SAXAGLIPTIN',
 'GLIPIZIDE-METFORMIN',
 'METFORMIN-ROSIGLITAZONE',
 'GLYBURIDE-METFORMIN',
 'METFORMIN-PIOGLITAZONE',
 'ALOGLIPTIN-METFORMIN',
 'DAPAGLIFLOZIN-METFORMIN',
 'CANAGLIFLOZIN-METFORMIN',
 'METFORMIN-SITAGLIPTIN',
 'LINAGLIPTIN-METFORMIN',
 'METFORMIN',
 'EMPAGLIFLOZIN-METFORMIN']

In [42]:
# values_to_drop_metforminnames = {}
# metformin_matched_values = [x for x in metformin_matched_values if x not in values_to_drop_metforminnames]

In [43]:
metformin_all = med_utils.medication_all_timepoints(
    nacc_df_selected, metformin_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# metformin_all.to_csv("../preprocessed_data/NACC/metformin_all_results_NACC_missing.csv",index=False)

### control group
never have any of the 5 type of medications

In [44]:
drug_lists = [
    acei_matched_values,
    arb_matched_values,
    beta_blockers_matched_values,
    Ca_channel_blockers_matched_values,
    diuretics_matched_values,
    statin_matched_values,
    metformin_matched_values
]
all_target_drugs = set().union(*drug_lists)

In [45]:
row_has_target_drug = (
    nacc_df_selected[medication_columns]
      .applymap(lambda x: med_utils.contains_any_medication_type(x, all_target_drugs))
      .any(axis=1)
)
ids_with_any_target_drug = nacc_df_selected.loc[row_has_target_drug, 'id'].unique()
controls_df = nacc_df_selected[~nacc_df_selected['id'].isin(ids_with_any_target_drug)].copy()


In [46]:
controls_df["Has_Medication_This_Visit"] = False
controls_df = controls_df.drop(columns=[c for c in medication_columns if c in controls_df.columns])
# controls_df.to_csv("../preprocessed_data/NACC/controls_df_NACC_missing.csv",index=False)

### Adjust for the later modelling in R

In [47]:
datasets = {
    "ACEi": acei_all,
    "ARB": arb_all,
    "BetaBlk": beta_blockers_all,
    "CCB": Ca_channel_blockers_all,
    "Diuretic": diuretics_all,
    "Statin": statin_all,
    "Metformin": metformin_all,
    "Control": controls_df  
}

In [48]:
merged_df = med_utils.merge_medication_longitudinal(datasets, control_key="Control", extra_cols=["visit_no", "MMSE", "CDR"])

In [49]:
print(f"Initial rows: {len(merged_df)}")
merged_clean = merged_df.dropna(subset=['status'])
merged_clean = merged_clean[~(merged_clean['MMSE'].isna() & merged_clean['CDR'].isna())]
print(f"After dropping missings: {len(merged_clean)}")
merged_clean = merged_clean[merged_clean.groupby('id')['visit_no'].transform('nunique') > 1]
print(f"After removing number of records: {len(merged_clean)}")

Initial rows: 135338
After dropping missings: 135338
After removing number of records: 118567


In [50]:
print(merged_clean["visit_date"].isna().sum())

0


In [51]:
merged_clean['visit_date'] = pd.to_datetime(merged_clean['visit_date'])
merged_clean['months_since_baseline'] = (
    merged_clean.groupby('id')['visit_date']
      .transform(lambda x: (x - x.min()) / pd.Timedelta(days=30.4375))
      .round()
    .astype(int)
)

In [52]:
merged_clean = med_utils.clean_and_filter_participants(
    merged_clean,
    id_col='id',
    class_col ='status',
    time_col = 'visit_no',
    medication_ever_col=None,
    min_visits=2,
    if_print=True
)

Remaining participants: 28,440
Dropped participants:   0


In [53]:
merged_clean[merged_clean.duplicated(subset=['id', 'visit_no'], keep=False)].sort_values(['id','visit_no'])

Unnamed: 0,id,visit_date,status,age,sex,edu,APOE4,ACEi,ARB,BetaBlk,CCB,Diuretic,Statin,Metformin,Total_Meds,visit_no,MMSE,CDR,months_since_baseline


In [54]:
merged_clean = med_utils.clean_longitudinal(merged_clean)

In [55]:
max_tab = merged_clean.groupby('id')['Total_Meds'].max().value_counts()
print("\nDistribution of max Total_Meds per participant:\n", max_tab)



Distribution of max Total_Meds per participant:
 Total_Meds
0    7075
1    6891
2    6002
3    4665
4    2685
5     940
6     174
7       8
Name: count, dtype: int64


In [56]:
merged_clean.to_csv("../preprocessed_data/NACC/NACC_merge_full_missing.csv",index=False)

In [57]:
merged_clean

Unnamed: 0,id,visit_date,status,age,sex,edu,APOE4,ACEi,ARB,BetaBlk,CCB,Diuretic,Statin,Metformin,Total_Meds,visit_no,MMSE,CDR,months_since_baseline
0,11,2006-04-17,MCI,62.21,1,16.0,0.0,False,False,False,False,False,False,False,0,1,30.0,0.5,0
1,11,2007-06-18,MCI,63.38,1,16.0,0.0,False,False,False,False,False,False,False,0,2,29.0,0.5,14
2,11,2008-06-03,MCI,64.34,1,16.0,0.0,False,False,False,False,False,False,False,0,3,30.0,0.5,26
3,34,2015-07-16,MCI,79.00,1,15.0,1.0,True,False,True,False,False,False,False,2,1,,0.5,0
4,34,2016-11-01,MCI,81.00,1,15.0,1.0,True,False,True,False,False,False,False,2,2,,1.5,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118562,999854,2021-07-07,HC,94.77,1,20.0,0.0,False,True,False,True,False,False,False,2,6,,0.0,156
118563,999872,2006-10-20,HC,71.72,1,14.0,1.0,False,False,False,True,False,False,False,1,1,29.0,0.5,0
118564,999872,2010-07-01,HC,75.41,1,14.0,1.0,False,False,False,True,True,False,False,2,2,30.0,0.0,44
118565,999872,2013-10-24,HC,78.73,1,14.0,1.0,False,False,False,True,False,False,False,1,3,29.0,0.0,84


In [58]:
# merged_clean[merged_clean['id']== 795843]
merged_clean = merged_clean[~merged_clean['id'].isin([347211, 795843])]

In [59]:
summary_df = med_utils.baseline_summary(merged_clean)
summary_df

Unnamed: 0,Measure,Value
0,Age at baseline (year),71.8 ± 9.9
1,Gender (Female),16225 (57.1%)
2,Education (year),15.6 ± 3.1
3,APOE4 (YES)**,9815 (34.5%)
4,Visits (Record),3.0 [2.0–5.0]
5,Follow up intervals (Month),2.0 – 221.0
6,CU at baseline,13888 (48.8%)
7,MCI at baseline,6435 (22.6%)
8,AD at baseline,8115 (28.5%)
9,Average medication taken at baseline,1.3 ± 1.3
