In [1]:
import pandas as pd
import importlib
import re
import med_utils
# Re-import med_utils so edits made to the helper module are picked up
# without restarting the kernel.
importlib.reload(med_utils)

import warnings
# Suppress every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
### Clinical
# Read the clinical release file, then pass it through the shared cleaning helper
# (which, per its printed log, replaces sentinel missing-value codes).
clinic_df = pd.read_csv(
    "../raw_datasets/HABSHD/HD_Release_6_Clinical_FINAL.csv",
    low_memory=False,
)
clinic_df_clean = med_utils.clean_dataframe_advanced(
    clinic_df,
    missing_values=None,
    threshold=1,
    drop_rows_threshold=None,
    verbose=True,
)


Original shape: (5962, 1672)
Missing values replaced: [-777777, -777777.0, -9999.0, -9999, -999, -999.0, -8888, '-9999']
Columns dropped: 0 ([])
Final shape: (5962, 1672)


In [3]:
### APOE4
# Read the genomics release file and clean it with the same helper and settings
# used for the clinical table.
apoe_df = pd.read_csv(
    "../raw_datasets/HABSHD/HD_Release_6_Genomics_FINAL.csv",
    low_memory=False,
)
apoe_df_clean = med_utils.clean_dataframe_advanced(
    apoe_df,
    missing_values=None,
    threshold=1,
    drop_rows_threshold=None,
    verbose=True,
)
# apoe_df_clean = apoe_df_clean.drop(['APOE4_rs429358', 'APOE4_rs7412', 'APOE4_Genotype'], axis=1)

Original shape: (3343, 10)
Missing values replaced: [-777777, -777777.0, -9999.0, -9999, -999, -999.0, -8888, '-9999']
Columns dropped: 0 ([])
Final shape: (3343, 10)


In [4]:
# Outer-merge the cleaned clinical and genomics tables on Visit_ID and Med_ID.
# Columns present in both tables are resolved by the helper ('auto' mode).
habs_df = med_utils.merge_dataframes(
    [clinic_df_clean, apoe_df_clean],
    keys=['Visit_ID', 'Med_ID'],
    how='outer',
    handle_duplicates='auto',
    df_names=['clinic', 'apoe'],
    verbose=True,
)

Column conflicts detected:
  'Age' appears in: ['clinic', 'apoe']
  'Ethnicity' appears in: ['clinic', 'apoe']
  'ID_Gender' appears in: ['clinic', 'apoe']
  'ID_Education' appears in: ['clinic', 'apoe']

Analyzing 4 column conflicts...
  Kept 'Age' from first DataFrame (values identical)
  Kept 'Ethnicity' from first DataFrame (values identical)
  Kept 'ID_Gender' from first DataFrame (values identical)
  Kept 'ID_Education' from first DataFrame (values identical)

Merge completed:
Final DataFrame shape: (5962, 1676)
Merge key(s): ['Visit_ID', 'Med_ID']
Merge type: outer

Merge steps:
  Step 1: Added apoe (5962 -> 5962 rows, +4 columns)


In [5]:
# Medication name columns are those whose name starts with Medication_<digits>_Name.
medication_columns = [
    column
    for column in habs_df.columns
    if re.match(r"Medication_\d+_Name", column)
]
# Rename/select columns according to the naming-convention workbook, keeping the
# medication name columns as extras.
habs_df_selected = med_utils.rename_and_select_columns(
    habs_df,
    "../raw_datasets/naming_convension.xlsx",
    "HABS_HD",
    "New_Name",
    extra_cols=medication_columns,
)

In [6]:
# Sex coding (original author's note): 0 = male, 1 = female.

# Numeric diagnosis codes -> labels. Values that are not keys (including labels
# produced by an earlier run of this cell) pass through unchanged, so re-running
# the cell does not turn every status into NaN. Anything other than HC/MCI/AD is
# removed by the isin filter below, exactly as before.
status_labels = {0: 'HC', 1: 'MCI', 2: 'AD', 9: 'Undetermine'}

print("The dimension of the selected HABS dataframe is:", habs_df_selected.shape, "with", habs_df_selected['id'].nunique(), "unique participants.")

# assign() returns a new frame instead of writing a column into the helper's output.
habs_df_selected = habs_df_selected.assign(
    status=habs_df_selected['status'].map(lambda code: status_labels.get(code, code))
)
habs_df_selected = habs_df_selected[habs_df_selected['status'].isin(['HC', 'MCI', 'AD'])]
print("The dimension of the selected HABS dataframe after drop unrelated with AD is:", habs_df_selected.shape, "with", habs_df_selected['id'].nunique(), "unique participants.")

# Helper-side cleaning of status and demographics (its log reports forward/back
# filling of sex, edu and APOE4).
habs_df_selected = med_utils.clean_status_and_demos(habs_df_selected)

# Require a status and at least one of MMSE / CDR on every retained row.
habs_df_selected = habs_df_selected.dropna(subset=['status'])
habs_df_selected = habs_df_selected[~(habs_df_selected['MMSE'].isna() & habs_df_selected['CDR'].isna())]
habs_df_selected = habs_df_selected.reset_index(drop=True)
# This is the HABS cohort; the label previously read "NACC".
print("The dimension of the selected HABS dataframe is:", habs_df_selected.shape, "with", habs_df_selected['id'].nunique(), "unique participants.")

# Inputs reused by every medication section below.
med_df = habs_df_selected[medication_columns]
selected_columns = habs_df_selected.columns.tolist()

The dimension of the selected HABS dataframe is: (5962, 35) with 3840 unique participants.
The dimension of the selected HABS dataframe after drop unrelated with AD is: (5944, 35) with 3828 unique participants.
Filled 0 missing status records; columns forward/back filled: ['sex', 'edu', 'APOE4']
The dimension of the selected NACC dataframe is: (5944, 35) with 3828 unique participants.


### angiotensin-converting enzyme inhibitors (ACEi)

In [7]:
# Pool the generic and brand names from the ACEi sheet into one de-duplicated list.
acei_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='ACEi',
)
acei_names = list(
    set(acei_files['Drug_name'].dropna())
    .union(acei_files['Brand_name'].dropna())
)

In [8]:
# Raw medication strings in med_df that the helper associates with the ACEi names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
acei_matched_values = med_utils.get_matched_values(med_df, acei_names)

In [9]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_aceinames.
acei_matched_values

['Enalapril HCTZ',
 'lisinopril 20/12.5 mg',
 'Enalapril maleate',
 'Amlodipine/Benazepril 10-20mg',
 'Lisinioril',
 'ENALAPRIL MALEATE',
 'Enalapril ',
 'lisinopril hydrochlorothiazide',
 'Ramipril',
 'Zestril (lisinopril)',
 'Benezepril',
 'Lisinporil',
 'Benazapril',
 'Benazepril',
 'lotensin',
 'Lysinopril',
 'EPA',
 'Altace',
 'ala',
 'Amlodipine - Benazepril',
 'LISINIPRIL',
 'Zestril',
 'Lisinopri',
 'Lisinopril - BP medication',
 'Lisiuopril',
 'prinivil',
 'Ramipril HBP',
 'Lisinopril Hydrochlorothyazide',
 'Benazepril ',
 'Lisinoptil',
 'Hydrochlorothiazide Quinapril',
 'D',
 'ENALAPRIL',
 'Ace',
 'Lesinopril',
 'lisinopril/HCTZ',
 'Benazepril HCTZ',
 'Lisinopril/ HCTZ 25mg',
 'Lisonopril',
 'LISINOPRIL-HCTZ',
 'AP',
 'lisinopril',
 'Benaz',
 'Amlodipine/Benazepril',
 'Lisinopril 20-12.5 mg HCTZ',
 'Benazepril HCL',
 'zestril',
 'lisinopril-HCTZ 20-12.5mg',
 'LISINOPRIL HCTZ',
 'Benazepril/Hctz',
 'Benazerpril',
 'Benazepril Hydrochloride',
 'Lisinopril HCTZ 20-25mg',
 'E',
 

In [10]:
# Matched strings to exclude from the ACEi list (exact, case-sensitive comparison).
values_to_drop_aceinames = {
    'ALA', 'AP', 'Ace', 'C', 'Ca', 'D', 'E', 'EPA', 'Fe', 'ala',
}
acei_matched_values = [
    matched
    for matched in acei_matched_values
    if matched not in values_to_drop_aceinames
]

In [11]:
# Build the ACEi table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
acei_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    acei_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# acei_all.to_csv("../preprocessed_data/HABSHD/acei_all_results_HABS_missing.csv",index=False)

### angiotensin receptor blockers (ARBs)

In [12]:
# Pool the generic and brand names from the ARB sheet into one de-duplicated list.
arb_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='ARB',
)
arb_names = list(
    set(arb_files['Drug_name'].dropna())
    .union(arb_files['Brand_name'].dropna())
)

In [13]:
# Raw medication strings in med_df that the helper associates with the ARB names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
arb_matched_values = med_utils.get_matched_values(med_df, arb_names)

In [14]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_arbnames.
arb_matched_values

['Irbesartan (Oral Pill)',
 'Telmisartan HCTZ',
 'Valsartan  Hydrochlorothiazide',
 'Losartan Tab',
 'Travaprost Z',
 'telmisartan-hydrochlorothiazide',
 'Olmesartan Mexodomil',
 'Losartan/ HCTZ',
 'Diovan Valstartan',
 'Losaratan',
 'Losartan-HCTR',
 'Losartan HCTZ',
 'Losartan HCL',
 'Losartan POTASSIUM',
 'Azilsartan medoxomil',
 'Olmesartan Medixomil',
 'Telmisartan 80 mg hydrochlorothiazide 25 mg',
 'Losrtan',
 'Losartan Potasium',
 'Amlodipine/ Olmesartan',
 'amlodipine/valsartan',
 'valsrtan',
 'Losartan Potasico',
 'Losartan-HCTZ',
 'VALSARTAN',
 'LOSARTAN',
 'Micardis Telmisartan',
 'Olmesartan Hctz 20-12.5',
 'losartanhydrochlorothiazide ',
 'Valsartan HCTZ',
 'Valsartan-HCTZ',
 'LOSARTAN/HCTZ',
 'Candesartan Cilexetil',
 'Losartan  HCTZ 50',
 'Valsartan/HCTZ ',
 'Irbesartan HCL ',
 'Telmisartan',
 'Losartan POT',
 'Diovan',
 'Valsartan/HCTZ',
 'D',
 'Olmesartan HcTz 40-25 mg',
 'Olmersatan',
 'Losartan/Hctz',
 'Losartan Pot',
 'diovan',
 'losartan htc',
 'Olmesartan Medoxomi

In [15]:
# Matched strings to exclude from the ARB list (exact, case-sensitive comparison).
values_to_drop_arbnames = {
    'AP', 'ASA', 'C', 'Ca', 'D', 'E', 'Travaprost', 'Travaprost Z', 'asa',
}
arb_matched_values = [
    matched
    for matched in arb_matched_values
    if matched not in values_to_drop_arbnames
]

In [16]:
# Build the ARB table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
arb_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    arb_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# arb_all.to_csv("../preprocessed_data/HABSHD/arb_all_results_HABS_missing.csv",index=False)

### Beta Blocker

In [17]:
# Pool the generic and brand names from the beta_blockers sheet into one de-duplicated list.
beta_blockers_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='beta_blockers',
)
beta_blockers_names = list(
    set(beta_blockers_files['Drug_name'].dropna())
    .union(beta_blockers_files['Brand_name'].dropna())
)

In [18]:
# Raw medication strings in med_df that the helper associates with the beta-blocker names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
beta_blockers_matched_values = med_utils.get_matched_values(med_df, beta_blockers_names)

In [19]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_BetaBlknames.
beta_blockers_matched_values

['carvedilol',
 'Metoprolol Succinate ER',
 'Caruedilol',
 'Bisoprolo',
 'nebivolol',
 'Metoprolol Succinate',
 'Metoprolol succ ER',
 'Metoprolol- Tartrate',
 'metoprolol ER',
 'Labetalol',
 'Bisoprolol fumarate',
 'atenelol',
 'Atenolol ',
 'Metoprolol tart',
 'METOPROLOL',
 'Propranolol',
 'Metoprolol Tartarate',
 'sotalol',
 'Timolol Hemihydrate ',
 'Troprol Xl',
 'TIMOLOL MALEATE',
 'Dorzolamide Hydrochloride and Timolol Maleate 0.5%',
 'bisoprolol fumarate',
 'Metaprolol',
 'Metoprolol Succ',
 'Metoprolol Tartrate ',
 'Metoprolol  Succ ER',
 'Ziac/ Bisoprolol / HCTZ',
 'DORZOLAMIDE/TIMOLOL',
 'Sotalol',
 'Sotalol AF',
 'CarvediloL',
 'Metoprolol ER',
 'Bisprolol',
 'Bisoprolol-HCTZ 5',
 'Metoprolol/Tartrate',
 'Propranonol',
 'Atenolol',
 'metolprolol',
 'Metoprolol tartrate',
 'metoprolol Succinate ER',
 'Co Spot (dorzolamide/timolol)',
 'Timolol Hemihydrate 0.25%',
 'Timolol',
 'Bisoprolol/HCTZ',
 'Metopolol',
 'Dorzolamide Timolol',
 'metoprolol tartrate',
 'D',
 'Metoprolol S

In [20]:
# Matched strings to exclude from the beta-blocker list (exact, case-sensitive comparison).
values_to_drop_BetaBlknames = {
    'AP', 'Ace', 'C', 'Ca', 'D', 'Dorzolamide CITimolol', 'E',
}
beta_blockers_matched_values = [
    matched
    for matched in beta_blockers_matched_values
    if matched not in values_to_drop_BetaBlknames
]

In [21]:
# Build the beta-blocker table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
beta_blockers_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    beta_blockers_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# beta_blockers_all.to_csv("../preprocessed_data/HABSHD/beta_blockers_all_results_HABS_missing.csv",index=False)

### Calcium Channel Blockers

In [22]:
# Pool the generic and brand names from the Ca_channel_blockers sheet into one de-duplicated list.
Ca_channel_blockers_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='Ca_channel_blockers',
)
Ca_channel_blockers_names = list(
    set(Ca_channel_blockers_files['Drug_name'].dropna())
    .union(Ca_channel_blockers_files['Brand_name'].dropna())
)

In [23]:
# Raw medication strings in med_df that the helper associates with the calcium-channel-blocker names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
Ca_channel_blockers_matched_values = med_utils.get_matched_values(med_df, Ca_channel_blockers_names)

In [24]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_CCBnames.
Ca_channel_blockers_matched_values

['Amlodepine',
 'amalodipine',
 'nifedipine ER',
 'verapamil',
 'Amlodipine/Benazepril 10-20mg',
 'amlodipjne',
 'Norvac',
 'Norvasc',
 'Nifedipine Er',
 'Verapamil',
 'Nifedipine XL',
 'AMLODIPINE',
 'Amlodipine Besytlate',
 'AMLOD',
 'NORVASC',
 'Norvasc generic',
 'Felodipine',
 'Nifedapine',
 'Amlodipine/ Olmesartan',
 'Amlodipine Bestlate',
 'verapamil ER',
 'amlodipine/valsartan',
 'Amlodipine Besylate (Oral Pill)',
 'ala',
 'Amlodipine - Benazepril',
 'Nifedipine',
 'Amlodipine besylate  ',
 'Amlodipine besylarte',
 'Amlodipene',
 'Nifedipine (Extended Release)',
 'Amlodipine tab',
 'D',
 'Amlodipine/Besylate',
 'felodipine ER',
 'Amlodipine-besylate',
 'Verapamil ER',
 'norvasc',
 'AP',
 'AMLODIPINE-OLMESARTAN',
 'amlodepine',
 'Amlodipine/Benazepril',
 'amlodipine besylate',
 'Amdlodipine',
 'Nifedipine ER',
 'diltiazem',
 'Telmisartan Amlodipine',
 'Calcium',
 'Verapamil HCL',
 'AMLODIPINE BESYLATE',
 'NIFEdipine',
 'Amilodipine',
 'Amlodopine',
 'amlodipine Besylate',
 'Olme

In [25]:
# Matched strings to exclude from the calcium-channel-blocker list
# (exact, case-sensitive comparison; note the entry with a trailing space).
values_to_drop_CCBnames = {
    'ALA', 'AP', 'C', 'CALCIUM', 'Ca', 'Calcium', 'Calcium ', 'Calcium +',
    'D', 'E', 'Fe', 'NAC', 'ala', 'calcium',
}
Ca_channel_blockers_matched_values = [
    matched
    for matched in Ca_channel_blockers_matched_values
    if matched not in values_to_drop_CCBnames
]

In [26]:
# Build the calcium-channel-blocker table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
Ca_channel_blockers_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    Ca_channel_blockers_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Ca_channel_blockers_all.to_csv("../preprocessed_data/HABSHD/Ca_channel_blockers_all_results_HABS_missing.csv",index=False)

### Diuretics

In [27]:
# Pool the generic and brand names from the diuretics sheet into one de-duplicated list.
diuretics_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='diuretics',
)
diuretics_names = list(
    set(diuretics_files['Drug_name'].dropna())
    .union(diuretics_files['Brand_name'].dropna())
)

In [28]:
# Raw medication strings in med_df that the helper associates with the diuretic names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
diuretics_matched_values = med_utils.get_matched_values(med_df, diuretics_names)

In [29]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_diureticsnames.
diuretics_matched_values

['spironolactone',
 'Valsartan  Hydrochlorothiazide',
 'telmisartan-hydrochlorothiazide',
 'Spinolactone',
 'Furosemide Lasix',
 'Dyazide triamterene hydrochlorothiazide\t',
 'FUROSEMIDE',
 'HYDROCHLOROTHIAZIDE',
 'lasix',
 'Bumetanide',
 'Hydrocholorothiazide',
 'Spironolacto',
 'lisinopril hydrochlorothiazide',
 'Telmisartan 80 mg hydrochlorothiazide 25 mg',
 'Lasix ',
 'potassium ',
 'Furosemide',
 'Spironolcatone',
 'Furisemide',
 'Hydrochlorothiazide DE',
 'Eplerenone',
 'Chlorthlidone',
 'hydrochlorithiazide',
 'Acetazolamide',
 'Hydochlorothiazide',
 'hydroCHLOROthiazide',
 'Hydrochlorothiazied',
 'Potassium ',
 'Hydrochlorothizide',
 'losartanhydrochlorothiazide ',
 'Hydrocholorothizide',
 'EPLERENONE',
 'Torsemide',
 'Hydrochlorothiazide Quinapril',
 'triamterene',
 'Sprionolactone',
 'indapamide',
 'D',
 'chlorthalidone',
 'Triametrene',
 'Hydroclorothiazide',
 'Ace',
 'Valsartan Hydrochlorothiazide',
 'Hydrochlorothiazide metoprolol tartrate',
 'AP',
 'bumetanide',
 'Losarta

In [30]:
# Matched strings to exclude from the diuretics list
# (exact, case-sensitive comparison; note the entries with a trailing space).
values_to_drop_diureticsnames = {
    'ACET', 'AP', 'Ace', 'Amox', 'C', 'Ca', 'D', 'E',
    'IRON', 'Iron', 'Iron ', 'iron',
    'POTASSIUM', 'Potassium', 'Potassium ', 'potassium', 'potassium ',
}
diuretics_matched_values = [
    matched
    for matched in diuretics_matched_values
    if matched not in values_to_drop_diureticsnames
]

In [31]:
# Build the diuretics table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
diuretics_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    diuretics_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# diuretics_all.to_csv("../preprocessed_data/HABSHD/diuretics_all_results_HABS_missing.csv",index=False)

### statins 

In [32]:
# Pool the generic and brand names from the statin sheet into one de-duplicated list.
statin_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='statin',
)
statin_names = list(
    set(statin_files['Drug_name'].dropna())
    .union(statin_files['Brand_name'].dropna())
)

In [33]:
# Raw medication strings in med_df that the helper associates with the statin names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
statin_matched_values = med_utils.get_matched_values(med_df, statin_names)

In [34]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_statinnames.
statin_matched_values

['Rosuvastin',
 'Artovastatin',
 'Rosuvastatin Tabs',
 'Simvastating',
 'Atorvastatina',
 'Atorvastatina (Atorvastatin)',
 'pravatatin',
 'atorvastatin (Lipitor)',
 'Atorvistatin',
 'Rosuvastain',
 'LIPITOR',
 ' Atorvastatin ',
 'lovastatin',
 'Astorvatatin',
 'Rosuvastatin',
 'atorvastatin ',
 'statin',
 'Crestor (Generic Drug: Rosuvastatin)',
 'Atorvastatin Calcium',
 'Sinvastatin',
 'Crestor Rosuvastatin Calcium',
 'pravastatin SOD',
 'Rosuvastatine Calcium',
 'Atorvastatin Calcium ',
 'Astorvastattin',
 'ATORVASTATIN CALCIUM',
 'Pravastatin Na',
 'Provastatin',
 'Rosuvastatin Calcium',
 'Atorvastatin calcium',
 'Pravastatin Sodium  ',
 'lipitor ',
 'LOVASTATIN',
 'ROSUVASTATIN CAL',
 'Pravastatin sodium',
 'zocor',
 'PRAVASTATIN',
 'atorvastafin',
 'Liitor',
 'Crestor',
 'Atorvastatin Ca',
 'Simvastatin ',
 'Atotvastatin',
 'Livalo',
 'atorvastatin (LIPITOR)',
 'Rosuvastatin Ca + (Crestor)',
 'Rouvastatin',
 'ROSUVATASTIN',
 'ROSUVASTATIN CA',
 'ezetimibe simvastatin',
 'Atorvastat

In [35]:
# Matched strings to exclude from the statin list (exact, case-sensitive comparison).
values_to_drop_statinnames = {
    'C',
    'E',
}
statin_matched_values = [
    matched
    for matched in statin_matched_values
    if matched not in values_to_drop_statinnames
]

In [36]:
# Build the statin table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
statin_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    statin_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# statin_all.to_csv("../preprocessed_data/HABSHD/statin_all_results_HABS_missing.csv",index=False)

### metformin

In [37]:
# Pool the generic and brand names from the metformin sheet into one de-duplicated list.
metformin_files = pd.read_excel(
    '../raw_datasets/names.xlsx',
    sheet_name='metformin',
)
metformin_names = list(
    set(metformin_files['Drug_name'].dropna())
    .union(metformin_files['Brand_name'].dropna())
)

In [38]:
# Raw medication strings in med_df that the helper associates with the metformin names.
# NOTE(review): matching rules live in med_utils.get_matched_values — confirm there.
metformin_matched_values = med_utils.get_matched_values(med_df, metformin_names)

In [39]:
# Show the matches for manual review; unwanted entries are listed in values_to_drop_metforminnames.
metformin_matched_values

['Metformin HCL er',
 'METFORMIN HCL',
 'glipizide-metformin',
 'Metformin HCL',
 'metformin HCL ER',
 'Metformin HCL ER',
 'METFORMIN',
 'C',
 'Metformin Hcl',
 'metformin ',
 'Metformin hcl',
 'Metformin HCL ',
 'Metformin HCl ER',
 'Janumet XR (sitagliptin & metformin HCL)',
 'metformin',
 'Metformin',
 'metFORMIN XR',
 'METAFORMIN',
 'Sitagliptin and Metformin HCl',
 'METFORMIN HCL XR',
 'Metaformin',
 'Metformin XR',
 'D',
 'Metformin er',
 'Metformin HCI',
 'Metformin HcI',
 'metformin XL',
 'Metformin ',
 'Pioglitazone Metformin',
 'glucophage',
 'Metformin Extended Release',
 'Glucophage Metformin',
 'metformin ER',
 'Metformin Hydrochloride',
 'Metformin ER',
 'metFORMIN Hcl',
 'metformin hydrochloride',
 'metformin HCL',
 'metformin time release',
 'metFormin',
 'Glipizide Metformin',
 'Meformin',
 'Metformin/ hydrochloride',
 'Glyburide-Metformin',
 'Kombiglyze XR',
 'Glucophage XR',
 'metformin (Glucophage XR)',
 'metformin hcl',
 'Metformin HCl',
 'Pioglitazone-Metformin',

In [40]:
# Matched strings to exclude from the metformin list (exact, case-sensitive comparison).
values_to_drop_metforminnames = {
    'C',
    'D',
    'E',
}
metformin_matched_values = [
    matched
    for matched in metformin_matched_values
    if matched not in values_to_drop_metforminnames
]

In [41]:
# Build the metformin table from the curated match list.
# NOTE(review): output layout is defined by med_utils.medication_all_timepoints.
metformin_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    metformin_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# metformin_all.to_csv("../preprocessed_data/HABSHD/metformin_all_results_HABS_missing.csv",index=False)

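### Control group
Participants with no recorded use of any of the 7 target medication classes (ACEi, ARBs, beta blockers, calcium channel blockers, diuretics, statins, metformin) at any visit.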

In [42]:
# Curated match lists for every medication class, in section order.
drug_lists = [
    acei_matched_values,
    arb_matched_values,
    beta_blockers_matched_values,
    Ca_channel_blockers_matched_values,
    diuretics_matched_values,
    statin_matched_values,
    metformin_matched_values,
]

# Flatten into a single set of distinct medication strings across all classes.
all_target_drugs = {
    drug_name
    for class_matches in drug_lists
    for drug_name in class_matches
}

In [43]:
# Flag each visit row where at least one medication column holds a target drug.
# Each column is mapped element-wise with Series.map rather than
# DataFrame.applymap: applymap is deprecated in recent pandas, and the notebook's
# FutureWarning filter would hide that deprecation notice.
row_has_target_drug = (
    habs_df_selected[medication_columns]
    .apply(
        lambda column: column.map(
            lambda entry: med_utils.contains_any_medication_type(entry, all_target_drugs)
        )
    )
    .any(axis=1)
)

# A participant is excluded from the controls if any of their visits is flagged.
ids_with_any_target_drug = habs_df_selected.loc[row_has_target_drug, 'id'].unique()

# .copy() so later column edits on controls_df do not touch habs_df_selected.
controls_df = habs_df_selected[~habs_df_selected['id'].isin(ids_with_any_target_drug)].copy()


In [44]:
# Controls never carry a target medication: add the constant flag, then remove
# whichever medication name columns are present.
controls_df = (
    controls_df
    .assign(Has_Medication_This_Visit=False)
    .drop(columns=medication_columns, errors='ignore')
)
# controls_df.to_csv("../preprocessed_data/HABSHD/controls_df_HABS_missing.csv",index=False)

### Prepare the merged longitudinal dataset for later modelling in R

In [45]:
# Per-class tables keyed by the label passed on to the merge helper;
# "Control" is the key that helper is told to treat as the control group.
datasets = dict(
    ACEi=acei_all,
    ARB=arb_all,
    BetaBlk=beta_blockers_all,
    CCB=Ca_channel_blockers_all,
    Diuretic=diuretics_all,
    Statin=statin_all,
    Metformin=metformin_all,
    Control=controls_df,
)

In [46]:
# Combine the per-class tables and the controls into one longitudinal frame, carrying
# visit_no, MMSE and CDR along as extra columns.
# NOTE(review): the merge rules are defined in med_utils.merge_medication_longitudinal.
merged_df = med_utils.merge_medication_longitudinal(datasets, control_key="Control", extra_cols=["visit_no", "MMSE", "CDR"])

In [47]:
print(f"Initial rows: {len(merged_df)}")

# Keep rows that have a status and at least one of MMSE / CDR.
merged_clean = (
    merged_df
    .dropna(subset=['status'])
    .loc[lambda frame: frame['MMSE'].notna() | frame['CDR'].notna()]
)
print(f"After dropping missings: {len(merged_clean)}")

# Keep participants observed at more than one distinct visit number.
merged_clean = merged_clean.loc[
    lambda frame: frame.groupby('id')['visit_no'].transform('nunique').gt(1)
]
print(f"After removing number of records: {len(merged_clean)}")

Initial rows: 5931
After dropping missings: 5931
After removing number of records: 3537


In [48]:
# Count missing visit dates; the month offsets computed next are cast to int,
# which would fail if any date were missing.
merged_clean['visit_date'].isna().sum()

0

In [49]:
# Average month length in days (365.25 / 12), used to convert day offsets to months.
DAYS_PER_MONTH = 30.4375

# assign() builds a new frame, so no column is written into the boolean-filtered
# merged_clean (which raises SettingWithCopyWarning on pandas without copy-on-write).
# Keyword arguments are evaluated in order, so months_since_baseline sees the
# already-parsed visit_date.
merged_clean = merged_clean.assign(
    visit_date=lambda frame: pd.to_datetime(frame['visit_date']),
    months_since_baseline=lambda frame: (
        (
            # Offset from each participant's earliest visit, computed without a
            # per-group Python lambda.
            frame['visit_date']
            - frame.groupby('id')['visit_date'].transform('min')
        )
        / pd.Timedelta(days=DAYS_PER_MONTH)
    )
    .round()
    .astype(int),  # requires every visit_date to be present (checked in the previous cell)
)

In [50]:
# Helper-side participant filter with a two-visit minimum; it prints the
# remaining/dropped participant counts.
merged_clean = med_utils.clean_and_filter_participants(
    merged_clean,
    id_col='id',
    class_col='status',
    time_col='visit_no',
    medication_ever_col=None,
    min_visits=2,
    if_print=True,
)

Remaining participants: 1,425
Dropped participants:   0


In [51]:
# Sanity check: list every row that shares an (id, visit_no) pair with another row.
# An empty result means each participant has at most one row per visit number.
merged_clean[merged_clean.duplicated(subset=['id', 'visit_no'], keep=False)].sort_values(['id','visit_no'])

Unnamed: 0,id,visit_date,status,age,sex,edu,APOE4,ACEi,ARB,BetaBlk,CCB,Diuretic,Statin,Metformin,Total_Meds,visit_no,MMSE,CDR,months_since_baseline


In [52]:
# Final helper-side cleaning pass.
# NOTE(review): the exact rules are defined in med_utils.clean_longitudinal — confirm there.
merged_clean = med_utils.clean_longitudinal(merged_clean)

In [53]:
# Write the cleaned longitudinal table to CSV without the index column.
merged_clean.to_csv("../preprocessed_data/HABSHD/HABSHD_merge_full_missing.csv",index=False)

In [54]:
# Highest Total_Meds value reached by each participant, tabulated across participants.
max_tab = (
    merged_clean
    .groupby('id')['Total_Meds']
    .max()
    .value_counts()
)
print("\nDistribution of max Total_Meds per participant:\n", max_tab)



Distribution of max Total_Meds per participant:
 Total_Meds
0    368
1    336
2    295
3    260
4    125
5     36
6      5
Name: count, dtype: int64


In [55]:
# Baseline summary table produced by the helper; the trailing bare expression
# displays it as the cell output.
summary_df = med_utils.baseline_summary(merged_clean)
summary_df

Unnamed: 0,Measure,Value
0,Age at baseline (year),65.7 ± 8.4
1,Gender (Female),909 (63.8%)
2,Education (year),13.3 ± 4.3
3,APOE4 (YES)**,351 (24.6%)
4,Visits (Record),2.0 [2.0–3.0]
5,Follow up intervals (Month),16.0 – 86.0
6,CU at baseline,1139 (79.9%)
7,MCI at baseline,226 (15.9%)
8,AD at baseline,60 (4.2%)
9,Average medication taken at baseline,1.3 ± 1.3
