In [17]:
import pandas as pd
import importlib
import re
import med_utils
importlib.reload(med_utils)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
### Clinical
# Load the HABS-HD Release 6 clinical table, then pass it through the shared cleaning helper.
# With missing_values=None the helper reports replacing its own sentinel list (see the log below).
clinic_df = pd.read_csv(
    "../raw_datasets/HABSHD/HD_Release_6_Clinical_FINAL.csv",
    low_memory=False,
)
clinic_df_clean = med_utils.clean_dataframe_advanced(
    clinic_df,
    missing_values=None,
    threshold=1,
    drop_rows_threshold=None,
    verbose=True,
)


Original shape: (5962, 1672)
Missing values replaced: [-777777, -777777.0, -9999.0, -9999, -999, -999.0, -8888, '-9999']
Columns dropped: 0 ([])
Final shape: (5962, 1672)


In [3]:
### APOE4
# Load the HABS-HD Release 6 genomics table and clean it with the same helper settings
# used for the clinical table.
apoe_df = pd.read_csv(
    "../raw_datasets/HABSHD/HD_Release_6_Genomics_FINAL.csv",
    low_memory=False,
)
apoe_df_clean = med_utils.clean_dataframe_advanced(
    apoe_df,
    missing_values=None,
    threshold=1,
    drop_rows_threshold=None,
    verbose=True,
)
# Disabled step kept for reference (would drop the raw genotype columns):
# apoe_df_clean = apoe_df_clean.drop(['APOE4_rs429358', 'APOE4_rs7412', 'APOE4_Genotype'], axis=1)

Original shape: (3343, 10)
Missing values replaced: [-777777, -777777.0, -9999.0, -9999, -999, -999.0, -8888, '-9999']
Columns dropped: 0 ([])
Final shape: (3343, 10)


In [4]:
### PET-SUVR
# FIXME(review): this path is the Genomics release file -- the same file the APOE4 cell
# loads -- so it looks copy-pasted. Point it at the PET-SUVR release file before relying
# on suvr_df / suvr_df_clean.
suvr_df = pd.read_csv("../raw_datasets/HABSHD/HD_Release_6_Genomics_FINAL.csv", low_memory=False)
# Clean the frame loaded in this cell into its own variable. The previous version
# re-cleaned apoe_df and reassigned apoe_df_clean, which left suvr_df unused and made
# this cell a redundant repeat of the APOE4 cell.
suvr_df_clean = med_utils.clean_dataframe_advanced(suvr_df, missing_values=None, threshold=1, drop_rows_threshold=None, verbose=True)

Original shape: (3343, 10)
Missing values replaced: [-777777, -777777.0, -9999.0, -9999, -999, -999.0, -8888, '-9999']
Columns dropped: 0 ([])
Final shape: (3343, 10)


In [5]:
# Outer-join the cleaned clinical and genomics tables on the visit and participant keys.
# Columns present in both inputs are resolved by the helper (handle_duplicates='auto').
frames_to_merge = [clinic_df_clean, apoe_df_clean]
habs_df = med_utils.merge_dataframes(
    frames_to_merge,
    keys=['Visit_ID', 'Med_ID'],
    how='outer',
    handle_duplicates='auto',
    df_names=['clinic', 'apoe'],
    verbose=True,
)

Column conflicts detected:
  'Age' appears in: ['clinic', 'apoe']
  'Ethnicity' appears in: ['clinic', 'apoe']
  'ID_Gender' appears in: ['clinic', 'apoe']
  'ID_Education' appears in: ['clinic', 'apoe']

Analyzing 4 column conflicts...
  Kept 'Age' from first DataFrame (values identical)
  Kept 'Ethnicity' from first DataFrame (values identical)
  Kept 'ID_Gender' from first DataFrame (values identical)
  Kept 'ID_Education' from first DataFrame (values identical)

Merge completed:
Final DataFrame shape: (5962, 1676)
Merge key(s): ['Visit_ID', 'Med_ID']
Merge type: outer

Merge steps:
  Step 1: Added apoe (5962 -> 5962 rows, +4 columns)


In [6]:
# Free-text medication columns follow the naming pattern Medication_<n>_Name.
medication_name_pattern = re.compile(r"Medication_\d+_Name")
medication_columns = [col for col in habs_df.columns if medication_name_pattern.match(col)]

# Rename/select columns according to the naming-convention workbook, keeping the
# medication columns as extras.
habs_df_selected = med_utils.rename_and_select_columns(
    habs_df,
    "../raw_datasets/naming_convension.xlsx",
    "HABS_HD",
    "New_Name",
    extra_cols=medication_columns,
)

In [7]:
# Sex coding (per the original note): 0 = male, 1 = female.
# NOTE: this cell is not idempotent. Re-running it maps the already-recoded status labels
# to NaN and the isin() filter then empties the frame; re-run the column-selection cell first.
print("The dimension of the selected HABS dataframe is:", habs_df_selected.shape, "with", habs_df_selected['id'].nunique(), "unique participants.")
# Recode numeric status codes to labels; any code not in the mapping becomes NaN.
habs_df_selected['status'] = habs_df_selected['status'].map({0: 'HC', 1: 'MCI', 2: 'AD', 9: 'Undetermine',})
# Keep only the three diagnostic groups (drops 'Undetermine' and unmapped rows).
habs_df_selected = habs_df_selected[habs_df_selected['status'].isin(['HC', 'MCI', 'AD'])]
print("The dimension of the selected HABS dataframe after drop unrelated with AD is:", habs_df_selected.shape, "with", habs_df_selected['id'].nunique(), "unique participants.")
habs_df_selected = med_utils.clean_status_and_demos(habs_df_selected)
habs_df_selected = habs_df_selected.dropna(subset=['status'])
# Drop visits that have neither an MMSE nor a CDR score.
habs_df_selected = habs_df_selected[~(habs_df_selected['MMSE'].isna() & habs_df_selected['CDR'].isna())]
habs_df_selected = habs_df_selected.reset_index(drop=True)
# Fixed label: this is the HABS frame (the message previously said "NACC").
print("The dimension of the cleaned HABS dataframe is:", habs_df_selected.shape, "with", habs_df_selected['id'].nunique(), "unique participants.")
# Snapshot the medication free-text columns and the full column list for the per-class cells.
med_df = habs_df_selected[medication_columns]
selected_columns = habs_df_selected.columns.tolist()

The dimension of the selected HABS dataframe is: (5962, 35) with 3840 unique participants.
The dimension of the selected HABS dataframe after drop unrelated with AD is: (5944, 35) with 3828 unique participants.
Filled 0 missing status records; columns forward/back filled: ['sex', 'edu', 'APOE4']
The dimension of the selected NACC dataframe is: (5944, 35) with 3828 unique participants.


### angiotensin-converting enzyme inhibitors (ACEi)

In [8]:
# Read the ACEi sheet and pool the Drug_name and Brand_name columns into one
# de-duplicated list of search names.
acei_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='ACEi')
acei_drug_names = set(acei_files['Drug_name'].dropna())
acei_brand_names = acei_files['Brand_name'].dropna()
acei_names = list(acei_drug_names.union(acei_brand_names))

In [9]:
# Collect the raw medication entries in med_df that the helper matches to any ACEi name.
# Judging by the displayed result, matches include misspellings and some spurious short tokens.
acei_matched_values = med_utils.get_matched_values(med_df, acei_names)

In [10]:
# Show the raw ACEi matches so false positives can be spotted by eye.
acei_matched_values

['Benazepril HCL',
 'Fe',
 'Zestril',
 'ENALAPRIL MALEATE',
 'E',
 'Lizinopril',
 'lisinorpil',
 'Fosinopril',
 'lisinoprilhydrochlorothiazide',
 'Amlodipine - Benazepril',
 'Enalapril ',
 'Benezepril',
 'Benazapril',
 'lisinopril/HCTZ',
 'Lesinopril',
 'D',
 'Benazepril/Hctz',
 'zestril',
 'Lisinopril HTCZ',
 'Lisinopril Hydrochlorothyazide',
 'Lisinopril 20-12.5 mg HCTZ',
 'lisinopril 20/12.5 mg',
 'quinapril',
 'Benazepril HCl',
 'Ramipril HBP',
 'Enalapril HCTZ',
 'Benazerpril',
 'Amlodipine/Benazepril',
 'Altace',
 'Lisinopril HCTZ 2012.5mg',
 'Amlodipine/Benazepril 10-20mg',
 'Lisinopril/HCTZ',
 'LisinoprilhydroChlorothiazide',
 'Prinivil',
 'Lisinopril',
 'Lysinopril',
 'Lisinopril HCTZ 20-25mg',
 'LISINIPRIL',
 'lisinopril',
 'Amlodipine Benazepril',
 'lisinopril hydrochlorothiazide',
 'Ramipril',
 'Fosinopril Sodium',
 'Lisinopri',
 'Lisinopril Hctz',
 'Lisionpril',
 'prinivil',
 'Hydrochlorothiazide Quinapril',
 'Lotensin',
 'Lisinioril',
 'lisnopril',
 'Quinapril',
 'Lisinop

In [11]:
# Hand-curated entries treated as false positives for the ACEi class.
values_to_drop_aceinames = {
    'Ca', 'C', 'Ace', 'E', 'ALA',
    'D', 'AP', 'ala', 'EPA', 'Fe',
}
acei_matched_values = [
    name for name in acei_matched_values
    if name not in values_to_drop_aceinames
]

In [12]:
# Run the all-timepoints helper for ACEi using the retained match strings.
acei_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    acei_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# acei_all.to_csv("../preprocessed_data/HABSHD/acei_all_results_HABS_missing.csv",index=False)

### angiotensin receptor blockers (ARBs)

In [13]:
# Read the ARB sheet and pool the Drug_name and Brand_name columns into one
# de-duplicated list of search names.
arb_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='ARB')
arb_drug_names = set(arb_files['Drug_name'].dropna())
arb_brand_names = arb_files['Brand_name'].dropna()
arb_names = list(arb_drug_names.union(arb_brand_names))

In [14]:
# Collect the raw medication entries in med_df that the helper matches to any ARB name.
# Judging by the displayed result, matches include misspellings and some spurious short tokens.
arb_matched_values = med_utils.get_matched_values(med_df, arb_names)

In [15]:
# Show the raw ARB matches so false positives can be spotted by eye.
arb_matched_values

['valsartan',
 'Telmisartan/Hydrochlorothiazide 80-25mg',
 'Azilsartan Medoxomin',
 'E',
 'TELMIS',
 'Irbesartan ',
 'valsrtan',
 'Olmesartan',
 'OLMESARTAN MEDOXOMIL',
 'Benicar ',
 'Diovan HCL',
 'Edarbi',
 'Candesartin',
 'Losartan HCTZ 100',
 'LOSARTAN HCTZ',
 'D',
 'losartan HCT',
 'losartan potassium',
 'olmesartan medox/hctz',
 'Olmersatan',
 'Candesartan',
 'Losartan HCTZ',
 'Micardis Telmisartan',
 'Losartan',
 'Losartan Potassium & Hydrochlorothiazide',
 'Losartan-hydrochlorothiazide',
 'MiCardis (Telmesarten)',
 'Losartan Potassium HCTZ',
 'Losartan Potassium ',
 'Losartan 100/120',
 'Olmesartan / Amlodipine',
 'Benicar HCT',
 'Losartan Hydrochlorothiazide',
 'LORSARTAN',
 'Atacand Tabs',
 'Micardis HCT',
 'Losartan/HCTZ',
 'Olmesartan / Hydrochlorothiazide',
 'Olimesartan',
 'Olmesarton',
 'Losartan potassium',
 'TELMISARTAN',
 'telmisartan-hydrochlorothiazide',
 'Losartan Potassium',
 'Olmesartan Medoxomil',
 'Olmesartan Medox/HCTZ',
 'Telmisartan HCTZ',
 'Losartan/ HCTZ',

In [16]:
# Hand-curated entries treated as false positives for the ARB class.
values_to_drop_arbnames = {
    'asa', 'Travaprost', 'Ca', 'C', 'E',
    'Travaprost Z', 'D', 'AP', 'ASA',
}
arb_matched_values = [
    name for name in arb_matched_values
    if name not in values_to_drop_arbnames
]

In [17]:
# Run the all-timepoints helper for ARBs using the retained match strings.
arb_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    arb_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# arb_all.to_csv("../preprocessed_data/HABSHD/arb_all_results_HABS_missing.csv",index=False)

### Beta Blocker

In [18]:
# Read the beta-blocker sheet and pool the Drug_name and Brand_name columns into one
# de-duplicated list of search names.
beta_blockers_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='beta_blockers')
beta_blockers_drug_names = set(beta_blockers_files['Drug_name'].dropna())
beta_blockers_brand_names = beta_blockers_files['Brand_name'].dropna()
beta_blockers_names = list(beta_blockers_drug_names.union(beta_blockers_brand_names))

In [19]:
# Collect the raw medication entries in med_df that the helper matches to any beta-blocker name.
# Judging by the displayed result, matches include misspellings and some spurious short tokens.
beta_blockers_matched_values = med_utils.get_matched_values(med_df, beta_blockers_names)

In [20]:
# Show the raw beta-blocker matches so false positives can be spotted by eye.
beta_blockers_matched_values

['atenolol',
 'labetalol',
 'E',
 'Atenolol',
 'metolprolol',
 'Sotalol HCL',
 'Timolol Maleate (eye drops)',
 'nebivolol',
 'metropolol',
 'Dorzolamide Hydrochloride and Timolol Maleate 0.5%',
 'Metoprolol Succ ER',
 'propranolol',
 'Timolol Maleate Ophthalmic Solution USP',
 'metoprolol',
 'D',
 'Timolol GFS .5%',
 'Carvedilol ',
 'Labetalol HCL',
 'Atenolol ',
 'Timolol Maleate',
 'Bisopropol',
 'Bystolic',
 'metoprolol ER',
 'Carvedlol',
 'carvedilol',
 'Ziac/ Bisoprolol / HCTZ',
 'Metoprolol ER (succ)',
 'Nebivolol',
 'DORZOLAMIDE/TIMOLOL',
 'Dorzolamide Opth Solution and Timolol (6.8mg/mL)',
 'Metolprolol',
 'METOPROLOL',
 'Metoprolol ER Succinate',
 'Metoprolol Tartrate ',
 'Metoprolol succER',
 'Metoprolol Succinate XL',
 'Bisoprolol-HCTZ 5',
 'Bisoprolo',
 'Metoprolol & diet manager prod',
 'timolol eye drops',
 'Bisoprolol Fumarate',
 'METROPROLOL',
 'Bisoprolol fumarate',
 'Lopressor',
 'Sotalol Hcl',
 'Troprol Xl',
 'Sotalol AF',
 'meoprolol',
 'timolol maleate 0.5%',
 'Hyd

In [21]:
# Hand-curated entries treated as false positives for the beta-blocker class.
values_to_drop_BetaBlknames = {
    'Ca', 'C', 'Dorzolamide CITimolol', 'Ace',
    'E', 'D', 'AP',
}
beta_blockers_matched_values = [
    name for name in beta_blockers_matched_values
    if name not in values_to_drop_BetaBlknames
]

In [22]:
# Run the all-timepoints helper for beta blockers using the retained match strings.
beta_blockers_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    beta_blockers_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# beta_blockers_all.to_csv("../preprocessed_data/HABSHD/beta_blockers_all_results_HABS_missing.csv",index=False)

### Calcium Channel Blockers

In [23]:
# Read the calcium-channel-blocker sheet and pool the Drug_name and Brand_name columns
# into one de-duplicated list of search names.
Ca_channel_blockers_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='Ca_channel_blockers')
Ca_channel_blockers_drug_names = set(Ca_channel_blockers_files['Drug_name'].dropna())
Ca_channel_blockers_brand_names = Ca_channel_blockers_files['Brand_name'].dropna()
Ca_channel_blockers_names = list(Ca_channel_blockers_drug_names.union(Ca_channel_blockers_brand_names))

In [24]:
# Collect the raw medication entries in med_df that the helper matches to any CCB name.
# Judging by the displayed result, matches include misspellings and spurious entries such as calcium supplements.
Ca_channel_blockers_matched_values = med_utils.get_matched_values(med_df, Ca_channel_blockers_names)

In [25]:
# Show the raw CCB matches so false positives can be spotted by eye.
Ca_channel_blockers_matched_values

['Fe',
 'Amlodipine  Besylate',
 'Amilodipine',
 'E',
 'Cardizem Caps',
 'AMLODIPINE BESYLATE',
 'Amlod',
 'Amlodipine Bestlate',
 'Amlodipine - Benazepril',
 'Amlodipine tab',
 'amLODIPIne',
 'D',
 'Amlodypine',
 'dilTIAZem',
 'CALCIUM',
 'Norvas',
 'Nifedapine',
 'almodipine',
 'Amolodipine',
 'ditiazem',
 'Amlodipine (Norvasc)',
 'Amlodepine',
 'Amlodipine besylate tablets',
 'Amlodapine',
 'Norvasc generic',
 'Olmesartan / Amlodipine',
 'amLodipine',
 'amLODIPine (NORVASC)',
 'amlodepine',
 'Amdlodipine',
 'amlodipjne',
 'Amlodipine/Benazepril',
 'Tiazac ER',
 'Amlodipine/Benazepril 10-20mg',
 'Amlodipine besylarte',
 'diltiazem',
 'Calcium ',
 'Nifedipine',
 'AmLodipine',
 'NIFEdipine',
 'Amlodipine besylate',
 'Amlodipine Benazepril',
 'Nifedipine XL',
 'Amlodipine Tab',
 'nifedipine ER',
 'Calcium',
 'Felodipine',
 'verapamil',
 'procardia xl',
 'Calcium +',
 'Amlodipine Besytlate',
 'Cardizem',
 'Amlodipine BESYLATE',
 'Verapamil SR',
 'NAC',
 'Amlodipine/Besylate',
 'Amlodipin

In [26]:
# Hand-curated entries treated as false positives for the CCB class
# (includes the plain "calcium" variants seen in the raw matches).
values_to_drop_CCBnames = {
    'CALCIUM', 'Ca', 'C', 'Calcium', 'Calcium ',
    'E', 'ALA', 'D', 'AP', 'calcium',
    'ala', 'NAC', 'Fe', 'Calcium +',
}
Ca_channel_blockers_matched_values = [
    name for name in Ca_channel_blockers_matched_values
    if name not in values_to_drop_CCBnames
]

In [27]:
# Run the all-timepoints helper for calcium channel blockers using the retained match strings.
Ca_channel_blockers_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    Ca_channel_blockers_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# Ca_channel_blockers_all.to_csv("../preprocessed_data/HABSHD/Ca_channel_blockers_all_results_HABS_missing.csv",index=False)

### Diuretics

In [28]:
# Read the diuretics sheet and pool the Drug_name and Brand_name columns into one
# de-duplicated list of search names.
diuretics_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='diuretics')
diuretics_drug_names = set(diuretics_files['Drug_name'].dropna())
diuretics_brand_names = diuretics_files['Brand_name'].dropna()
diuretics_names = list(diuretics_drug_names.union(diuretics_brand_names))

In [29]:
# Collect the raw medication entries in med_df that the helper matches to any diuretic name.
# Judging by the displayed result, matches include misspellings and spurious entries such as "potassium".
diuretics_matched_values = med_utils.get_matched_values(med_df, diuretics_names)

In [30]:
# Show the raw diuretic matches so false positives can be spotted by eye.
diuretics_matched_values

['triamterene',
 'Triamterene ',
 'Telmisartan/Hydrochlorothiazide 80-25mg',
 'E',
 'Triamterene HCTZ',
 'Spironalactone',
 'lisinoprilhydrochlorothiazide',
 'Atenolol and Chlorthalidone',
 'Acetazolamide',
 'Espironolactona',
 'Triamt',
 'D',
 'FUROSEMIDE',
 'Furosemine',
 'Sprionolactone',
 'Hydrochorthiazide',
 'hydroCHLOROthiazide',
 'Triamterene/HCTZ',
 'Hydrochlorothizide',
 'SPironolactone',
 'Losartan Potassium & Hydrochlorothiazide',
 'Losartan-hydrochlorothiazide',
 'Hydrocholorothizide',
 'Furosemide (Lasix)',
 'Termisartan/Hydrochlorothiazide (MICARDES HCL) 80/20',
 'Triampterene',
 'Losartan Hydrochlorothiazide',
 'potassium',
 'EPLERENONE',
 'Triamterene-HCTZ',
 'Amilorid',
 'Olmesartan / Hydrochlorothiazide',
 'hydrochlorithiazide',
 'torsemide',
 'telmisartan-hydrochlorothiazide',
 'POTASSIUM',
 'Hydrochlorothialide',
 'chlorthalidone',
 'LisinoprilhydroChlorothiazide',
 'Amox',
 'Hydrochlorothiazide HCTZ',
 'Lasix (Furosemide)',
 'Hydrochlorothiazidy',
 'Hydrochlothiaz

In [31]:
# Hand-curated entries treated as false positives for the diuretic class.
values_to_drop_diureticsnames = {
    'Ca', 'C', 'IRON', 'Ace', 'E',
    'POTASSIUM', 'potassium', 'D', 'AP', 'Potassium',
    'Potassium ', 'ACET', 'Iron', 'Iron ', 'potassium ',
    'Amox', 'iron',
}
diuretics_matched_values = [
    name for name in diuretics_matched_values
    if name not in values_to_drop_diureticsnames
]

In [32]:
# Run the all-timepoints helper for diuretics using the retained match strings.
diuretics_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    diuretics_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# diuretics_all.to_csv("../preprocessed_data/HABSHD/diuretics_all_results_HABS_missing.csv",index=False)

### statins 

In [33]:
# Read the statin sheet and pool the Drug_name and Brand_name columns into one
# de-duplicated list of search names.
statin_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='statin')
statin_drug_names = set(statin_files['Drug_name'].dropna())
statin_brand_names = statin_files['Brand_name'].dropna()
statin_names = list(statin_drug_names.union(statin_brand_names))

In [34]:
# Collect the raw medication entries in med_df that the helper matches to any statin name.
# Judging by the displayed result, matches include misspellings and some spurious short tokens.
statin_matched_values = med_utils.get_matched_values(med_df, statin_names)

In [35]:
# Show the raw statin matches so false positives can be spotted by eye.
statin_matched_values

['ROSUVASTATIN CAL',
 'lipitor',
 'pravastatin SOD',
 'Crestor/Rosuvastatin calcium)',
 'Atorvastatin',
 'Crestor',
 'Simvastatin ABS',
 'Simvastasin',
 'Ezetimbe/simvastatin',
 'E',
 'Rosuvastatin Tab',
 'Pravastatin SOD',
 'Atorvastatin ',
 'Liptor',
 'EzetimibeSimvastatin',
 'lipitor tabs',
 'Livalo',
 'Crestor (Generic Drug: Rosuvastatin)',
 'ROSVASTATIN',
 'Lovastatin ',
 'Atrorvastatin',
 'Pravastatin sodium',
 'Rosuvastatin Tabs',
 'Pervastatin',
 'Atorvastatin Calcium Tablets, USP',
 'Rusuvastatin',
 'LOVASTATIN',
 'Study med (atorvastatin or placebo)',
 'Sivastatin',
 'Atorvastatina',
 'Pravstatin',
 'AFORVASTATIN',
 'Rosuvatatin',
 'ATORVASTATIN CALCIUM',
 'Crestor rosuvastatin calcium',
 'SIMVASTATIN',
 'Proavastatine',
 'Lovastatin',
 'ROSUVATASTIN',
 'Simvastatin ',
 'ROSUVASTATIN',
 'Rosuvastatin CA',
 'Pravastatin Sod',
 'Atorvastatin Calcium ',
 'rosuvastatin calcium',
 'Statin',
 'Atorvastain ',
 'Pravastatin Calcium',
 'Atortvastatin ',
 'ATORVASTATIN',
 'Simuastatin'

In [36]:
# Hand-curated entries treated as false positives for the statin class.
values_to_drop_statinnames = {
    'C',
    'E',
}
statin_matched_values = [
    name for name in statin_matched_values
    if name not in values_to_drop_statinnames
]

In [37]:
# Run the all-timepoints helper for statins using the retained match strings.
statin_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    statin_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# statin_all.to_csv("../preprocessed_data/HABSHD/statin_all_results_HABS_missing.csv",index=False)

### metformin

In [38]:
# Read the metformin sheet and pool the Drug_name and Brand_name columns into one
# de-duplicated list of search names.
metformin_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='metformin')
metformin_drug_names = set(metformin_files['Drug_name'].dropna())
metformin_brand_names = metformin_files['Brand_name'].dropna()
metformin_names = list(metformin_drug_names.union(metformin_brand_names))

In [39]:
# Collect the raw medication entries in med_df that the helper matches to any metformin name.
# Judging by the displayed result, matches include misspellings and some spurious short tokens.
metformin_matched_values = med_utils.get_matched_values(med_df, metformin_names)

In [40]:
# Show the raw metformin matches so false positives can be spotted by eye.
metformin_matched_values

['Metformin HCL',
 'Sitagliptin and Metformin HCl',
 'Metformin ER',
 'MetFormin HCL',
 'E',
 'metFORMIN XR',
 'metformin HCL ER',
 'metformin time release',
 'Glipizide Metformin',
 'Metformin/ hydrochloride',
 'Metformin',
 'Metformin HCI',
 'Pioglitazone-Metformin',
 'Metformin XR',
 'Metformin HCL ',
 'Metformin hcl',
 'Metoformin',
 'metoformin',
 'Metformin HCL ER',
 'Metformin Hydrochloride',
 'Glipizide-Metformin',
 'Metformin Hcl',
 'metformin hydrochloride',
 'Metformin XL',
 'D',
 'Metformin Extended Release',
 'METFORMIN',
 'Metformin er',
 'glucophage',
 'metformin XL',
 'Pioglitazone Metformin',
 'C',
 'METFORMIN HCL XR',
 'Metformin ',
 'metformin hcl',
 'metFORMIN Hcl',
 'Meformin',
 'Kombiglyze XR',
 'METAFORMIN',
 'Glyburide-Metformin',
 'Metformin HCl ER',
 'metformin ',
 'METFORMIN HCL',
 'metformin (Glucophage XR)',
 'glipizide-metformin',
 'Glucophage Metformin',
 'Metformin HCL XR',
 'Metformin Extended',
 'metformin',
 'Metformin HCL 1,000 Mg',
 'metformin HCL',

In [41]:
# Hand-curated entries treated as false positives for the metformin class.
values_to_drop_metforminnames = {
    'E',
    'C',
    'D',
}
metformin_matched_values = [
    name for name in metformin_matched_values
    if name not in values_to_drop_metforminnames
]

In [42]:
# Run the all-timepoints helper for metformin using the retained match strings.
metformin_all = med_utils.medication_all_timepoints(
    habs_df_selected,
    metformin_matched_values,
    medication_columns,
    selected_columns,
    id_col="id",
    visit_col="visit_no",
)
# Optional export, left disabled:
# metformin_all.to_csv("../preprocessed_data/HABSHD/metformin_all_results_HABS_missing.csv",index=False)

### control group
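Participants who never took any of the 7 medication classes above (ACEi, ARB, beta blockers, calcium channel blockers, diuretics, statins, metformin).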

In [43]:
# Retained match strings for each of the seven medication classes.
drug_lists = [
    acei_matched_values,
    arb_matched_values,
    beta_blockers_matched_values,
    Ca_channel_blockers_matched_values,
    diuretics_matched_values,
    statin_matched_values,
    metformin_matched_values,
]

# Pool every class's matches into a single set of target drug strings.
all_target_drugs = set()
for class_matches in drug_lists:
    all_target_drugs.update(class_matches)

In [44]:
# Flag every visit row in which at least one medication field matches a target drug.
medication_entries = habs_df_selected[medication_columns]


def _is_target_drug(entry):
    """Return the helper's verdict on whether one medication cell names a target drug."""
    return med_utils.contains_any_medication_type(entry, all_target_drugs)


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map (the
# FutureWarning filter at the top of this notebook hides that). Use the new name when
# the installed pandas has it, and fall back to applymap on older versions.
if hasattr(pd.DataFrame, "map"):
    target_hits = medication_entries.map(_is_target_drug)
else:
    target_hits = medication_entries.applymap(_is_target_drug)
row_has_target_drug = target_hits.any(axis=1)

# Controls are participants with no flagged visit at all, not merely unflagged visits.
ids_with_any_target_drug = habs_df_selected.loc[row_has_target_drug, 'id'].unique()
controls_df = habs_df_selected[~habs_df_selected['id'].isin(ids_with_any_target_drug)].copy()


In [45]:
# Controls never have a target medication, so the per-visit flag is False throughout;
# the raw medication text columns are no longer needed on this frame.
controls_df = controls_df.assign(Has_Medication_This_Visit=False)
controls_df = controls_df.drop(columns=medication_columns, errors="ignore")
# Optional export, left disabled:
# controls_df.to_csv("../preprocessed_data/HABSHD/controls_df_HABS_missing.csv",index=False)

### Adjust for the later modelling in R

In [46]:
# Map each class label (plus the control group) to its per-class frame, in a fixed order.
dataset_labels = ["ACEi", "ARB", "BetaBlk", "CCB", "Diuretic", "Statin", "Metformin", "Control"]
dataset_frames = [
    acei_all,
    arb_all,
    beta_blockers_all,
    Ca_channel_blockers_all,
    diuretics_all,
    statin_all,
    metformin_all,
    controls_df,
]
datasets = dict(zip(dataset_labels, dataset_frames))

In [47]:
# Combine the per-class frames and the control frame into one long table, carrying
# visit_no, MMSE and CDR along. Judging by the table displayed further down, the result
# has one boolean column per class plus a Total_Meds count.
merged_df = med_utils.merge_medication_longitudinal(datasets, control_key="Control", extra_cols=["visit_no", "MMSE", "CDR"])

In [48]:
print(f"Initial rows: {len(merged_df)}")

# Keep rows with a known status and at least one of MMSE / CDR recorded.
status_known = merged_df['status'].notna()
any_score_recorded = merged_df['MMSE'].notna() | merged_df['CDR'].notna()
merged_clean = merged_df[status_known & any_score_recorded]
print(f"After dropping missings: {len(merged_clean)}")

# Keep only participants with more than one distinct visit number.
distinct_visits_per_id = merged_clean.groupby('id')['visit_no'].transform('nunique')
merged_clean = merged_clean[distinct_visits_per_id > 1]
print(f"After removing number of records: {len(merged_clean)}")

Initial rows: 5931
After dropping missings: 5931
After removing number of records: 3537


In [49]:
# Sanity check: count missing visit dates before computing time since baseline
# (the astype(int) in the next cell would fail on missing values).
merged_clean['visit_date'].isna().sum()

0

In [50]:
# Parse visit dates, then measure each visit's distance from that participant's
# earliest visit in average-length months (30.4375 days = 365.25 / 12), rounded to an integer.
merged_clean['visit_date'] = pd.to_datetime(merged_clean['visit_date'])
first_visit_date = merged_clean.groupby('id')['visit_date'].transform('min')
elapsed_months = (merged_clean['visit_date'] - first_visit_date) / pd.Timedelta(days=30.4375)
merged_clean['months_since_baseline'] = elapsed_months.round().astype(int)

In [51]:
# Apply the helper's participant-level filter with a two-visit minimum. In the recorded
# run it dropped no one, since single-visit participants were already removed two cells earlier.
merged_clean = med_utils.clean_and_filter_participants(
    merged_clean,
    id_col='id',
    class_col ='status',
    time_col = 'visit_no',
    medication_ever_col=None,  # no medication-ever column is supplied to the helper
    min_visits=2,
    if_print=True
)

Remaining participants: 1,425
Dropped participants:   0


In [52]:
# Sanity check: list any rows sharing the same (id, visit_no) pair; an empty table means none.
is_duplicate_visit = merged_clean.duplicated(subset=['id', 'visit_no'], keep=False)
merged_clean[is_duplicate_visit].sort_values(['id', 'visit_no'])

Unnamed: 0,id,visit_date,status,age,sex,edu,APOE4,ACEi,ARB,BetaBlk,CCB,Diuretic,Statin,Metformin,Total_Meds,visit_no,MMSE,CDR,months_since_baseline


In [53]:
# Final pass through the project's longitudinal cleaning helper
# (its behaviour is defined in med_utils and is not visible in this notebook).
merged_clean = med_utils.clean_longitudinal(merged_clean)

In [54]:
# Display the cleaned longitudinal table (pandas truncates the middle rows).
merged_clean

Unnamed: 0,id,visit_date,status,age,sex,edu,APOE4,ACEi,ARB,BetaBlk,CCB,Diuretic,Statin,Metformin,Total_Meds,visit_no,MMSE,CDR,months_since_baseline
0,4369,2022-05-01,HC,54.50,1,11.0,0.0,False,False,False,False,False,False,False,0,1,27.0,0.0,0
1,4369,2024-04-01,MCI,56.42,1,11.0,0.0,False,False,False,False,False,False,False,0,2,29.0,1.0,23
2,4399,2022-04-01,HC,64.00,1,18.0,0.0,False,False,False,False,False,False,False,0,1,28.0,0.0,0
3,4399,2024-06-01,MCI,66.17,1,18.0,0.0,False,False,False,False,False,False,False,0,2,28.0,1.0,26
4,4400,2022-04-01,HC,51.25,1,18.0,0.0,False,False,False,True,False,False,True,2,1,27.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3532,9996,2022-07-01,MCI,67.75,0,12.0,0.0,False,False,False,True,False,False,False,1,3,23.0,1.0,57
3533,9997,2017-09-01,HC,63.67,0,11.0,0.0,False,False,False,False,False,False,False,0,1,28.0,0.0,0
3534,9997,2019-09-01,MCI,65.67,0,11.0,0.0,False,False,False,False,False,False,False,0,2,24.0,0.5,24
3535,9999,2019-04-01,HC,80.08,1,6.0,0.0,False,True,False,True,False,False,False,2,1,23.0,0.0,0


In [55]:
# Express the follow-up time in years (months / 12), rounded to two decimals.
months_elapsed = merged_clean['months_since_baseline']
merged_clean['year_since_baseline'] = months_elapsed.div(12).round(2)

In [56]:
# Persist the merged longitudinal table (without the index) for the downstream modelling in R.
merged_clean.to_csv("../preprocessed_data/HABSHD/HABSHD_merge_full_missing.csv",index=False)

In [57]:
# For each participant take the largest Total_Meds seen at any visit, then tabulate
# how many participants reach each maximum.
max_meds_per_participant = merged_clean.groupby('id')['Total_Meds'].max()
max_tab = max_meds_per_participant.value_counts()
print("\nDistribution of max Total_Meds per participant:\n", max_tab)



Distribution of max Total_Meds per participant:
 Total_Meds
0    368
1    336
2    295
3    260
4    125
5     36
6      5
Name: count, dtype: int64


In [15]:
# NOTE(review): this overwrites merged_clean with a CDR-imputed file that no cell in this
# notebook writes (the export above saves HABSHD_merge_full_missing.csv), and the
# execution count is out of sequence. The file is presumably produced by a separate
# imputation step -- confirm its provenance so Restart & Run All stays reproducible.
merged_clean = pd.read_csv('../preprocessed_data/HABSHD/HABSHD_cdr_imputed.csv')

In [7]:
# Build the baseline summary table with the project helper and display it.
# It is computed from the reloaded (CDR-imputed) merged_clean, not the frame exported above.
summary_df = med_utils.baseline_summary(merged_clean)
summary_df

Unnamed: 0,Measure,Value
0,Age at baseline (year),65.7 ± 8.4
1,Gender (Female),909 (63.8%)
2,Education (year),13.3 ± 4.3
3,APOE4 (YES)**,357 (25.1%)
4,Visits (Record),2.0 – 4.0
5,Follow up intervals (Month),41.2 ± 19.4
6,CU at baseline,1139 (79.9%)
7,MCI at baseline,226 (15.9%)
8,AD at baseline,60 (4.2%)
9,Average medication taken at baseline,1.3 ± 1.3


In [18]:
# Medication-class indicator columns to compare. Per the displayed result, the helper
# reports baseline age (n, mean, sd) for users vs. non-users of each class together
# with a Wald chi-square statistic and p-value.
med_cols_names = ["ACEi", "ARB", "BetaBlk", "CCB", "Diuretic", "Statin", "Metformin"]
med_utils.baseline_age_by_med(merged_clean, med_cols_names)

Unnamed: 0,drug,user,n,mean,sd,mean_sd,wald_chi2,wald_p
0,ACEi,non-user,1128,65.32,8.31,65.32 (8.31),8.888645,0.002918251
1,ACEi,user,297,66.94,8.42,66.94 (8.42),8.888645,0.002918251
2,ARB,non-user,1220,65.19,8.36,65.19 (8.36),26.996696,2.334259e-07
3,ARB,user,205,68.44,7.77,68.44 (7.77),26.996696,2.334259e-07
4,BetaBlk,non-user,1214,65.14,8.31,65.14 (8.31),32.502583,1.445146e-08
5,BetaBlk,user,211,68.65,7.99,68.65 (7.99),32.502583,1.445146e-08
6,CCB,non-user,1228,65.27,8.34,65.27 (8.34),19.47688,1.09505e-05
7,CCB,user,197,68.08,8.04,68.08 (8.04),19.47688,1.09505e-05
8,Diuretic,non-user,1277,65.42,8.3,65.42 (8.3),10.471841,0.001239959
9,Diuretic,user,148,67.76,8.51,67.76 (8.51),10.471841,0.001239959
