In [1]:
import pandas as pd
import numpy as np
import importlib
import re
import med_utils
importlib.reload(med_utils)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
AIBL_df = pd.read_excel('../raw_datasets/AIBL/AIBL_2023.xlsx')

In [3]:
AIBL_df_clean = med_utils.clean_dataframe_advanced(AIBL_df, missing_values=[" "], threshold=1, drop_rows_threshold=None, verbose=True)
AIBL_df_clean['Demographic.YearMonthOfBirth']=AIBL_df_clean['Demographic.YearMonthOfBirth'].astype(str) + '01'
AIBL_df_clean['Demographic.YearMonthOfBirth'] = pd.to_datetime(AIBL_df_clean['Demographic.YearMonthOfBirth'], format='%Y%m%d')
AIBL_df_clean[['Demographic.YearMonthOfBirth']] = AIBL_df_clean.groupby('AIBL ID')[['Demographic.YearMonthOfBirth']].transform(lambda x: x.ffill().bfill())
AIBL_df_clean['Progress Summary.Date of NP assessment'] = pd.to_datetime(AIBL_df_clean['Progress Summary.Date of NP assessment'], format='%d/%m/%Y')
AIBL_df_clean['Age'] = round((AIBL_df_clean['Progress Summary.Date of NP assessment'] - AIBL_df_clean['Demographic.YearMonthOfBirth']).dt.days / 365, 2)
print("Missing AIBL Age records:", AIBL_df_clean["Age"].isna().sum())
AIBL_df_clean = med_utils.fill_missing_ages_AIBL(AIBL_df_clean)
print("Missing AIBL Age records after fill given assumption of 18 month of AIBL visits:", AIBL_df_clean["Age"].isna().sum())

Original shape: (37818, 1400)
Missing values replaced: [' ']
Columns dropped: 0 ([])
Final shape: (37818, 1400)
Missing AIBL Age records: 27659
Missing AIBL Age records after fill given assumption of 18 month of AIBL visits: 3619


In [4]:
# drop rows with AIBL Age or Neuropsych.Simple Classification (status) missing
mask_missing = AIBL_df_clean["Age"].isna() | AIBL_df_clean["Neuropsych.Simple Classification"].isna()
print(f"Rows with missing Age or status to drop: {mask_missing.sum()}")
AIBL_df_clean = AIBL_df_clean.loc[~mask_missing].copy()
print(f"After dropping missings: {len(AIBL_df_clean)}")


Rows with missing Age or status to drop: 13220
After dropping missings: 24598


In [5]:
medication_columns = [col for col in AIBL_df_clean.columns if col.startswith("Medical History.Name of medication")]
AIBL_df_selected = med_utils.rename_and_select_columns(AIBL_df_clean, "../raw_datasets/naming_convension.xlsx", "AIBL", "New_Name", extra_cols=medication_columns)

In [6]:
AIBL_df_selected

Unnamed: 0,id,visit_no,visit_date,status,age,sex,edu,APOE4,CDR,MMSE,...,Medical History.Name of medication 12,Medical History.Name of medication 13,Medical History.Name of medication 14,Medical History.Name of medication 15,Medical History.Name of medication 16,Medical History.Name of medication 17,Medical History.Name of medication 18,Medical History.Name of medication 19,Medical History.Name of medication 20,Medical History.Name of medication 21
0,2,1,2006-11-03,HC,65.30,Male,16.0,E3/E3,0.0,29.0,...,,,,,,,,,,
1,2,2,2008-09-16,HC,67.17,Male,16.0,E3/E3,0.0,30.0,...,,,,,,,,,,
2,2,3,2010-03-24,HC,68.69,Male,16.0,E3/E3,0.0,30.0,...,,,,,,,,,,
3,2,4,2011-09-16,HC,70.17,Male,16.0,E3/E3,0.0,30.0,...,,,,,,,,,,
4,2,5,2013-04-11,HC,71.74,Male,16.0,E3/E3,0.0,30.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35580,3421,7,NaT,Withdrawn,76.95,Female,,,,,...,,,,,,,,,,
35581,3421,8,NaT,Withdrawn,78.45,Female,,,,,...,,,,,,,,,,
35582,3421,9,NaT,Withdrawn,79.95,Female,,,,,...,,,,,,,,,,
35583,3421,10,NaT,Withdrawn,81.45,Female,,,,,...,,,,,,,,,,


In [7]:
print("The dimension of the selected AIBL dataframe is:", AIBL_df_selected.shape, "with", AIBL_df_selected['id'].nunique(), "unique participants.")
AIBL_df_selected = AIBL_df_selected[AIBL_df_selected['status'].isin(['HC', 'MCI', 'AD'])]
AIBL_df_selected['sex'] = AIBL_df_selected['sex'].map({"Male":0 , 'Female':1})
AIBL_df_selected['APOE4'] = AIBL_df_selected['APOE4'].apply(lambda x: 1 if isinstance(x, str) and 'E4' in x else 0)

print("The dimension of the selected AIBL dataframe after drop unrelated with AD is:", AIBL_df_selected.shape, "with", AIBL_df_selected['id'].nunique(), "unique participants.")
AIBL_df_selected = med_utils.clean_status_and_demos(AIBL_df_selected)
AIBL_df_selected = AIBL_df_selected.dropna(subset=['status'])
AIBL_df_selected = AIBL_df_selected[~(AIBL_df_selected['MMSE'].isna() & AIBL_df_selected['CDR'].isna())]
AIBL_df_selected = AIBL_df_selected.reset_index(drop=True)
print("The dimension of the selected AIBL dataframe is:", AIBL_df_selected.shape, "with", AIBL_df_selected['id'].nunique(), "unique participants.")
med_df=AIBL_df_selected[medication_columns]
selected_columns = AIBL_df_selected.columns.tolist()

The dimension of the selected AIBL dataframe is: (24598, 31) with 2790 unique participants.
The dimension of the selected AIBL dataframe after drop unrelated with AD is: (9503, 31) with 2503 unique participants.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AIBL_df_selected['sex'] = AIBL_df_selected['sex'].map({"Male":0 , 'Female':1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AIBL_df_selected['APOE4'] = AIBL_df_selected['APOE4'].apply(lambda x: 1 if isinstance(x, str) and 'E4' in x else 0)


Filled 0 missing status records; columns forward/back filled: ['sex', 'edu', 'APOE4']
The dimension of the selected AIBL dataframe is: (9487, 31) with 2502 unique participants.


### angiotensin-converting enzyme inhibitors (ACEi)

In [8]:
acei_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='ACEi')
acei_names = list(set(acei_files['Drug_name'].dropna()).union(acei_files['Brand_name'].dropna()))

In [9]:
acei_matched_values = med_utils.get_matched_values(med_df, acei_names)

In [10]:
acei_matched_values

['perindopril / indapamide',
 'CM-Perindopril',
 'PRINIVIL',
 'Gopten (trandolapril)',
 'monopril',
 'Coversyl, perindopril, arginine',
 'RamiprilWinthrop',
 'Captopril Tablets',
 'accupril',
 'Ramipril sz',
 '30 Perindopril',
 'Indapam/perindopril',
 'Ramipril',
 'Accupril',
 'QUINAPRIL',
 'Perindo',
 'Coversyl Plus (Perindopril, indapamde-hemhydrate',
 'Lisinopril',
 'Quinapril',
 'Rampipril',
 'Perindopril arginine',
 'Penindopril',
 'perindo',
 'Enalapril Acetec',
 'PERINDOPRIL',
 'Perindopril',
 'Enalapril Maleate',
 'rammipril',
 'Ramipril ( will change to Atacand soon)',
 'perendopril',
 'Monopril',
 'MONOPRIL',
 'Perindopril/Indapamide',
 'Lisiuopril',
 'perindopril erbumine',
 'Azep',
 'Rampril',
 'ramipril',
 'CAPOTEN',
 'Capoten',
 'Zestril',
 'Fosinopril',
 'perindopril',
 'Perindopril Erbumine',
 'Perendopril',
 'Fe',
 'Enalapril',
 'perindopril/indapamide',
 'rampril',
 'Perindapril',
 'Tryzan Ramipril',
 'ACE',
 'zestril',
 'RAMIPRIL',
 'Bace',
 'LISINOPRIL/LISODOR',
 'l

In [11]:
values_to_drop_aceinames = {'Azep', 'Fe', 'Bace', }
acei_matched_values = [x for x in acei_matched_values if x not in values_to_drop_aceinames]

In [12]:
acei_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, acei_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# acei_all.to_csv("../preprocessed_data/AIBL/acei_all_results_AIBL_missing.csv",index=False)

### angiotensin receptor blockers (ARBs)

In [13]:
arb_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='ARB')
arb_names = list(set(arb_files['Drug_name'].dropna()).union(arb_files['Brand_name'].dropna()))

In [14]:
arb_matched_values = med_utils.get_matched_values(med_df, arb_names)

In [15]:
arb_matched_values

['Olmtec',
 'Olmetec',
 'candesartan',
 'Olmetec Plus',
 'Valsartan amlodipine',
 'diovan',
 'Karvezide (150 mg irbesartan/12.5 mg hydrochlorothiazide)',
 'MICARDIS PLUS',
 'Olmetec (olmesaran medoxomil)',
 'Micardis',
 'candestartan',
 'Avapro HCT (150/12.5 mg)',
 'Exforge (anlodifine/valsartan)',
 'olmetec plus',
 'avapro hct',
 'co diovan',
 'Valsartan',
 'Avapro 150mg/12.50mg',
 'avapro-hct',
 'ATACAN',
 'Apo-Candesartan',
 'Irbersatan',
 'Atacand',
 'Candezartan',
 'attacand',
 'avarpro',
 'Irbesarten',
 'Candesartan Aspen',
 'Micardis Plus',
 'Telmisartan/Hydrochlorothiazide',
 'Olmesartan',
 'valsartan/hydrochlorothiazide',
 'IRBESARTEN',
 'olmosartan',
 'Exforge (amlodipine/valsartan)',
 'micadis',
 'Micardis plus 80/12.5',
 'Olmestartan',
 'Telmisartan/HCT sandoz',
 'Telmisartan/amcodipine',
 'MICARDIS',
 'ATACAND PLUS',
 'Atacand Plus 16/12.5',
 'irbesartan HCT2',
 'Avapro (Blood pressure)',
 'CO-DIOVAN',
 'Atacan',
 'Olmesartan amlodipine',
 'irbestartan',
 'atacand +',
 'To

In [16]:
values_to_drop_arbnames = {'VE'}
arb_matched_values = [x for x in arb_matched_values if x not in values_to_drop_arbnames]

In [17]:
arb_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, arb_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# arb_all.to_csv("../preprocessed_data/AIBL/arb_all_results_AIBL_missing.csv",index=False)

### Beta Blocker

In [18]:
beta_blockers_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='beta_blockers')
beta_blockers_names = list(set(beta_blockers_files['Drug_name'].dropna()).union(beta_blockers_files['Brand_name'].dropna()))

In [19]:
beta_blockers_matched_values = med_utils.get_matched_values(med_df, beta_blockers_names)

In [20]:
beta_blockers_matched_values

['Metopropol',
 'SOTALOL',
 'atenalol',
 'Sotalol hydrocholride',
 'metoprolol  AN',
 'TERNORMIN',
 'Metoprolol Tart',
 'ATENALOL',
 'timolol',
 'Betaloc (Metoprolol tartrate)',
 'toprol-XL',
 'Atenol',
 'tenormin',
 'atenolol',
 'METOPROLOL',
 'Timolol',
 'metopropol',
 'METOPROLOL (Minax)',
 'METOPOLOL',
 'DILATREND (Carvedilol)',
 'Atenolol',
 'Apo metoprolol',
 'Apo-Metoprolol',
 'METOPROLOL TARTRATE',
 'Tenormin',
 'Tenormin Noten Tensig Atehexal',
 'Meteprolol',
 'propanolol',
 'tenaormin',
 'metropolol',
 'Propanolol',
 'VE',
 'Carvedilol (Dicarz)',
 'carvediol',
 'Metoprolol',
 'metoprocol',
 'Betaloc (Beta blocker)',
 'Bisoprolol Fumarate',
 'Metropolol',
 'Timolol Eye drops',
 'ATENOLOL',
 'nebivolol',
 'METROPROLOL',
 'Inderal',
 'propraholol',
 'Sotalol',
 'Toprol XL',
 'sotalol',
 'metopolol',
 'Metoprolol Tartrate',
 'Bisoprolol',
 'bisoprodol',
 'Metoprolol tartrate (sz)',
 'metoprolol',
 'Timolol eye-drops 0.5%',
 'metroprolol',
 'Nebivolol',
 'Metoprolo',
 'carvedilol'

In [21]:
values_to_drop_BetaBlknames = {'ACE', 'VE'}
beta_blockers_matched_values = [x for x in beta_blockers_matched_values if x not in values_to_drop_BetaBlknames]

In [22]:
beta_blockers_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, beta_blockers_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# beta_blockers_all.to_csv("../preprocessed_data/AIBL/beta_blockers_all_results_AIBL_missing.csv",index=False)

### Calcium Channel Blockers

In [23]:
Ca_channel_blockers_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='Ca_channel_blockers')
Ca_channel_blockers_names = list(set(Ca_channel_blockers_files['Drug_name'].dropna()).union(Ca_channel_blockers_files['Brand_name'].dropna()))

In [24]:
Ca_channel_blockers_matched_values = med_utils.get_matched_values(med_df, Ca_channel_blockers_names)

In [25]:
Ca_channel_blockers_matched_values

['lercanidipone',
 'Amiodipine',
 'LERCAN',
 'amlodopine',
 'Diltiazem',
 'Valsartan amlodipine',
 'lercandipine',
 'Norvascc',
 'Nifedipine Adefin XL',
 'lercanidipine',
 'Lercandipine',
 'Nifedipine',
 'Amolodipine',
 'apo- lercanidipine',
 'PLENDIL',
 'zanidip',
 'CALCIUM',
 'Calcium',
 'amlodipine',
 'Lercanidipine-HCI',
 'Pritor/Amlodipine',
 'Amlodipine',
 'calcium +',
 'Exforge (amlodipine/valsartan)',
 'Zanidip, 20 ml',
 'norvasc',
 'tazac',
 'cardizem cd',
 'FELODIPINE',
 'nifedipine',
 'ZANIDIP/ZIRCOL',
 'amlo',
 'Olmesartan amlodipine',
 'Verapamil',
 'Norvac',
 'calcium',
 'Amrodipine',
 'Apo-amlodipine',
 'AMLO',
 'LERCANIDIPINE/HYDROCHLORIDE',
 'Norvasc',
 'Felodipine ER',
 'Amlodipine Besylate',
 'PLENDIL ER',
 'AMLODAPINE',
 'Amlo',
 'Lercanidipine Hydrochloride',
 'Plendil',
 'ADALAT',
 'Adalat',
 'Lercanidipine (Zircol)',
 'Tazac',
 'adalat',
 'nitedipine',
 'lerconidipine',
 'VE',
 'almodipine',
 'plendiler',
 'NAC',
 'Cardizem',
 'ZANIDIP',
 'felodipine',
 'cardizem

In [26]:
values_to_drop_ccbnames = {'calcium +', 'Calcium', 'calcium', 'CALCIUM', 'Fe', 'VE', 'NAC',}
Ca_channel_blockers_matched_values = [x for x in Ca_channel_blockers_matched_values if x not in values_to_drop_ccbnames]

In [27]:
Ca_channel_blockers_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, Ca_channel_blockers_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# Ca_channel_blockers_all.to_csv("../preprocessed_data/AIBL/Ca_channel_blockers_all_results_AIBL_missing.csv",index=False)

### Diuretics

In [28]:
diuretics_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='diuretics')
diuretics_names = list(set(diuretics_files['Drug_name'].dropna()).union(diuretics_files['Brand_name'].dropna()))

In [29]:
diuretics_matched_values = med_utils.get_matched_values(med_df, diuretics_names)

In [30]:
diuretics_matched_values

['perindopril / indapamide',
 'spirano lactone',
 'Spironolacton',
 'LASIX-M',
 'Spiractin 25',
 'Telmisartan/Hydrochlorothiazide',
 'Valsartan/Hydrochlorothiazide',
 'Hydrochlorothiazide',
 'Lasix',
 'LASIX',
 'aldactone',
 'Chlorthal',
 'valsartan/hydrochlorothiazide',
 'lasix M',
 'none',
 'furosemide',
 'indapanmide',
 'hydrchlorothiazide',
 'Aldactone',
 'Spironolactone',
 'diuretic',
 'Karvezide (150 mg irbesartan/12.5 mg hydrochlorothiazide)',
 'INDAPAMIDE',
 'SPIRONOLACTONE',
 'indapamide',
 'spirolactone',
 'Perindopril/Indapamide',
 'Urex (Furosemide)',
 'Iron',
 'Spiractin',
 'Indapamide SR',
 'IRON',
 'Furosemide',
 'Indapamide',
 'Potassium',
 'iron',
 'hydrochlorothiazide',
 'spiractin',
 'Avapro HCT  Hydrochlorothiazide',
 'hygroton',
 'perindopril/indapamide',
 'hydrochlorothiazidie',
 'ACE',
 'Spiractin (Aldactone)',
 'Natrilix (Indapamide hemihydrate)',
 'lasix',
 'Diuretic',
 'Hygroton',
 'Valsartan hydrochlorothiazide',
 'Spiractin (Spironolactose)',
 'pironolactone

In [31]:
values_to_drop_diureticsnames = {'Iron', 'Potassium', 'ACE', 'none', 'IRON', 'iron'}
diuretics_matched_values = [x for x in diuretics_matched_values if x not in values_to_drop_diureticsnames]

In [32]:
diuretics_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, diuretics_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# diuretics_all.to_csv("../preprocessed_data/AIBL/diuretics_all_results_AIBL_missing.csv",index=False)

### statins 

In [33]:
statin_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='statin')
statin_names = list(set(statin_files['Drug_name'].dropna()).union(statin_files['Brand_name'].dropna()))

In [34]:
statin_matched_values = med_utils.get_matched_values(med_df, statin_names)

In [35]:
statin_matched_values

['Pravachol',
 'Rosuvastatin Apotex',
 'Lipitor??',
 'rosuvastin',
 'Rosvastatin',
 'pravastin',
 'Simvarstatin',
 'Atorvachol/Atorvastatin/Lipitor',
 'Zocor (Zimvastatin)',
 'Atorvastin',
 'Ezetimibel Rosuvastatin',
 'atoravastatin',
 'ato-atorvastatin',
 'risuvastatin',
 'LIPITOR',
 'Crestor (5mgm)',
 'Atoruastatin',
 'SM-Simvastatin',
 'rosuvostatin',
 'CRESTOR',
 'atorvastaten',
 'SIMVASTATIN',
 'Prevastatin',
 'Liptor',
 'Simbastatin',
 'rosuvstatin',
 'lipitor',
 'atoruastatin',
 'Pravacol',
 'pravacol',
 'Rosulvastatin',
 'ZOCOR',
 'pravachol',
 'Simvastin',
 'ATORVASTATIN SANDOZ',
 'apo- rosuvastatin',
 'zocor',
 'Symvastatin',
 'Zocor (simvastatin)',
 'PRAVACHOL',
 'Simuastatin',
 'apo rosuvastatin',
 'Apo-Rosuvastatin',
 'Provastatin',
 'amalodipine vasolate/atorvastatin calcium',
 'ATORVASTATIN',
 'Atorvastatin (sz)',
 'SIMVASTIN',
 'pravastatin',
 'Rosuvastin',
 'ASORVASTATIN',
 'Lipitor',
 'Crestor (cholesterol)',
 'Crestor',
 'Lipator (Atorvastatin)',
 'Lipitor 20mg',
 'S

In [36]:
values_to_drop_statinnames = {'Iron', 'Potassium', 'ACE', 'none', 'IRON', 'iron'}
statin_matched_values = [x for x in statin_matched_values if x not in values_to_drop_statinnames]

In [37]:
statin_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, statin_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# statin_all.to_csv("../preprocessed_data/AIBL/statin_all_results_AIBL_missing.csv",index=False)

### metformin

In [38]:
metformin_files = pd.read_excel('../raw_datasets/names.xlsx', sheet_name='metformin')
metformin_names = list(set(metformin_files['Drug_name'].dropna()).union(metformin_files['Brand_name'].dropna()))

In [39]:
metformin_matched_values = med_utils.get_matched_values(med_df, metformin_names)

In [40]:
metformin_matched_values

['DIABEX',
 'formetaspen',
 'DIAFORMIN',
 'Diabex XR ER',
 'medformin',
 'metformin',
 'diaformin xr',
 'Metformin Hydrochloride',
 'Diabex',
 'METFORMIN',
 'GLUCOVANCE',
 'Medformin',
 'DIAFORMIN 500',
 'Diabex XR 1000',
 'Jardiamet',
 'VE',
 'Metex',
 'jardiamet',
 'Diabex XR',
 'Metformin',
 'Jardiamet 5/1000',
 'Formet Aspen',
 'Metformin HCL',
 'Glucovance (500/5mg)',
 'Diaformin',
 'diabex',
 'metformin hydrochloride',
 'Glucophage',
 'Glucohexal',
 'Diaformin XR',
 'dianformin',
 'Diaformin Xr',
 'Diaformin (1gm)',
 'diabex XD',
 'DIABEX XR',
 'metex xr',
 'Glucovance',
 'diaformin']

In [41]:
values_to_drop_metforminnames = {'VE'}
metformin_matched_values = [x for x in metformin_matched_values if x not in values_to_drop_metforminnames]

In [42]:
metformin_all = med_utils.medication_all_timepoints(
    AIBL_df_selected, metformin_matched_values, medication_columns, selected_columns,
    id_col="id", visit_col="visit_no"
)
# metformin_all.to_csv("../preprocessed_data/AIBL/metformin_all_results_AIBL_missing.csv",index=False)

### control group
never have any of the 5 type of medications

In [43]:
drug_lists = [
    acei_matched_values,
    arb_matched_values,
    beta_blockers_matched_values,
    Ca_channel_blockers_matched_values,
    diuretics_matched_values,
    statin_matched_values,
    metformin_matched_values
]

all_target_drugs = set().union(*drug_lists)

In [44]:
row_has_target_drug = (
    AIBL_df_selected[medication_columns]
      .applymap(lambda x: med_utils.contains_any_medication_type(x, all_target_drugs))
      .any(axis=1)
)
ids_with_any_target_drug = AIBL_df_selected.loc[row_has_target_drug, 'id'].unique()
controls_df = AIBL_df_selected[~AIBL_df_selected['id'].isin(ids_with_any_target_drug)].copy()


In [45]:
controls_df["Has_Medication_This_Visit"] = False
controls_df = controls_df.drop(columns=[c for c in medication_columns if c in controls_df.columns])
# controls_df.to_csv("../preprocessed_data/AIBL/controls_df_AIBL_missing.csv",index=False)

### Adjust for the later modelling in R

In [46]:
datasets = {
    "ACEi": acei_all,
    "ARB": arb_all,
    "BetaBlk": beta_blockers_all,
    "CCB": Ca_channel_blockers_all,
    "Diuretic": diuretics_all,
    "Statin": statin_all,
    "Metformin": metformin_all,
    "Control": controls_df  
}

In [47]:
merged_df = med_utils.merge_medication_longitudinal(datasets, control_key="Control", extra_cols=["visit_no", "MMSE", "CDR"])

In [48]:
print(f"Initial rows: {len(merged_df)}")
merged_clean = merged_df.dropna(subset=['status'])
merged_clean = merged_clean[~(merged_clean['MMSE'].isna() & merged_clean['CDR'].isna())]
print(f"After dropping missings: {len(merged_clean)}")
merged_clean = merged_clean[merged_clean.groupby('id')['visit_no'].transform('nunique') > 1]
print(f"After removing number of records: {len(merged_clean)}")

Initial rows: 9210
After dropping missings: 9210
After removing number of records: 8472


In [49]:
merged_clean = med_utils.clean_and_filter_participants(
    merged_clean,
    id_col='id',
    class_col ='status',
    time_col = 'visit_no',
    medication_ever_col=None,
    min_visits=2,
    if_print=True
)

Remaining participants: 1,711
Dropped participants:   0


In [50]:
### AIBL use the age informations,
merged_clean['months_since_baseline'] = (
    merged_clean.groupby('id')['age']
      .transform(lambda x: (x - x.min()) * 12)  # convert years to months
      .round()
      .astype(int)
)

In [51]:
merged_clean = med_utils.clean_longitudinal(merged_clean)

In [52]:
# merged_clean[merged_clean.duplicated(subset=['id', 'visit_no'], keep=False)].sort_values(['id','visit_no'])

In [53]:
max_tab = merged_clean.groupby('id')['Total_Meds'].max().value_counts()
print("\nDistribution of max Total_Meds per participant:\n", max_tab)



Distribution of max Total_Meds per participant:
 Total_Meds
0    883
1    488
2    239
3     87
4     13
5      1
Name: count, dtype: int64


In [54]:
merged_clean.to_csv("../preprocessed_data/AIBL/AIBL_merge_full_missing.csv",index=False)

In [55]:
summary_df = med_utils.baseline_summary(merged_clean)
summary_df

Unnamed: 0,Measure,Value
0,Age at baseline (year),72.1 ± 7.3
1,Gender (Female),949 (55.5%)
2,Education (year),12.8 ± 3.1
3,APOE4 (YES)**,610 (35.7%)
4,Visits (Record),4.0 [3.0–7.0]
5,Follow up intervals (Month),8.0 – 189.0
6,CU at baseline,1211 (70.8%)
7,MCI at baseline,252 (14.7%)
8,AD at baseline,248 (14.5%)
9,Average medication taken at baseline,0.5 ± 0.7
