In [224]:
import pandas as pd
import numpy as np

In [225]:
# Load the required csv files
registry_csv_file_path = "../data/REGISTRY_29Oct2024.csv"                       # For EXAMDATE and RID
genetics_csv_file_path = "../data/GENETIC_29Oct2024.csv"                        # For APVOLUME
upenbiomk_csv_file_path = "../data/UPENNBIOMK_ROCHE_ELECSYS_29Oct2024.csv"      # For ABETA42, TAU and PTAU
fugi_abeta_path = "../data/FUJIREBIOABETA_05Nov2024.csv"                        # For ABETA42, ABETA42/40
ugotptau181_csv_file_path = "../data/UGOTPTAU181_06_18_20_29Oct2024.csv"        # For PLASMAPTAU181
blenn_cfl_nfl_path = "../data/BLENNOWCSFNFL_05Nov2024.csv"                      # For CSF Nfl
ucsffsx6_csv_file_path = "../data/UCSFFSX6_07_06_23_29Oct2024.csv"              # For Cortical thickness in certain temporal region
ucsffsx51_csv_file_path = "../data/UCSFFSL51_03_01_22_29Oct2024.csv"            # For Cortical thickness in ceratin temporal region
ptdemog_csv_file_path = "../data/PTDEMOG_29Oct2024.csv"                         # For Patient gender and DOB
medhist_csv_file_path = "../data/MEDHIST_29Oct2024.csv"                         # For medical history of patient


# Selected columns
registry_selected_columns = [
    "RID",
    "EXAMDATE"
]

genetics_selected_columns = [
    "RID",
    "VISDATE",
    "APVOLUME"    
]
upenbiomk_selected_columns = [
    "RID",
    "EXAMDATE",
    "ABETA42",
    "TAU",
    "PTAU"
]
fugi_abeta_columns = [
    "RID",
    "EXAMDATE", 
    "ABETA42_40",
    "ABETA42",
    # "ABETA40"
]
ugotptau181_selected_columns = [
    "RID",
    "EXAMDATE",
    "PLASMAPTAU181"
]
blenn_cfl_nfl_columns = [
    "RID",
    "EXAMDATE", 
    "CSFNFL"
]
ucsffsx6_selected_columns = [
    "RID",
    "EXAMDATE",
    "ST58TA",  # Cortical Thickness Average of Left Superior Temporal
    "ST117TA", # Cortical Thickness Average of Right Superior Temporal
    "ST40TA",  # Cortical Thickness Average of Left Middle Temporal
    "ST99TA",  # Cortical Thickness Average of Right Middle Temporal
    "ST32TA",  # Cortical Thickness Average of Left Inferior Temporal
    "ST91TA",  # Cortical Thickness Average of Right Inferior Temporal
    "ST60TA",  # Cortical Thickness Average of Left Temporal Pole
    "ST119TA", # Cortical Thickness Average of Right Temporal Pole
    "ST62TA",  # Cortical Thickness Average of Left Transverse Temporal
    "ST121TA"  # Cortical Thickness Average of Right Transverse Temporal
]
ucsffsx51_selected_columns = [
    "RID",
    "EXAMDATE",
    "ST58TA",  # Cortical Thickness Average of Left Superior Temporal
    "ST117TA", # Cortical Thickness Average of Right Superior Temporal
    "ST40TA",  # Cortical Thickness Average of Left Middle Temporal
    "ST99TA",  # Cortical Thickness Average of Right Middle Temporal
    "ST32TA",  # Cortical Thickness Average of Left Inferior Temporal
    "ST91TA",  # Cortical Thickness Average of Right Inferior Temporal
    "ST60TA",  # Cortical Thickness Average of Left Temporal Pole
    "ST119TA", # Cortical Thickness Average of Right Temporal Pole
    "ST62TA",  # Cortical Thickness Average of Left Transverse Temporal
    "ST121TA"  # Cortical Thickness Average of Right Transverse Temporal
]
ptdemog_selected_columns = [
    "RID",
    "VISDATE",
    "PTGENDER",
    "PTDOB"
]
medhist_selected_columns = [
    "RID",
    "VISDATE",
    "MH14ALCH",  # Alcohol Abuse
    "MH15DRUG",  # Drug Abuse
    "MH16SMOK",  # Smoking
    "MH2NEURL",  # Neurologic (other than AD)
    "MHPSYCH"   # Psychiatric Conditions
]

pd.set_option('display.max_columns', None)

registry_df = pd.read_csv(registry_csv_file_path, usecols=registry_selected_columns)
genetics_df = pd.read_csv(genetics_csv_file_path, usecols=genetics_selected_columns)
upenbiomk_df = pd.read_csv(upenbiomk_csv_file_path, usecols=upenbiomk_selected_columns)
fugi_abeta_df = pd.read_csv(fugi_abeta_path, usecols= fugi_abeta_columns)
ugotptau181_df = pd.read_csv(ugotptau181_csv_file_path, usecols=ugotptau181_selected_columns)
blenn_cfl_nfl_df = pd.read_csv(blenn_cfl_nfl_path, usecols= blenn_cfl_nfl_columns)
ucsffsx6_df = pd.read_csv(ucsffsx6_csv_file_path, usecols=ucsffsx6_selected_columns)
ucsffsx51_df = pd.read_csv(ucsffsx51_csv_file_path, usecols=ucsffsx51_selected_columns)
ptdemog_df = pd.read_csv(ptdemog_csv_file_path, usecols=ptdemog_selected_columns)
medhist_df = pd.read_csv(medhist_csv_file_path, usecols=medhist_selected_columns)

# dfs = [registry_df, genetics_df, upenbiomk_df, ugotptau181_df, ucsffsx51_df, ptdemog_df, medhist_df]
dfs = [genetics_df, upenbiomk_df, fugi_abeta_df, blenn_cfl_nfl_df, ugotptau181_df, ucsffsx51_df, ptdemog_df, medhist_df]

# print(f"registry {registry_df['RID'].nunique()} len: {len(registry_df)}")
print(f"genetics unique RIDs: {genetics_df['RID'].nunique()} len: {len(genetics_df)}")
print(f"upenbiomk unique RIDs: {upenbiomk_df['RID'].nunique()} len: {len(upenbiomk_df)}")
print(f"fugi_abeta unique RIDs: {fugi_abeta_df['RID'].nunique()} len: {len(fugi_abeta_df)}")
print(f"ugotptau181 unique RIDs: {ugotptau181_df['RID'].nunique()} len: {len(ugotptau181_df)}")
print(f"blenn_cfl_nfl unique RIDs: {blenn_cfl_nfl_df['RID'].nunique()} len: {len(blenn_cfl_nfl_df)}")
print(f"ucsffsx6 unique RIDs: {ucsffsx6_df['RID'].nunique()} len: {len(ucsffsx6_df)}")
print(f"ucsffsx51 unique RIDs: {ucsffsx51_df['RID'].nunique()} len: {len(ucsffsx51_df)}")
print(f"ptdemog unique RIDs: {ptdemog_df['RID'].nunique()} len: {len(ptdemog_df)}")
print(f"medhist unique RIDs: {medhist_df['RID'].nunique()} len: {len(medhist_df)}")

genetics unique RIDs: 2719 len: 8302
upenbiomk unique RIDs: 1660 len: 3174
fugi_abeta unique RIDs: 423 len: 442
ugotptau181 unique RIDs: 1191 len: 3758
blenn_cfl_nfl unique RIDs: 399 len: 415
ucsffsx6 unique RIDs: 1079 len: 2091
ucsffsx51 unique RIDs: 689 len: 3311
ptdemog unique RIDs: 4379 len: 5441
medhist unique RIDs: 2491 len: 3083


In [226]:
# Convert the column name from VISADATE TO EXAMDATE, as they mean the same
for df in dfs:
    if "VISDATE" in df.columns:
        df.rename(columns={"VISDATE":"EXAMDATE"}, inplace=True)
        print(df.columns)

Index(['RID', 'EXAMDATE', 'APVOLUME'], dtype='object')
Index(['RID', 'EXAMDATE', 'PTGENDER', 'PTDOB'], dtype='object')
Index(['RID', 'EXAMDATE', 'MHPSYCH', 'MH2NEURL', 'MH14ALCH', 'MH15DRUG',
       'MH16SMOK'],
      dtype='object')


In [227]:
for df in dfs:
    for columns in df.columns:
        if columns not in ["RID", "EXAMDATE"]:
            duplicates = (
                df.groupby(["RID", "EXAMDATE"])
                .filter(lambda x: len(x)> 1 and x[columns].nunique()>1)
            )
            print(f"There are  {len(duplicates[duplicates[columns] == -4])} patients that have -4 {columns} value on the same exam date")

There are  22 patients that have -4 APVOLUME value on the same exam date
There are  0 patients that have -4 ABETA42 value on the same exam date
There are  0 patients that have -4 TAU value on the same exam date
There are  0 patients that have -4 PTAU value on the same exam date
There are  0 patients that have -4 ABETA42 value on the same exam date
There are  0 patients that have -4 ABETA42_40 value on the same exam date
There are  0 patients that have -4 CSFNFL value on the same exam date
There are  0 patients that have -4 PLASMAPTAU181 value on the same exam date
There are  0 patients that have -4 ST32TA value on the same exam date
There are  0 patients that have -4 ST40TA value on the same exam date
There are  0 patients that have -4 ST58TA value on the same exam date
There are  0 patients that have -4 ST60TA value on the same exam date
There are  0 patients that have -4 ST62TA value on the same exam date
There are  0 patients that have -4 ST91TA value on the same exam date
There are

In [228]:
# There are data where for a patient, there are two values of APVOLUME on the same EXAMDATE. And one of them is -4
# -4 is the Missing data code used by ADNI
duplicates = (
    genetics_df.groupby(["RID", "EXAMDATE"])
    .filter(lambda x: len(x)> 1 and x["APVOLUME"].nunique()>1)
)
print("There are" , len(duplicates[duplicates["APVOLUME"] == -4]), "patients that have -4 APVOLUME value on the same exam date")

There are 22 patients that have -4 APVOLUME value on the same exam date


In [229]:
# Dropping -4 values when they are present on the same day as the actual APVOLUME of the patient
genetics_df = genetics_df.drop(duplicates[duplicates["APVOLUME"] == -4].index)
print(f"genetics RID: {genetics_df['RID'].nunique()} len: {len(genetics_df)}")

genetics RID: 2719 len: 8280


In [230]:
# There are 2 different values for ABETA42 for RID 42 Oon 2011-04-14
# There are many more examples like these
print("upenbiomk_df")
print(upenbiomk_df[upenbiomk_df["RID"] == 42])
print("fugi_abeta_df")
print(fugi_abeta_df[fugi_abeta_df["RID"] == 42])

upenbiomk_df
    RID    EXAMDATE  ABETA42    TAU   PTAU
38   42  2005-11-10   1258.0  250.1  18.23
39   42  2006-11-09    766.5  205.4  18.40
40   42  2008-08-18   1224.0  208.7  18.44
41   42  2010-03-18   1011.0  211.5  18.10
42   42  2011-04-14   1483.0  238.8  19.56
43   42  2013-01-24   1139.0  218.4  17.69
fugi_abeta_df
   RID    EXAMDATE  ABETA42  ABETA42_40
0   42  2011-04-14     1022       0.098


In [231]:
# Merge UPENN(Uni of Pennsylvania) and Fuji(Fujirebio) dataset
merged_upen_fuji = pd.merge(upenbiomk_df, fugi_abeta_df, on=["RID", "EXAMDATE"], suffixes=["_UPENN", "_FUJI"], how="outer")

In [232]:
# Replace NaNs with 0
merged_upen_fuji["ABETA42_UPENN"] = merged_upen_fuji["ABETA42_UPENN"].fillna(0)     
merged_upen_fuji["ABETA42_FUJI"] = merged_upen_fuji["ABETA42_FUJI"].fillna(0)

# Calculate mean
merged_upen_fuji["ABETA42"] =  np.where(
    # merged_upen_fuji[["ABETA42_UPENN", "ABETA42_FUJI"]].notna().all(axis=1) &     # Uncomment if not doing the above fillna(0)
    (merged_upen_fuji[["ABETA42_UPENN", "ABETA42_FUJI"]] != 0).all(axis=1),         # Calculatye mean when values in both columns are non-zero
    merged_upen_fuji[["ABETA42_UPENN", "ABETA42_FUJI"]].mean(axis=1),              
    np.where(                                                                       # If either of the value in the column is 0
        # merged_upen_fuji["ABETA42_UPENN"].notna() &                               # Uncomment if not doing the above fillna(0)
        merged_upen_fuji["ABETA42_UPENN"] != 0,                                     # Check if ABETA42_UPENN is non-zero
        merged_upen_fuji["ABETA42_UPENN"],                                          # Use ABETA42_UPENN if valid
        merged_upen_fuji["ABETA42_FUJI"]                                            # Else ABETA42_FUJI
    )
)

In [233]:
# Drop the ABETA42_UPENN and ABETA42_FUJI column
merged_upen_fuji.drop(columns=["ABETA42_UPENN", "ABETA42_FUJI"], inplace=True)

In [234]:
merged_upen_fuji[merged_upen_fuji["RID"] == 42]

Unnamed: 0,RID,EXAMDATE,TAU,PTAU,ABETA42_40,ABETA42
38,42,2005-11-10,250.1,18.23,,1258.0
39,42,2006-11-09,205.4,18.4,,766.5
40,42,2008-08-18,208.7,18.44,,1224.0
41,42,2010-03-18,211.5,18.1,,1011.0
42,42,2011-04-14,238.8,19.56,0.098,1252.5
43,42,2013-01-24,218.4,17.69,,1139.0


In [235]:
# Remove the UPENN(Uni of Pennsylvania) and Fuji(Fujirebio) dataset from the list of datasets
dfs = [df for df in dfs if not df.equals(upenbiomk_df) and not df.equals(fugi_abeta_df)]

# Add the merged dataset of UPENN(Uni of Pennsylvania) and Fuji(Fujirebio) to the list of datasets
dfs.append(merged_upen_fuji)

In [236]:
removed_rows = []                                                                                               # Remove it, its only there to check
filtered_rows_to_check = []

def filter_patients_within_six_months(group):
    # Ensure EXAMDATE is in datetime format
    group["EXAMDATE"] = pd.to_datetime(group["EXAMDATE"] , errors='coerce')

    # Sort by EXAMDATE
    group.sort_values(by="EXAMDATE", ascending=True, inplace=True)

    # List containing the rows that are atleast 6 months apart for each patient
    filtered_rows = []

    for _, row in group.iterrows():
        # Check if 'filtered_rows' is empty or if the difference in EXAMDATE is atleast 6 months
        if not filtered_rows or row["EXAMDATE"] >= filtered_rows[-1]["EXAMDATE"] + pd.DateOffset(months=6):
            filtered_rows.append(row)
            filtered_rows_to_check.append(row)
        else:                                                                                                   # Remove it, its only there to check
            removed_rows.append(row)
    
    return pd.DataFrame(filtered_rows)

for df in dfs:
    df = df.groupby("RID", group_keys=False).apply(filter_patients_within_six_months).reset_index(drop=True)

In [238]:
# Print patient with "RID"=125 that was removed because 
# the patient had an EXAMDATE within 6 months
print([s for s in removed_rows if s.get("RID") ==125])

[RID                         125
EXAMDATE    2011-05-26 00:00:00
APVOLUME                   -4.0
Name: 2702, dtype: object, RID                              125
EXAMDATE         2011-05-26 00:00:00
PLASMAPTAU181                 18.265
Name: 113, dtype: object]


In [239]:
# See for the exact date of 2011-05-26
in_filtered = [s for s in filtered_rows_to_check if s.get("RID") == 125]
in_filtered

[RID                         125
 EXAMDATE    2006-01-05 00:00:00
 APVOLUME                    6.0
 Name: 95, dtype: object,
 RID                         125
 EXAMDATE    2011-05-26 00:00:00
 APVOLUME                    8.0
 Name: 1580, dtype: object,
 RID                         125
 EXAMDATE    2012-06-27 00:00:00
 APVOLUME                   -4.0
 Name: 2606, dtype: object,
 RID                         125
 EXAMDATE    2013-06-05 00:00:00
 APVOLUME                   -4.0
 Name: 3846, dtype: object,
 RID                         125
 EXAMDATE    2015-01-22 00:00:00
 APVOLUME                   -4.0
 Name: 5268, dtype: object,
 RID                              125
 EXAMDATE         2010-03-02 00:00:00
 PLASMAPTAU181                 76.067
 Name: 111, dtype: object,
 RID                              125
 EXAMDATE         2011-05-26 00:00:00
 PLASMAPTAU181                  25.84
 Name: 112, dtype: object,
 RID                              125
 EXAMDATE         2012-06-27 00:00:00
 PLASMAPT

In [240]:
from functools import reduce

merged_df = reduce(lambda left, right: pd.merge(left, right, on=["RID", "EXAMDATE"], how='outer'), dfs)
len(merged_df)

16878

In [241]:
merged_df.describe()

Unnamed: 0,RID,APVOLUME,CSFNFL,PLASMAPTAU181,ST32TA,ST40TA,ST58TA,ST60TA,ST62TA,ST91TA,ST99TA,ST117TA,ST119TA,ST121TA,PTGENDER,MHPSYCH,MH2NEURL,MH14ALCH,MH15DRUG,MH16SMOK,TAU,PTAU,ABETA42_40,ABETA42
count,16878.0,6790.0,415.0,3819.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,5344.0,3168.0,3168.0,3168.0,3168.0,3168.0,3179.0,3167.0,446.0,3215.0
mean,4951.293874,1.340884,1465.816867,18.961121,2.717821,2.725335,2.574538,3.469113,2.210322,2.765076,2.780147,2.598437,3.505418,2.246651,1.426085,0.348801,0.314394,0.043876,0.00947,0.398043,286.838191,27.283227,0.065126,1034.281913
std,34353.643034,6.294932,1020.097069,14.719812,0.239194,0.229197,0.226311,0.455701,0.248271,0.236469,0.213069,0.209134,0.517448,0.263617,0.751417,0.476666,0.464347,0.204852,0.096866,0.489572,128.473103,14.27821,0.044928,607.768732
min,1.0,-4.0,405.0,0.362,1.613,1.579,1.575,1.341,1.363,1.575,1.626,1.528,1.238,1.308,-4.0,0.0,0.0,0.0,0.0,0.0,80.08,8.0,0.022,0.0
25%,1338.25,-4.0,967.5,11.105,2.581,2.615,2.442,3.243,2.047,2.639,2.678,2.472,3.282,2.075,1.0,0.0,0.0,0.0,0.0,0.0,195.65,17.165,0.04125,579.325
50%,4339.0,-4.0,1250.0,16.343,2.7435,2.754,2.592,3.5415,2.219,2.794,2.799,2.615,3.615,2.251,1.0,0.0,0.0,0.0,0.0,0.0,257.0,23.54,0.055,845.15
75%,5066.0,9.0,1644.0,23.5165,2.877,2.878,2.73075,3.77575,2.381,2.927,2.918,2.736,3.855,2.427,2.0,1.0,1.0,0.0,0.0,1.0,344.7,33.485,0.08975,1389.0
max,999999.0,10.0,12647.0,451.398,3.424,3.294,3.151,4.684,2.96,3.381,3.409,3.27,4.548,3.006,2.0,1.0,1.0,1.0,1.0,1.0,1018.0,108.5,0.827,4779.0


In [242]:
merged_df["RID"].nunique()

4382

In [243]:
# Ask if to include MMSE, as it shows weak associations with CSF levels in isolation
# Maybe, it might improve performance when analysed with other features??
adnimerge_path = "../data/ADNIMERGE_05Nov2024.csv"
adnimerge_columns = [
    "RID", 
    "EXAMDATE", 
    "MMSE"
]

adnimerge_df = pd.read_csv(adnimerge_path, usecols=adnimerge_columns)
print(adnimerge_df.head())

   RID    EXAMDATE  MMSE
0    2  2005-09-08  28.0
1    3  2005-09-12  20.0
2    3  2006-03-13  24.0
3    3  2006-09-12  17.0
4    3  2007-09-12  19.0
