In [85]:
import pandas as pd

# Load the required csv files
registry_csv_file_path = "../data/REGISTRY_29Oct2024.csv"                   # For EXAMDATE and RID
genetics_csv_file_path = "../data/GENETIC_29Oct2024.csv"                        # For APVOLUME
upenbiomk_csv_file_path = "../data/UPENNBIOMK_ROCHE_ELECSYS_29Oct2024.csv"      # For ABETA42, TAU and PTAU
ugotptau181_csv_file_path = "../data/UGOTPTAU181_06_18_20_29Oct2024.csv"        # For PLASMAPTAU181
ucsffsx6_csv_file_path = "../data/UCSFFSX6_07_06_23_29Oct2024.csv"              # For Cortical thickness in certain temporal region
ucsffsx51_csv_file_path = "../data/UCSFFSL51_03_01_22_29Oct2024.csv"            # For Cortical thickness in ceratin temporal region
ptdemog_csv_file_path = "../data/PTDEMOG_29Oct2024.csv"                         # For Patient gender and DOB
medhist_csv_file_path = "../data/MEDHIST_29Oct2024.csv"                         # For medical history of patient


# Selected columns
registry_selected_columns = [
    "RID",
    "EXAMDATE"
]

genetics_selected_columns = [
    "RID",
    "VISDATE",
    "APVOLUME"    
]
upenbiomk_selected_columns = [
    "RID",
    "EXAMDATE",
    "ABETA42",
    "TAU",
    "PTAU"
]
ugotptau181_selected_columns = [
    "RID",
    "EXAMDATE",
    "PLASMAPTAU181"
]
ucsffsx6_selected_columns = [
    "RID",
    "EXAMDATE",
    "ST58TA",  # Cortical Thickness Average of Left Superior Temporal
    "ST117TA", # Cortical Thickness Average of Right Superior Temporal
    "ST40TA",  # Cortical Thickness Average of Left Middle Temporal
    "ST99TA",  # Cortical Thickness Average of Right Middle Temporal
    "ST32TA",  # Cortical Thickness Average of Left Inferior Temporal
    "ST91TA",  # Cortical Thickness Average of Right Inferior Temporal
    "ST60TA",  # Cortical Thickness Average of Left Temporal Pole
    "ST119TA", # Cortical Thickness Average of Right Temporal Pole
    "ST62TA",  # Cortical Thickness Average of Left Transverse Temporal
    "ST121TA"  # Cortical Thickness Average of Right Transverse Temporal
]
ucsffsx51_selected_columns = [
    "RID",
    "EXAMDATE",
    "ST58TA",  # Cortical Thickness Average of Left Superior Temporal
    "ST117TA", # Cortical Thickness Average of Right Superior Temporal
    "ST40TA",  # Cortical Thickness Average of Left Middle Temporal
    "ST99TA",  # Cortical Thickness Average of Right Middle Temporal
    "ST32TA",  # Cortical Thickness Average of Left Inferior Temporal
    "ST91TA",  # Cortical Thickness Average of Right Inferior Temporal
    "ST60TA",  # Cortical Thickness Average of Left Temporal Pole
    "ST119TA", # Cortical Thickness Average of Right Temporal Pole
    "ST62TA",  # Cortical Thickness Average of Left Transverse Temporal
    "ST121TA"  # Cortical Thickness Average of Right Transverse Temporal
]
ptdemog_selected_columns = [
    "RID",
    "VISDATE",
    "PTGENDER",
    "PTDOB"
]
medhist_selected_columns = [
    "RID",
    "VISDATE",
    "MH14ALCH",  # Alcohol Abuse
    "MH15DRUG",  # Drug Abuse
    "MH16SMOK",  # Smoking
    "MH2NEURL",  # Neurologic (other than AD)
    "MHPSYCH"   # Psychiatric Conditions
]

pd.set_option('display.max_columns', None)

registry_df = pd.read_csv(registry_csv_file_path, usecols=registry_selected_columns)
genetics_df = pd.read_csv(genetics_csv_file_path, usecols=genetics_selected_columns)
upenbiomk_df = pd.read_csv(upenbiomk_csv_file_path, usecols=upenbiomk_selected_columns)
ugotptau181_df = pd.read_csv(ugotptau181_csv_file_path, usecols=ugotptau181_selected_columns)
ucsffsx6_df = pd.read_csv(ucsffsx6_csv_file_path, usecols=ucsffsx6_selected_columns)
ucsffsx51_df = pd.read_csv(ucsffsx51_csv_file_path, usecols=ucsffsx51_selected_columns)
ptdemog_df = pd.read_csv(ptdemog_csv_file_path, usecols=ptdemog_selected_columns)
medhist_df = pd.read_csv(medhist_csv_file_path, usecols=medhist_selected_columns)

# dfs = [registry_df, genetics_df, upenbiomk_df, ugotptau181_df, ucsffsx51_df, ptdemog_df, medhist_df]
dfs = [genetics_df, upenbiomk_df, ugotptau181_df, ucsffsx51_df, ptdemog_df, medhist_df]

# print(f"registry {registry_df['RID'].nunique()} len: {len(registry_df)}")
print(f"genetics unique RIDs: {genetics_df['RID'].nunique()} len: {len(genetics_df)}")
print(f"upenbiomk unique RIDs: {upenbiomk_df['RID'].nunique()} len: {len(upenbiomk_df)}")
print(f"ugotptau181 unique RIDs: {ugotptau181_df['RID'].nunique()} len: {len(ugotptau181_df)}")
print(f"ucsffsx6 unique RIDs: {ucsffsx6_df['RID'].nunique()} len: {len(ucsffsx6_df)}")
print(f"ucsffsx51 unique RIDs: {ucsffsx51_df['RID'].nunique()} len: {len(ucsffsx51_df)}")
print(f"ptdemog unique RIDs: {ptdemog_df['RID'].nunique()} len: {len(ptdemog_df)}")
print(f"medhist unique RIDs: {medhist_df['RID'].nunique()} len: {len(medhist_df)}")

genetics unique RIDs: 2719 len: 8302
upenbiomk unique RIDs: 1660 len: 3174
ugotptau181 unique RIDs: 1191 len: 3758
ucsffsx6 unique RIDs: 1079 len: 2091
ucsffsx51 unique RIDs: 689 len: 3311
ptdemog unique RIDs: 4379 len: 5441
medhist unique RIDs: 2491 len: 3083


In [86]:
# Convert the column name from VISADATE TO EXAMDATE, as they mean the same
for df in dfs:
    if "VISDATE" in df.columns:
        df.rename(columns={"VISDATE":"EXAMDATE"}, inplace=True)
        print(df.columns)

Index(['RID', 'EXAMDATE', 'APVOLUME'], dtype='object')
Index(['RID', 'EXAMDATE', 'PTGENDER', 'PTDOB'], dtype='object')
Index(['RID', 'EXAMDATE', 'MHPSYCH', 'MH2NEURL', 'MH14ALCH', 'MH15DRUG',
       'MH16SMOK'],
      dtype='object')


In [92]:
for df in dfs:
    for columns in df.columns:
        if columns not in ["RID", "EXAMDATE"]:
            duplicates = (
                df.groupby(["RID", "EXAMDATE"])
                .filter(lambda x: len(x)> 1 and x[columns].nunique()>1)
            )
            print(f"There are  {len(duplicates[duplicates[columns] == -4])} patients that have -4 {columns} value on the same exam date")

There are  22 patients that have -4 APVOLUME value on the same exam date
There are  0 patients that have -4 ABETA42 value on the same exam date
There are  0 patients that have -4 TAU value on the same exam date
There are  0 patients that have -4 PTAU value on the same exam date
There are  0 patients that have -4 PLASMAPTAU181 value on the same exam date
There are  0 patients that have -4 ST32TA value on the same exam date
There are  0 patients that have -4 ST40TA value on the same exam date
There are  0 patients that have -4 ST58TA value on the same exam date
There are  0 patients that have -4 ST60TA value on the same exam date
There are  0 patients that have -4 ST62TA value on the same exam date
There are  0 patients that have -4 ST91TA value on the same exam date
There are  0 patients that have -4 ST99TA value on the same exam date
There are  0 patients that have -4 ST117TA value on the same exam date
There are  0 patients that have -4 ST119TA value on the same exam date
There are  0

In [61]:
# There are data where for a patient, there are two values of APVOLUME on the same EXAMDATE. And one of them is -4
# -4 is the Missing data code used by ADNI
duplicates = (
    genetics_df.groupby(["RID", "EXAMDATE"])
    .filter(lambda x: len(x)> 1 and x["APVOLUME"].nunique()>1)
)
print("There are" , len(duplicates[duplicates["APVOLUME"] == -4]), "patients that have -4 APVOLUME value on the same exam date")

There are 22 patients that have -4 APVOLUME value on the same exam date


In [62]:
# Dropping -4 values when they are present on the same day as the actual APVOLUME of teh patient
genetics_df = genetics_df.drop(duplicates[duplicates["APVOLUME"] == -4].index)
print(f"genetics RID: {genetics_df['RID'].nunique()} len: {len(genetics_df)}")

genetics RID: 2719 len: 8280


In [63]:
removed_rows = []
filtered_rows_to_check = []
def filter_patients_within_six_months(group):
    group["EXAMDATE"] = pd.to_datetime(group["EXAMDATE"] , errors='coerce')
    group.sort_values(by="EXAMDATE", ascending=True, inplace=True)

    filtered_rows = []
    for i, row in group.iterrows():
        if not filtered_rows:
            filtered_rows.append(row)
            filtered_rows_to_check.append(row)
        else:
            if row["EXAMDATE"] >= filtered_rows[-1]["EXAMDATE"] + pd.DateOffset(months=6):
                filtered_rows.append(row)
                filtered_rows_to_check.append(row)
            else:
                removed_rows.append(row)
    
    return pd.DataFrame(filtered_rows)


for df in dfs:
    groups = df.groupby("RID")
    df = df.groupby("RID", group_keys=False).apply(filter_patients_within_six_months).reset_index(drop=True)

In [65]:
# Print patient with "RID"=125 that was removed because 
# the patient had an EXAMDATE within 6 months
print([s for s in removed_rows if s.get("RID") ==125])

[RID                         125
EXAMDATE    2011-05-26 00:00:00
APVOLUME                   -4.0
Name: 2702, dtype: object, RID                              125
EXAMDATE         2011-05-26 00:00:00
PLASMAPTAU181                 18.265
Name: 113, dtype: object]


In [66]:
# See for the exact date of 2011-05-06
in_filtyered = [s for s in filtered_rows_to_check if s.get("RID") == 125]
in_filtyered

[RID                         125
 EXAMDATE    2006-01-05 00:00:00
 APVOLUME                    6.0
 Name: 95, dtype: object,
 RID                         125
 EXAMDATE    2011-05-26 00:00:00
 APVOLUME                    8.0
 Name: 1580, dtype: object,
 RID                         125
 EXAMDATE    2012-06-27 00:00:00
 APVOLUME                   -4.0
 Name: 2606, dtype: object,
 RID                         125
 EXAMDATE    2013-06-05 00:00:00
 APVOLUME                   -4.0
 Name: 3846, dtype: object,
 RID                         125
 EXAMDATE    2015-01-22 00:00:00
 APVOLUME                   -4.0
 Name: 5268, dtype: object,
 RID                              125
 EXAMDATE         2010-03-02 00:00:00
 PLASMAPTAU181                 76.067
 Name: 111, dtype: object,
 RID                              125
 EXAMDATE         2011-05-26 00:00:00
 PLASMAPTAU181                  25.84
 Name: 112, dtype: object,
 RID                              125
 EXAMDATE         2012-06-27 00:00:00
 PLASMAPT

In [93]:
from functools import reduce

merged_df = reduce(lambda left, right: pd.merge(left, right, on=["RID", "EXAMDATE"], how='outer'), dfs)
len(merged_df)

16607

In [94]:
merged_df.describe()

Unnamed: 0,RID,APVOLUME,ABETA42,TAU,PTAU,PLASMAPTAU181,ST32TA,ST40TA,ST58TA,ST60TA,ST62TA,ST91TA,ST99TA,ST117TA,ST119TA,ST121TA,PTGENDER,MHPSYCH,MH2NEURL,MH14ALCH,MH15DRUG,MH16SMOK
count,16607.0,6790.0,3181.0,3173.0,3161.0,3819.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,5344.0,3168.0,3168.0,3168.0,3168.0,3168.0
mean,3816.77082,1.340884,1061.864162,286.645953,27.261468,18.961121,2.717821,2.725335,2.574538,3.469113,2.210322,2.765076,2.780147,2.598437,3.505418,2.246651,1.426085,0.348801,0.314394,0.043876,0.00947,0.398043
std,2321.169464,6.294932,619.382028,128.375529,14.263413,14.719812,0.239194,0.229197,0.226311,0.455701,0.248271,0.236469,0.213069,0.209134,0.517448,0.263617,0.751417,0.476666,0.464347,0.204852,0.096866,0.489572
min,1.0,-4.0,203.0,80.08,8.0,0.362,1.613,1.579,1.575,1.341,1.363,1.575,1.626,1.528,1.238,1.308,-4.0,0.0,0.0,0.0,0.0,0.0
25%,1417.5,-4.0,592.8,195.6,17.16,11.105,2.581,2.615,2.442,3.243,2.047,2.639,2.678,2.472,3.282,2.075,1.0,0.0,0.0,0.0,0.0,0.0
50%,4356.0,-4.0,871.9,256.9,23.53,16.343,2.7435,2.754,2.592,3.5415,2.219,2.794,2.799,2.615,3.615,2.251,1.0,0.0,0.0,0.0,0.0,0.0
75%,5078.0,9.0,1423.0,344.6,33.48,23.5165,2.877,2.878,2.73075,3.77575,2.381,2.927,2.918,2.736,3.855,2.427,2.0,1.0,1.0,0.0,0.0,1.0
max,10324.0,10.0,4779.0,1018.0,108.5,451.398,3.424,3.294,3.151,4.684,2.96,3.381,3.409,3.27,4.548,3.006,2.0,1.0,1.0,1.0,1.0,1.0


In [14]:
merged_df.to_csv("output.csv", index=False)

In [17]:
merged_df["RID"].nunique()

4381