## Setup

Run the following in the base project directory.

Optionally, set up a virtual environment (activate it afterwards so that the installs below go into it):
```
python -m venv .venv
```

Install Jupyter and register the environment as a notebook kernel:
```
pip install jupyter ipykernel
python -m ipykernel install --user --name=.venv --display-name "Python (.venv)"
```

In [None]:
# Install requirements

%pip install pandas
%pip install numpy

In [9]:
import pandas as pd
import numpy as np
import string

In [21]:
# Load CSV
# The file is read from the current working directory with pandas' default options.
df = pd.read_csv("input.csv")

# Define the specific column mappings
# The position of each header in this list decides which letter label it receives
# (1st -> "A", 2nd -> "B", ...), so the order must not be changed.
# "If other, specify" is listed three times (positions H, R and T).
# NOTE(review): pandas normally de-duplicates repeated CSV headers by adding a
# ".1"/".2" suffix, so the repeated entries may not match the loaded columns
# one-to-one -- confirm against input.csv.
specified_columns = [
    "Present in Mothership?", "PARTICIPANT ID (PIDN)", "Phenotype Incompatibilities", "Recruitment Category", 
    "The family member of the proband is:", "Proband's PIDN", "Relation to the proband", "If other, specify", 
    "Clinical diagnosis of AD or mild/moderate FTD (including behavioral variant FTD; non-fluent, semantic, or logopenic PPA; FTD-ALS; PSP; CBD) performed by the principal investigator using local diagnostic standards (relying on current criteria for AD and FTD). Patients may show radiological criteria related to atrophy in routine MRI or hypoperfusion/hypometabolism on SPECT or PET when available.",
    "Healthy individuals with a CDR of 0 and MMSE > 25", "Age between 40 - 80 years", "18 years of age or older", 
    "Family aggregation (at least three direct relatives with dementia in at least two generations) or a family member of a participant who tested positive for a genetic mutation and has undergone counseling.", 
    "Known diagnosis of another significant cognitive, neurological, or psychiatric disease (multiple system atrophy, multiple sclerosis, prion disease, Huntington's disease)", "Family history of AD or FTD", "Participant's age", "Participant's country of birth", 
    "If other, specify", "What sex was assigned to you at birth (the sex that appears on your birth certificate)?", "If other, specify", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=White)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Black)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Hispanic/Latino)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=American Indian/Alaska Native)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Asian)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Native Hawaiian/Pacific Islander)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=None of the above)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=I identify with another racial/ethnic category: {demo_race_other})", 
    "Which race/ethnicity do you belong to?", "The participant belongs to:", "At what age did the participant present the first symptoms?", 
    "4a. What diagnosis was given to this participant at your center? This should represent your team's best estimate based on experience and available information. This will help better capture the diagnostic criteria that participants with atypical phenotypes meet.", 
    "4b. Enter the variant type for this participant with Alzheimer's Disease", "4c. Enter the variant type for this participant with Frontotemporal Dementia", 
    "5. Does the participant meet the criteria for dementia? DEMENTIA - ALL CAUSES (All participants who are experimental cases should meet this criterion, and the answer should be 'Yes'. Only family members might not meet it, and they could have either of the two answers: 'No' or 'Yes').", 
    "Phenotype Consensus (Not affected = 0, AD= 1, FTD=2, MCI=3,OTHER=4 No data = -9)", "Has a pedigree been introduced for this participant in the Progeny database?", 
    "Please upload a de-identified version of the pedigree here. Please ensure that no names or birth dates are included in the file.", 
    "If a pedigree has not been uploaded, do you plan to upload one in the future?", "Has sequencing/genotyping begun?", "Has sequencing/genotyping been completed?", 
    "Has data processing begun?", "Has data processing been completed?", "Are results available?", "Have results been shared back to the site?"
]


# Generate the new column names using letters A, B, C, ..., Z, AA, AB, etc.
def generate_column_names(n):
    """Return the first ``n`` spreadsheet-style column labels.

    Labels follow the sequence A..Z, AA..AZ, BA..ZZ, AAA, ... with no upper
    limit on their length.

    Args:
        n: Number of labels to produce. Zero or a negative value yields an
            empty list.

    Returns:
        A list of ``n`` unique uppercase labels in spreadsheet column order.
    """
    letters = string.ascii_uppercase
    col_names = []
    for i in range(n):
        # Spreadsheet labels are a bijective base-26 numbering (there is no
        # "zero" digit), so convert the 1-based position and shift by one
        # before each division.
        remaining = i + 1
        label = ""
        while remaining > 0:
            remaining, digit = divmod(remaining - 1, 26)
            label = letters[digit] + label
        col_names.append(label)
    return col_names

# Build the header <-> letter lookups by pairing specified_columns positionally with
# the generated labels. A header listed more than once keeps only the letter of its
# last occurrence in original_col_to_letter; letter_to_original_col still has one
# entry per letter.
original_col_to_letter = dict(zip(specified_columns, generate_column_names(len(specified_columns))))
letter_to_original_col = dict(zip(generate_column_names(len(specified_columns)), specified_columns))


# Rename the matching DataFrame columns to their letters, modifying df in place
df.rename(columns=original_col_to_letter, inplace=True)

In [17]:
""" 
Define classify function as per flowchart.

phenotype: {0, 1, 2, 3, 4, -9, exclude, flag}
"""

from typing import Literal

# Phenotype code produced by classify(); None is returned for excluded rows.
PhenotypeType = Literal[0, 1, 2, 3, 4, -9, None]
# Outcome of classifying one row.
StatusType = Literal["pass", "exclude", "flag"]
# Original header of the column that caused an exclusion ("" when not excluded).
ExcludeReasonType = str
# Original headers of the columns that caused a row to be flagged.
FlagReasonType = list[str]

def classify(row: pd.Series) -> tuple[PhenotypeType, StatusType, ExcludeReasonType, FlagReasonType]:
    """Classify one participant row into a phenotype code, following the flowchart.

    The row must be indexed by the letter labels produced by the column
    renaming ("D", "E", ..., "AI"). Answers are matched against their Spanish
    and Portuguese spellings.

    Args:
        row: One participant's record.

    Returns:
        A tuple ``(phenotype, status, exclude_reason, flag_reasons)``:

        * ``phenotype`` -- 0, 1, 2, 3, 4 or -9, using the coding of the
          "Phenotype Consensus" column (not affected, AD, FTD, MCI, other,
          no data), or ``None`` when the row is excluded.
        * ``status`` -- ``"exclude"``, or ``"flag"`` when at least one flag
          reason was recorded, otherwise ``"pass"``.
        * ``exclude_reason`` -- original header of the column that caused the
          exclusion, ``""`` when the row is not excluded.
        * ``flag_reasons`` -- original headers of the columns whose value was
          missing or unexpected, in the order they were checked.
    """
    yes_answers = ("Si", "Sim")
    no_answers = ("No", "Não")
    family_categories = ("Família", "Familiar")
    ad_diagnoses = ("DA - Doença de Alzheimer", "EA - Enfermedad de Alzheimer")
    ftd_diagnoses = ("DFT - Demencia frontotemporal", "DFT - Demência Frontotemporal")

    # Without a recruitment category (D) no path of the flowchart can be chosen.
    if pd.isna(row["D"]):
        return (None, "exclude", letter_to_original_col["D"], [])

    flag_reasons = []

    def flag(letter):
        # Record the original header of a column that needs manual review.
        flag_reasons.append(letter_to_original_col[letter])

    def exclude(letter):
        # Result for a row rejected because of the given column.
        return (None, "exclude", letter_to_original_col[letter], flag_reasons)

    def accept(phenotype):
        # Result for a classified row; it is "flag" as soon as any reason exists.
        return (phenotype, "flag" if flag_reasons else "pass", "", flag_reasons)

    is_family_member = row["D"] in family_categories

    if is_family_member:
        # Follow "Family Member" path

        # Check F and G and flag if missing
        for letter in ("F", "G"):
            if pd.isna(row[letter]):
                flag(letter)

        # Check E (family member is control or has dementia)
        if pd.isna(row["E"]) or row["E"] == "Familiar indeterminado.":
            return accept(-9)
        is_healthy_control = row["E"] == "Familiar Saludable."
    else:
        # Determine whether patient is control or has dementia
        is_healthy_control = row["D"] in ["Controle saudável", "Control sano"]

        if is_healthy_control:
            # Check I: a control must not meet the clinical-diagnosis criterion
            if row["I"] in yes_answers:
                return exclude("I")
            # Check J: a control must meet the healthy-individual criterion
            if row["J"] in no_answers:
                return exclude("J")
            if pd.isna(row["J"]):
                flag("J")
        else:
            # Check I: a patient must meet the clinical-diagnosis criterion
            if row["I"] in no_answers:
                return exclude("I")
            if pd.isna(row["I"]):
                flag("I")
            # Check J: a patient must not meet the healthy-individual criterion
            if row["J"] in yes_answers:
                return exclude("J")

    # Check AD (at which point we also consider Family Members): the declared
    # group must not contradict the control/dementia decision made above.
    if is_healthy_control:
        contradicting_groups = ["Qualquer um dos grupos experimentais", "a alguno de los grupos experimentales"]
    else:
        contradicting_groups = ["al grupo control", "Grupo de controle"]
    if row["AD"] in contradicting_groups:
        return exclude("AD")
    if pd.isna(row["AD"]):
        flag("AD")

    # Check AE: an age at first symptoms is unexpected for controls and
    # expected for everyone else.
    onset_missing = pd.isna(row["AE"])
    if (is_healthy_control and not onset_missing) or (not is_healthy_control and onset_missing):
        flag("AE")

    # Check AF
    diagnosis = row["AF"]
    if pd.isna(diagnosis):
        if not is_healthy_control:
            return exclude("AF")
    elif is_healthy_control:
        # Check if AD/FTD (letters flipped due to Spanish/Portuguese). The exact
        # labels are tested as well because the Spanish AD label starts with
        # "EA" and therefore does not contain "DA".
        diagnosis_text = str(diagnosis)
        if (
            diagnosis in ad_diagnoses
            or diagnosis in ftd_diagnoses
            or "DA" in diagnosis_text
            or "DFT" in diagnosis_text
        ):
            return exclude("AF")
        flag("AF")

    # Now let's handle the two branches separately since they are not so symmetric

    if is_healthy_control:
        # Control group

        # Check AG/AH: a control must not carry an AD or FTD variant type
        if not pd.isna(row["AG"]) or not pd.isna(row["AH"]):
            return exclude("AH")

        # Check AI: a control must not meet the dementia criterion
        if row["AI"] in yes_answers:
            return exclude("AI")

        return accept(0)  # Found valid phenotype

    # Dementia Patient

    # Check AF: pick the variant column and phenotype code for the diagnosis
    if diagnosis in ad_diagnoses:
        variant_letter, phenotype = "AG", 1
    elif diagnosis in ftd_diagnoses:
        variant_letter, phenotype = "AH", 2
    else:
        return accept(4)  # Found valid phenotype

    # Check AG / AH: test the cell value, not the column letter itself.
    if pd.isna(row[variant_letter]):
        flag(variant_letter)

    if row["AI"] in yes_answers:
        return accept(phenotype)  # Found valid phenotype

    # AI is no or missing. Is recruitment category "Familiar"?
    if is_family_member:
        return accept(3)  # Found valid phenotype
    return exclude("D")



In [None]:
# Apply classification and write output

def get_output(row: pd.Series) -> pd.Series:
    """Return the participant ID followed by the four values from classify().

    The result is a five-element Series: PIDN, phenotype, status, exclude
    reason and flag reasons, in that order.
    """
    # Column "B" holds the PIDN after the columns were renamed to letters.
    classification = classify(row)
    return pd.Series([row["B"], *classification])

# Classify every row, label the five result columns and save them as output.csv
output_df = df.apply(get_output, axis=1).set_axis(
    ["PARTICIPANT ID (PIDN)", "Phenotype", "Status", "Exclude Reason", "Flag Reason"],
    axis="columns",
)
output_df.to_csv("output.csv", index=False)

In [None]:
"""
Verify that the output is correct
"""

# Load the original df (with its full, un-renamed column headers)
df = pd.read_csv("input.csv")

# Load the output.csv which has the classification results
output_df = pd.read_csv("output.csv")

# Headers used several times below
pidn_col = "PARTICIPANT ID (PIDN)"
consensus_col = "Phenotype Consensus (Not affected = 0, AD= 1, FTD=2, MCI=3,OTHER=4 No data = -9)"

# Ensure both dataframes have the required columns
assert pidn_col in df.columns and consensus_col in df.columns, "df is missing required columns"
assert pidn_col in output_df.columns and "Phenotype" in output_df.columns, "output_df is missing required columns"

# Merge on "PARTICIPANT ID (PIDN)" to align predictions with actual values
merged_df = df.merge(output_df, on=pidn_col, how="inner")

# Compare Phenotype from output.csv with the consensus column (AJ) in df
merged_df["Correct"] = merged_df["Phenotype"] == merged_df[consensus_col]

# Filter out rows where "Status" is "exclude" or "flag". The rows already carry
# the "Correct" column; .copy() makes valid_rows independent of merged_df so no
# later change is ever written into a slice of it.
valid_rows = merged_df[~merged_df["Status"].isin(["exclude", "flag"])].copy()

# Get mismatched rows
mismatches = valid_rows[~valid_rows["Correct"]]

# Write mismatches to incorrect.csv
mismatches.to_csv("incorrect.csv", index=False)

# Print summary (the mean of an empty selection is NaN, so report that case explicitly)
accuracy = valid_rows["Correct"].mean()
if valid_rows.empty:
    print("Accuracy: n/a (no rows left after removing excluded and flagged rows)")
else:
    print(f"Accuracy: {accuracy:.2%}")
