## Setup

Run the following in the base project directory.

Optionally, setup a virtual environment
```
python -m venv .venv
```

Install jupyter
```
pip install jupyter ipykernel
python -m ipykernel install --user --name=.venv --display-name "Python (.venv)"
```

In [1]:
# Validate venv activation
!where python

# On linux
# !which python

c:\Users\dylan\Documents\Coding\bio-lab\redlat-genetics\.venv\Scripts\python.exe
C:\Python312\python.exe
C:\Users\dylan\AppData\Local\Microsoft\WindowsApps\python.exe


In [2]:
# Install requirements

%pip install pandas
%pip install numpy

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.2.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached numpy-2.2.3-cp312-cp312-win_amd64.whl (12.6 MB)
Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.2.3 pandas-2.2.3 pytz-2025.1 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import string

In [4]:
""" 
Define classify function as per flowchart.

phenotype: {0, 1, 2, 3, 4, -9, exclude, flag}
"""

from typing import Literal

PhenotypeType = Literal[0, 1, 2, 3, 4, -9, "exclude", "flag"]

def classify(row: pd.Series) -> PhenotypeType:
    if pd.isna(row["D"]):
        return "exclude"
    
    isHealthyControl = False

    if row["D"] in  ["Família", "Familiar"]:
        # Follow "Family Member" path

        # Check F and G and flag if missing
        if pd.isna(row["F"]) or pd.isna(row["G"]):
            return "flag"
        
        # Check E (family member is control or has dementia)
        if pd.isna(row["E"]) or row["E"] == "Familiar indeterminado.":
            return -9
        isHealthyControl = row["E"] == "Familiar Saludable."
    else:
        # Determine whether patient is control or has dementia
        isHealthyControl = row["D"] in ["Controle saudável", "Control sano"]

        # Check I
        if isHealthyControl and row["I"] in ["Si", "Sim"]:
            return "exclude"
        if not isHealthyControl:
            if row["I"] in ["No", "Não"]:
                return "exclude"
            if pd.isna(row["I"]):
                return "flag"
        
        # Check J
        if isHealthyControl:
            if row["J"] in ["No", "Não"]:
                return "exclude"
            if pd.isna(row["J"]):
                return "flag"
        if not isHealthyControl and row["J"] in ["Si", "Sim"]:
            return "exclude"

    
    # Check AD (at which point we also consider Family Members)
    if isHealthyControl:
        if row["AD"] in ["Qualquer um dos grupos experimentais", "a alguno de los grupos experimentales"]:
            return "exclude"
        if pd.isna(row["AD"]):
            return "flag"
    if not isHealthyControl:
        if row["AD"] in ["al grupo control", "Grupo de controle"]:
            return "exclude"
        if pd.isna(row["AD"]):
            return "flag"
        
    # Check AE
    if isHealthyControl and not pd.isna(row["AE"]):
        return "flag"
    if not isHealthyControl and pd.isna(row["AE"]):
        return "flag"

    # Check AF
    if isHealthyControl and not pd.isna(row["AF"]):
        if "DA" in row["AF"] or "DFT" in row["AF"]: # Check if AD/FTD (letter's flipped due to Spanish)
            return "exclude"
        else:
            return "flag"
    if not isHealthyControl and pd.isna(row["AF"]):
        return "exclude"
    
    # Now let's handle the two branches separately since they are not so symmetric

    if isHealthyControl:
        # Control group

        # Check AG/AH
        if not pd.isna(row["AG"]) or not pd.isna(row["AH"]):
            return "exclude"
        if row["AI"] in ["Si", "Sim"]:
            return "exclude"
        
        return 0 # Found valid phenotype
    else:
        # Dementia Patient

        # Check AF
        if row["AF"] in ["DA - Doença de Alzheimer", "EA - Enfermedad de Alzheimer"]:
            # Check AG
            if pd.isna("AG"):
                return "flag"
            
            if row["AI"] in ["Si", "Sim"]:
                return 1 # Found valid phenotype
        elif row["AF"] in ["DFT - Demencia frontotemporal", "DFT - Demência Frontotemporal"]:
            # Check AH
            if pd.isna("AH"):
                return "flag"
            
            if row["AI"] in ["Si", "Sim"]:
                return 2 # Found valid phenotype
        else:
            return 4 # Found valid phenotype

        # AI is no or missing. Is recruitment catagory "Familiar"?
        if row["D"] in  ["Família", "Familiar"]:
            return 3 # Found valid phenotype
        else:
            return "exclude"



In [5]:
# Load CSV
df = pd.read_csv("input.csv")

# Define the specific column mappings
specified_columns = [
    "Present in Mothership?", "PARTICIPANT ID (PIDN)", "Phenotype Incompatibilities", "Recruitment Category", 
    "The family member of the proband is:", "Proband's PIDN", "Relation to the proband", "If other, specify", 
    "Clinical diagnosis of AD or mild/moderate FTD (including behavioral variant FTD; non-fluent, semantic, or logopenic PPA; FTD-ALS; PSP; CBD) performed by the principal investigator using local diagnostic standards (relying on current criteria for AD and FTD). Patients may show radiological criteria related to atrophy in routine MRI or hypoperfusion/hypometabolism on SPECT or PET when available.",
    "Healthy individuals with a CDR of 0 and MMSE > 25", "Age between 40 - 80 years", "18 years of age or older", 
    "Family aggregation (at least three direct relatives with dementia in at least two generations) or a family member of a participant who tested positive for a genetic mutation and has undergone counseling.", 
    "Known diagnosis of another significant cognitive, neurological, or psychiatric disease (multiple system atrophy, multiple sclerosis, prion disease, Huntington's disease)", "Family history of AD or FTD", "Participant's age", "Participant's country of birth", 
    "If other, specify", "What sex was assigned to you at birth (the sex that appears on your birth certificate)?", "If other, specify", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=White)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Black)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Hispanic/Latino)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=American Indian/Alaska Native)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Asian)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=Native Hawaiian/Pacific Islander)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=None of the above)", 
    "With which of the following American census racial/ethnic categories do you identify? Select all that apply (choice=I identify with another racial/ethnic category: {demo_race_other})", 
    "Which race/ethnicity do you belong to?", "The participant belongs to:", "At what age did the participant present the first symptoms?", 
    "4a. What diagnosis was given to this participant at your center? This should represent your team's best estimate based on experience and available information. This will help better capture the diagnostic criteria that participants with atypical phenotypes meet.", 
    "4b. Enter the variant type for this participant with Alzheimer's Disease", "4c. Enter the variant type for this participant with Frontotemporal Dementia", 
    "5. Does the participant meet the criteria for dementia? DEMENTIA - ALL CAUSES (All participants who are experimental cases should meet this criterion, and the answer should be 'Yes'. Only family members might not meet it, and they could have either of the two answers: 'No' or 'Yes').", 
    "Phenotype Consensus (Not affected = 0, AD= 1, FTD=2, MCI=3,OTHER=4 No data = -9)", "Has a pedigree been introduced for this participant in the Progeny database?", 
    "Please upload a de-identified version of the pedigree here. Please ensure that no names or birth dates are included in the file.", 
    "If a pedigree has not been uploaded, do you plan to upload one in the future?", "Has sequencing/genotyping begun?", "Has sequencing/genotyping been completed?", 
    "Has data processing begun?", "Has data processing been completed?", "Are results available?", "Have results been shared back to the site?"
]

# Generate the new column names using letters A, B, C, ..., Z, AA, AB, etc.
def generate_column_names(n):
    letters = list(string.ascii_uppercase)
    col_names = []
    for i in range(n):
        if i < 26:
            col_names.append(letters[i])
        else:
            col_names.append(letters[(i // 26) - 1] + letters[i % 26])
    return col_names

# Map specified columns to sequential letters
column_mapping = {old: new for old, new in zip(specified_columns, generate_column_names(len(specified_columns)))}

# Apply column renaming
df.rename(columns=column_mapping, inplace=True)

In [6]:
# Apply classification and write output

def get_output(row: pd.Series) -> pd.Series:
    classification = classify(row)

    phenotype = None # Invalid
    status = classification if classification in ["exclude", "flag"] else "pass"

    if status == "pass":
        phenotype = classification
    
    # Row B is the PIDN
    return pd.Series([row["B"], phenotype, status])

output_df = df.apply(get_output, axis=1)
output_df.columns = ["PARTICIPANT ID (PIDN)", "Phenotype", "Status"]
output_df.to_csv("output.csv", index=False)

In [7]:
"""
Verify that the classify function works
"""

# Apply the classify function to each row
df["Predicted"] = df.apply(lambda row: classify(row), axis=1)

# Filter out "exclude" and "flag"
valid_rows = df[~df["Predicted"].isin(["exclude", "flag"])]

# Compare predictions with the actual values
valid_rows["Correct"] = valid_rows["Predicted"] == valid_rows["AJ"]

# Get mismatched rows
mismatches = valid_rows[~valid_rows["Correct"]]

# Write mismatches to incorrect.csv to analyze
with open("incorrect.csv", "w") as f:
    f.write(mismatches.iloc[:].to_csv())

# Print summary
accuracy = valid_rows["Correct"].mean()
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 99.66%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_rows["Correct"] = valid_rows["Predicted"] == valid_rows["AJ"]
