In [6]:
import pandas as pd

# Build a binary depression key: moderate/severe -> 1, normal -> 0, and every
# other category (e.g. "mild depression") is dropped from the output.

# --- 1. Read the original file ---
# Tab-separated input. The steps below need a "participant_id" column and a
# "scale"/"Scale" column holding the category as text.
df = pd.read_csv("depression_factors.tsv", sep="\t")

# --- 2. Normalise the column name and clean text ---
# Accept either capitalisation of the category column.
scale_col = "scale" if "scale" in df.columns else "Scale"
if scale_col not in df.columns:
    raise ValueError("No column named 'scale' or 'Scale' found.")

# Trim whitespace and lower-case so values match the mapping keys exactly.
df[scale_col] = df[scale_col].str.strip().str.lower()

# --- 3. Map categories to binary labels (others -> NaN) ---
mapping = {
    "moderate depression": 1,
    "severe depression": 1,
    "normal/symptom absent": 0,
}

# Any category that is not a key of `mapping` becomes NaN here.
# NOTE(review): with NaNs present the column is float, so the labels are
# presumably written out as 1.0 / 0.0 -- confirm that downstream readers
# are fine with that.
df["depression_binary"] = df[scale_col].map(mapping)

# --- 4. Keep only desired rows and columns ---
# dropna() removes rows with a missing value in either column; this is the
# step that discards the unmapped categories.
binary_df = df[["participant_id", "depression_binary"]].dropna()

# --- 5. Smoke / sanity checks ---
assert binary_df["participant_id"].is_unique, (
    "Duplicate participant IDs found after mapping."
)

# Test the participant IDs of the mild-depression rows for membership in the
# output. Calling .isin() on the boolean mask itself would only ask whether
# True/False appear among the IDs, which says nothing about the filter.
mild_ids = df.loc[df[scale_col] == "mild depression", "participant_id"]
assert not mild_ids.isin(binary_df["participant_id"]).any(), (
    "Rows with mild depression slipped through the filter."
)

# --- 6. Write the new key ---
binary_df.to_csv("depression_factors_binary_withoutmild.tsv", sep="\t", index=False)
print(f"Saved binary labels for {len(binary_df)} participants.")


Saved binary labels for 168 participants.
