code by: Micah Williams

Clustering Analysis to Identify Potential ADHD Underdiagnosis in Females

Goals:
1. Using all participants, find "control" females who appear ADHD-like relative to the full dataset (with and without functional connectome (FC) data)
2. Using female participants only, repeat the analysis to account for possible sex differences in ADHD expression (with and without FC data)

Methods:
*   KNN - identify "control" females whose nearest neighbors are primarily ADHD
*   KMeans clustering - identify "control" females in ADHD-like cluster + visualize
*   Hierarchical clustering - identify "control" females in ADHD-like cluster + visualize
*   For all clustering approaches, compare potentially misdiagnosed controls in M vs F to see if rates are similar

Combining Results:
*  Create df with all potentially misdiagnosed "control" F to see how many participants are consistently flagged



In [1]:
#all imports
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import umap.umap_ as umap
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load data: mount Google Drive, then read the metadata, label and connectome tables.
from google.colab import drive

drive.mount('/content/drive/')

# Base folder for every input read here and every output written by later cells.
file_path = '/content/drive/My Drive/MicahWIDSdata/'

behavioral_data = pd.read_excel(file_path + 'TRAIN_QUANTITATIVE_METADATA_.xlsx')
demographic_data = pd.read_excel(file_path + 'TRAIN_CATEGORICAL_METADATA_new.xlsx')
label_data = pd.read_excel(file_path + 'TRAINING_SOLUTIONS.xlsx')

# Drop the 'Unnamed: 0' column (presumably an index saved with the CSV - TODO confirm).
connectome = pd.read_csv(file_path + 'FC_extracted.csv').drop(columns=['Unnamed: 0'])

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
#helper functions
def map_clusters_and_get_flags(clustering_df, method, flagged_ids, run_name):
  """Relabel clusters as ADHD-like (1) vs. other (0) and flag control females.

  The cluster whose members have the highest mean ``ADHD_Outcome`` is relabelled
  1 and every other cluster is relabelled 0. Non-ADHD females that land in the
  ADHD-like cluster are then recorded in ``flagged_ids``.

  Args:
    clustering_df: DataFrame with 'participant_id', 'Sex_F', 'ADHD_Outcome' and
      a column named ``method`` holding the raw cluster labels. The ``method``
      column is overwritten in place with the 0/1 relabelling.
    method: name of the cluster-label column; also the prefix of the result key.
    flagged_ids: dict updated in place; the key ``f'{method}_{run_name}'`` is
      set to the list of flagged participant ids (as strings).
    run_name: suffix of the result key.

  Returns:
    None. Results are delivered through the two in-place updates above.
  """

  cluster_means = clustering_df.groupby(method)['ADHD_Outcome'].mean()
  adhd_cluster = cluster_means.idxmax()
  # Compare against the ADHD-like cluster instead of mapping through a
  # {max: 1, min: 0} dict: when the cluster means tie, idxmax and idxmin return
  # the same label, the dict collapses to one entry, and the remaining cluster
  # would be mapped to NaN.
  clustering_df[method] = (clustering_df[method] == adhd_cluster).astype(int)

  #get non-ADHD females in "ADHD" cluster
  is_flagged = (
      (clustering_df['Sex_F'] == 1)
      & (clustering_df['ADHD_Outcome'] == 0)
      & (clustering_df[method] == 1)
  )
  flagged = clustering_df.loc[is_flagged, 'participant_id']
  flagged_ids[f'{method}_{run_name}'] = flagged.astype(str).tolist()

def plot_clusters_PCA(X_scaled, clustering_df, method, file_path, run_name):
  """Project the features to 2 principal components and save a scatter plot.

  Point colour encodes the recorded ADHD outcome and marker style encodes the
  cluster label stored in ``clustering_df[method]``.

  Args:
    X_scaled: feature matrix passed to ``PCA.fit_transform``; its rows are
      assumed to be in the same order as ``clustering_df`` - TODO confirm at
      call sites.
    clustering_df: DataFrame with 'ADHD_Outcome' and the ``method`` column.
      It is copied, so the caller's frame is not modified.
    method: name of the cluster-label column; used in the title and file name.
    file_path: folder the PNG is written to.
    run_name: run identifier used in the title and file name.

  Returns:
    None. The figure is saved to
    ``{file_path}/{method}_clustering_PCA_{run_name}.png`` and then closed.
  """
  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)
  df = clustering_df.copy()
  df['ADHD_Label'] = df['ADHD_Outcome'].map({0: '0 - Other', 1: '1 - ADHD'})
  df['PC1'] = X_pca[:,0]
  df['PC2'] = X_pca[:,1]
  plt.figure(figsize=(10,8))
  # Colour by the readable label column built above (it was previously created
  # but never used); hue_order keeps colour assignment the same across runs.
  sns.scatterplot(data=df, x='PC1', y='PC2', hue='ADHD_Label',
                  hue_order=['0 - Other', '1 - ADHD'], style=method, palette='Set1')
  plt.title(f"{method} Clustering vs ADHD labels (PCA) {run_name}" )
  plt.savefig(f"{file_path}/{method}_clustering_PCA_{run_name}.png")
  plt.close()

def plot_clusters_umap(X_scaled, clustering_df, method, file_path, run_name):
  """Embed the features in 2 dimensions with UMAP and save a scatter plot.

  Point colour encodes the recorded ADHD outcome and marker style encodes the
  cluster label stored in ``clustering_df[method]``.

  Args:
    X_scaled: feature matrix passed to ``UMAP.fit_transform``; its rows are
      assumed to be in the same order as ``clustering_df`` - TODO confirm at
      call sites.
    clustering_df: DataFrame with 'ADHD_Outcome' and the ``method`` column.
      It is copied, so the caller's frame is not modified.
    method: name of the cluster-label column; used in the title and file name.
    file_path: folder the PNG is written to.
    run_name: run identifier used in the title and file name.

  Returns:
    None. The figure is saved to
    ``{file_path}/{method}_clustering_UMAP_{run_name}.png`` and then closed.
  """
  # NOTE(review): UMAP is built with default arguments (no random_state), so
  # the embedding layout may differ between runs.
  reducer = umap.UMAP()
  X_umap = reducer.fit_transform(X_scaled)
  df = clustering_df.copy()
  df['ADHD_Label'] = df['ADHD_Outcome'].map({0: '0 - Other', 1: '1 - ADHD'})
  df['UMAP1'] = X_umap[:, 0]
  df['UMAP2'] = X_umap[:, 1]
  plt.figure(figsize=(10, 8))
  # Colour by the readable label column built above (it was previously created
  # but never used); hue_order keeps colour assignment the same across runs.
  sns.scatterplot(data=df, x='UMAP1', y='UMAP2', hue='ADHD_Label',
                  hue_order=['0 - Other', '1 - ADHD'], style=method, palette='Set1')
  plt.title(f"{method} Clustering vs ADHD labels (UMAP) {run_name}")
  plt.savefig(f"{file_path}/{method}_clustering_UMAP_{run_name}.png")
  plt.close()

def _save_confusion_matrix(subset, method, file_path, run_name, sex_tag):
  """Plot and save the ADHD_Outcome vs. cluster-label confusion matrix for one subset."""
  cm = confusion_matrix(subset['ADHD_Outcome'], subset[method], labels=[0, 1])
  plt.figure(figsize=(5, 4))
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
              xticklabels=["Control", "ADHD"], yticklabels=["Control", "ADHD"])
  plt.title(f"{method} Confusion Matrix ({sex_tag}) {run_name}")
  plt.xlabel("Predicted")
  plt.ylabel("Actual")
  plt.tight_layout()
  plt.savefig(f"{file_path}/{method}_conf_matrix_{sex_tag}_{run_name}.png")
  plt.close()


def confusion_matrix_analysis(clustering_df, method, file_path, run_name):
  """Save confusion matrices of recorded ADHD outcome vs. cluster assignment.

  A matrix is always saved for female participants (``Sex_F == 1``). When the
  run name contains the substring "all", a second matrix is saved for the
  ``Sex_F == 0`` rows so that rates of potentially mislabelled ADHD-like
  individuals can be compared between sexes.

  Args:
    clustering_df: DataFrame with 'Sex_F', 'ADHD_Outcome' and the ``method``
      column; ``method`` is expected to already hold 0/1 labels, since the
      matrix is built with ``labels=[0, 1]``.
    method: name of the cluster-label column; used in titles and file names.
    file_path: folder the PNGs are written to.
    run_name: run identifier used in titles and file names. Any name that
      contains "all" triggers the male matrix.

  Returns:
    None. Figures are written to
    ``{file_path}/{method}_conf_matrix_{F|M}_{run_name}.png``.
  """
  females = clustering_df[clustering_df['Sex_F'] == 1]
  _save_confusion_matrix(females, method, file_path, run_name, "F")

  if "all" in run_name:
    males = clustering_df[clustering_df['Sex_F'] == 0]
    _save_confusion_matrix(males, method, file_path, run_name, "M")

In [5]:
#pipeline for analyses (knn, kmeans, hierarchical)

# Shared result store: every run adds '<method>_<run_name>' -> list of ids of
# potentially underdiagnosed female participants.
flagged_ids: dict = {}
# Seed handed to KMeans in clustering_analysis.
random_seed: int = 0
def clustering_analysis(feature_df, run_name, flagged_ids, file_path):
  """Run the KNN / hierarchical / KMeans pipeline for one feature set.

  Steps:
    1. Create the output folder for this run and initialise the result frame.
    2. Impute missing feature values (KNNImputer) and standardise them.
    3. KNN: flag "control" females whose nearest neighbours are mostly ADHD.
    4. Hierarchical clustering: print silhouette / Davies-Bouldin scores, flag
       "control" females in the ADHD-like cluster, plot with PCA and UMAP.
    5. KMeans clustering: same evaluation, flagging and plots.
    6. Save confusion matrices for both clusterings.

  Args:
    feature_df: DataFrame holding 'participant_id', 'Sex_F', 'ADHD_Outcome',
      'MRI_Track_Age_at_Scan' and the feature columns. Any index is accepted;
      rows are addressed by position throughout.
    run_name: name of this run; used as the output sub-folder, in plot titles
      and file names, and as the suffix of the keys added to ``flagged_ids``.
    flagged_ids: dict updated in place with 'knn_<run_name>',
      'Hierarchical_<run_name>' and 'Kmeans_<run_name>' -> lists of flagged
      participant ids (as strings).
    file_path: parent folder under which the run folder is created.

  Returns:
    None. Scores are printed; plots are written to ``file_path/run_name``.
  """

  folder_path = os.path.join(file_path, run_name)
  os.makedirs(folder_path, exist_ok=True)

  #initialize clustering results df
  clustering_df = feature_df[['participant_id', 'Sex_F', 'ADHD_Outcome']].copy()

  #prep X- impute Nans w KNNImputer, then scale
  # NOTE(review): 'MRI_Track_Age_at_Scan' is excluded from the features along
  # with the id/label columns; the reason is not visible here - confirm.
  X = feature_df.drop(columns = ['participant_id','Sex_F', 'ADHD_Outcome', 'MRI_Track_Age_at_Scan'])
  imputer = KNNImputer(n_neighbors=5)
  X_imputed = imputer.fit_transform(X)
  X_scaled = StandardScaler().fit_transform(X_imputed)

  #1. KNN
  # 6 neighbours are requested because the query points are part of the fitted
  # data: the first hit is dropped as the point itself, leaving 5 to vote.
  knn = NearestNeighbors(n_neighbors=6)
  knn.fit(X_scaled)
  #want to check neighbors for each "non-ADHD" female participant -> flag if majority are ADHD
  # Work with positional arrays: X_scaled is a plain array, so selecting rows by
  # feature_df's index labels would only be right for a default RangeIndex.
  outcomes = feature_df['ADHD_Outcome'].to_numpy()
  participant_ids = feature_df['participant_id'].to_numpy()
  is_control_f = ((feature_df['ADHD_Outcome'] == 0) & (feature_df['Sex_F'] == 1)).to_numpy()

  knn_flags = []
  if is_control_f.any():
    # One batched query for all control females instead of one call per row.
    _, neighbor_idx = knn.kneighbors(X_scaled[is_control_f])
    for pid, neighbors in zip(participant_ids[is_control_f], neighbor_idx):
      neighbor_labels = outcomes[neighbors[1:]]
      most_common_label = Counter(neighbor_labels).most_common(1)[0][0]
      if most_common_label == 1:
        # Stored as str, matching the ids recorded for the cluster-based methods.
        knn_flags.append(str(pid))

  flagged_ids[f'knn_{run_name}'] = knn_flags

  #2. Hierarchical clustering (Agglomerative)
  agg_clustering = AgglomerativeClustering(n_clusters=2)
  clustering_df['Hierarchical'] = agg_clustering.fit_predict(X_scaled)
  #evaluate
  agg_sscore = silhouette_score(X_scaled, agg_clustering.labels_ )
  agg_dbscore = davies_bouldin_score(X_scaled, agg_clustering.labels_)
  print("Hierarchical Silhouette score:", agg_sscore, "Davies-Bouldin score:", agg_dbscore)
  # Relabel clusters to ADHD-like (1) / other (0) and record the flagged ids.
  # This must run before the plots and confusion matrices, which read the
  # relabelled column.
  map_clusters_and_get_flags(clustering_df, 'Hierarchical', flagged_ids, run_name)
  plot_clusters_PCA(X_scaled, clustering_df, 'Hierarchical', folder_path, run_name)
  plot_clusters_umap(X_scaled, clustering_df, 'Hierarchical', folder_path, run_name)

  #3. KMeans clustering
  kmeans = KMeans(n_clusters=2, random_state = random_seed)
  clustering_df['Kmeans'] = kmeans.fit_predict(X_scaled)
  #evaluate
  kmeans_sscore = silhouette_score(X_scaled, kmeans.labels_ )
  kmeans_dbscore = davies_bouldin_score(X_scaled, kmeans.labels_)
  print("Kmeans Silhouette score:", kmeans_sscore, "Davies-Bouldin score:", kmeans_dbscore)
  # Same relabel-then-plot sequence as for the hierarchical result.
  map_clusters_and_get_flags(clustering_df, 'Kmeans', flagged_ids, run_name)
  plot_clusters_PCA(X_scaled, clustering_df, 'Kmeans', folder_path, run_name)
  plot_clusters_umap(X_scaled, clustering_df, 'Kmeans', folder_path, run_name)


  #4. Confusion matrix comparison
  confusion_matrix_analysis(clustering_df, 'Hierarchical', folder_path, run_name)
  confusion_matrix_analysis(clustering_df, 'Kmeans', folder_path, run_name)

In [None]:
# RUN - all participants, without FC data.
# No `on=` is given, so the join uses every column the two tables share.
feature_df = label_data.merge(behavioral_data, how='left')
clustering_analysis(feature_df, "all participants", flagged_ids, file_path)

In [None]:
# RUN - female participants only, without FC data.
# Keep only the label rows with Sex_F == 1 before attaching the behavioral features.
f_labels = label_data.loc[label_data['Sex_F'] == 1]
feature_df = f_labels.merge(behavioral_data, how='left')
clustering_analysis(feature_df, "female only", flagged_ids, file_path)

In [None]:
# NOT INCLUDED IN FINAL ANALYSIS - all participants, with FC data.
# Labels are left-joined with the behavioral table first, then with the connectome.
behavioral_label = label_data.merge(behavioral_data, how='left')
feature_df = behavioral_label.merge(connectome, how='left')
clustering_analysis(feature_df, "all participants, FC included", flagged_ids, file_path)

In [None]:
# NOT INCLUDED IN FINAL ANALYSIS - female participants only, with FC data.
# Same two-step left join as the all-participant FC run, restricted to Sex_F == 1.
f_labels = label_data.loc[label_data['Sex_F'] == 1]
f_behavioral_label = f_labels.merge(behavioral_data, how='left')
feature_df = f_behavioral_label.merge(connectome, how='left')
clustering_analysis(feature_df, "female only, FC included", flagged_ids, file_path)

In [8]:
#Create df of all flagged participants (control F predicted as ADHD), including number of times flagged
# Ids are compared as strings because the different methods may store them
# with different types.
all_ids = set()
for ids in flagged_ids.values():
    all_ids.update(str(id_) for id_ in ids)

# Sort the ids so the row order of the CSV is the same on every run; iterating
# a set of strings directly gives an arbitrary order.
flagged_df = pd.DataFrame({'participant_id': sorted(all_ids)})

# One 0/1 indicator column per method/run key: 1 if that analysis flagged the participant.
for method, ids in flagged_ids.items():
    ids_set = {str(id_) for id_ in ids}
    flagged_df[method] = flagged_df['participant_id'].isin(ids_set).astype(int)

# Number of analyses that flagged each participant.
flagged_df['num_flags'] = flagged_df.drop(columns='participant_id').sum(axis=1)
flagged_df.to_csv(file_path + 'all_flagged_participants.csv', index=False)