# Data Processing

## Google Drive Authentication

In [None]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# authenticate_user() runs first; presumably this is what makes the application
# default credentials requested below available in Colab -- TODO confirm.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client reused by every Drive download cell in this notebook.
drive = GoogleDrive(gauth)

## UDS + MRI Loading

In [None]:
# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz

# Note: all file IDs and file paths had to be removed in order to maintain the
# privacy of patient data, in accordance with the National Alzheimer's
# Coordinating Center licence agreement. Replace the placeholders below before
# running; the next cell reads 'investigator_nacc65.csv' and
# 'investigator_scan_mrisbm_nacc65.csv', so the downloaded file names must
# match those.


# UDS (Uniform Data Set) export
uds_file_id = '[GoogleDriveFileId]'
downloaded = drive.CreateFile({'id': uds_file_id})
downloaded.GetContentFile("[GoogleDriveFileName].csv")

# MRI scan summary export
mri_scan_file_id = 'GoogleDriveFileId'
downloaded = drive.CreateFile({'id': mri_scan_file_id})
downloaded.GetContentFile("GoogleDriveFileName.csv")

In [None]:
# Read the downloaded CSV exports into DataFrames.
# The first CSV column becomes the index and the first row the column names.
csv_options = dict(index_col=0, header=0)

udslong = pd.read_csv('investigator_nacc65.csv', **csv_options)
mriscanlong = pd.read_csv('investigator_scan_mrisbm_nacc65.csv', **csv_options)

In [None]:
# Keep only the rows whose NACCALZP code is 1, 2 or 8
# (per the original note: Alzheimer's patients or cognitively healthy subjects).
keep_codes = [1, 2, 8]
alzp_mask = udslong['NACCALZP'].isin(keep_codes)

udslong = udslong.loc[alzp_mask].copy()
udslong

In [None]:
# Index values (NACCIDs) that occur in both the UDS and the MRI tables
overlapping_indices = udslong.index.intersection(mriscanlong.index)

# Report how many IDs the two tables share
n_overlapping = len(overlapping_indices)
print('Number of overlapping index values (NACCID):', n_overlapping)

# Output: 1237 Unique Patient IDs

In [None]:
# Index values shared by the UDS and MRI tables
overlapping_indices = udslong.index.intersection(mriscanlong.index)

# UDS rows belonging to those shared IDs
overlapping_udslong = udslong.loc[overlapping_indices]

# Share of each NACCUDSD value among those rows, in percent
naccudsd_share = overlapping_udslong['NACCUDSD'].value_counts(normalize=True)
naccudsd_distribution = naccudsd_share.mul(100)

# Used to check the percentage of remaining AD patients and controls
print(naccudsd_distribution)

In [None]:
# Index values shared by the UDS and MRI tables
overlapping_indices = udslong.index.intersection(mriscanlong.index)

# Restrict both tables to rows whose NACCID appears in the other table
udslong_filtered, mriscanlong_filtered = (
    frame.loc[overlapping_indices] for frame in (udslong, mriscanlong)
)

# Sanity check: shapes after filtering
print('Shape of filtered UDS DataFrame:', udslong_filtered.shape)
print('Shape of filtered MRI DataFrame:', mriscanlong_filtered.shape)

In [None]:
# Keep only the first row of each NACCID in both tables.
# NOTE(review): "first" means first in file order; this presumably corresponds
# to the earliest visit/scan -- confirm the exports are sorted that way.
# NOTE: `udslong_filtered` is re-bound to a different frame further down in
# this notebook, so this cell must not be re-run after that point.
uds_is_repeat = udslong_filtered.index.duplicated(keep='first')
mri_is_repeat = mriscanlong_filtered.index.duplicated(keep='first')

udslong_first_instance = udslong_filtered[~uds_is_repeat]
mriscanlong_first_instance = mriscanlong_filtered[~mri_is_repeat]

# Sanity check: one row per NACCID in each table
print('Shape of UDS DataFrame with first instance of each NACCID:', udslong_first_instance.shape)
print('Shape of MRI DataFrame with first instance of each NACCID:', mriscanlong_first_instance.shape)

## UDS Clean

### Dropping Columns

In [None]:
import pandas as pd

# UDS feature groups (NACC variable names), one list per form/domain.

# Demographics
input_demo_feats = [
    'SEX', 'HISPANIC', 'HISPOR', 'RACE', 'RACEX', 'PRIMLANG', 'EDUC',
    'MARISTAT', 'NACCLIVS', 'INDEPEND', 'RESIDENC', 'NACCNIHR', 'NACCAGE',
]
# Family history
input_fam_hist_feats = [
    'NACCFAM', 'NACCMOM', 'NACCDAD',
    'NACCAM', 'NACCAMX', 'NACCAMS', 'NACCAMSX',
    'NACCFM', 'NACCFMX', 'NACCFMS', 'NACCFMSX',
    'NACCOM', 'NACCOMX', 'NACCOMS', 'NACCOMSX',
    'NACCFADM', 'NACCFFTD',
]
# Patient health history and medications
input_patient_hist_feats = [
    'ANYMEDS', 'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK',
    'CVHATT', 'CVAFIB', 'CVANGIO', 'CVBYPASS', 'CVPACDEF', 'CVPACE', 'CVCHF', 'CVOTHR',
    'CBSTROKE', 'NACCSTYR', 'CBTIA', 'NACCTIYR',
    'SEIZURES', 'NACCTBI', 'TBI', 'TBIBRIEF', 'TRAUMBRF', 'TBIEXTEN', 'TRAUMEXT',
    'TBIWOLOS', 'TRAUMCHR', 'TBIYEAR',
    'NCOTHR', 'DIABETES', 'DIABTYPE', 'HYPERTEN', 'HYPERCHO', 'B12DEF', 'THYROID',
    'ARTHRIT', 'ARTHTYPE', 'ARTHTYPX', 'ARTHUPEX', 'ARTHLOEX', 'ARTHSPIN', 'ARTHUNK',
    'INCONTU', 'INCONTF', 'APNEA', 'RBD', 'INSOMN', 'OTHSLEEP', 'OTHSLEEX',
    'ALCOHOL', 'ABUSOTHR', 'ABUSX',
    'PTSD', 'BIPOLAR', 'SCHIZ', 'DEP2YRS', 'DEPOTHR', 'ANXIETY', 'OCD',
    'NPSYDEV', 'PSYCDIS', 'PSYCDISX',
    'NACCAAAS', 'NACCAANX', 'NACCAC', 'NACCACEI', 'NACCADEP', 'NACCAHTN', 'NACCAMD',
    'NACCANGI', 'NACCAPSY', 'NACCBETA',
    'NACCCCBS', 'NACCDBMD', 'NACCDIUR', 'NACCEMD', 'NACCEPMD', 'NACCHTNC',
    'NACCLIPL', 'NACCNSD', 'NACCPDMD', 'NACCVASD',
]
# Physical examination
input_physical_exam_feats = [
    'HEIGHT', 'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE',
    'VISION', 'VISCORR', 'VISWCORR',
    'HEARING', 'HEARAID', 'HEARWAID',
    'NACCBMI', 'FOCLSYM', 'FOCLSIGN', 'NACCNREX', 'NORMEXAM',
]
# NPI-Q
input_npi_feats = [
    'NPIQINF', 'NPIQINFX',
    'DEL', 'DELSEV', 'HALL', 'HALLSEV', 'AGIT', 'AGITSEV',
    'DEPD', 'DEPDSEV', 'ANX', 'ANXSEV', 'ELAT', 'ELATSEV',
    'APA', 'APASEV', 'DISN', 'DISNSEV', 'IRR', 'IRRSEV',
    'MOT', 'MOTSEV', 'NITE', 'NITESEV', 'APP', 'APPSEV',
]
# GDS
input_gds_feats = [
    'NOGDS', 'SATIS', 'DROPACT', 'EMPTY', 'BORED', 'SPIRITS', 'AFRAID',
    'HAPPY', 'HELPLESS', 'STAYHOME', 'MEMPROB', 'WONDRFUL', 'WRTHLESS',
    'ENERGY', 'HOPELESS', 'BETTER', 'NACCGDS',
]
# FAQ
input_faq_feats = [
    'BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP',
    'EVENTS', 'PAYATTN', 'REMDATES', 'TRAVEL', 'DECSUB',
]
# Neuropsychological battery
input_np_feats = [
    'MMSEORDA', 'MMSEORLO', 'PENTAGON', 'NACCMMSE', 'LOGIMEM', 'MEMUNITS', 'MEMTIME',
    'DIGIF', 'DIGIFLEN', 'DIGIB', 'DIGIBLEN',
    'ANIMALS', 'VEG', 'TRAILA', 'TRAILARR', 'TRAILALI', 'TRAILB', 'TRAILBRR', 'TRAILBLI',
    'BOSTON', 'MOCATOTS',
]
# APOE genotype
input_apoe_feats = ['NACCAPOE']
# Diagnosis column (binarized into the prediction target by the GNN cells)
input_uds_feats = ['NACCUDSD']
# CDR
input_cdr_feats = [
    'MEMORY', 'ORIENT', 'JUDGMENT', 'COMMUN', 'HOMEHOBB', 'PERSCARE', 'CDRSUM', 'CDRGLOB',
]

# Every column to keep, in group order
all_feats = [
    *input_demo_feats,
    *input_fam_hist_feats,
    *input_patient_hist_feats,
    *input_physical_exam_feats,
    *input_npi_feats,
    *input_gds_feats,
    *input_faq_feats,
    *input_np_feats,
    *input_apoe_feats,
    *input_uds_feats,
    *input_cdr_feats,
]

# Restrict the first-visit UDS table to the selected feature columns.
# NOTE: this re-binds `udslong_filtered`, which earlier held the
# overlap-filtered (all-visits) UDS table.
udslong_filtered = udslong_first_instance.loc[:, all_feats]

# Persist the column-filtered table (optional)
udslong_filtered.to_csv('filtered_udslong.csv', index=True)

# Show the resulting table to verify the selection
print(udslong_filtered)

In [None]:
import pandas as pd

# Drop the specified columns
columns_to_drop = ["NACCSTYR", "NACCTIYR", "TBIYEAR"] # 90% Missing Information

# Only drop columns that are still present, so that re-running this cell does
# not raise a KeyError (the cell re-binds udslong_filtered to its own output).
columns_present = [col for col in columns_to_drop if col in udslong_filtered.columns]
columns_absent = [col for col in columns_to_drop if col not in udslong_filtered.columns]
udslong_filtered = udslong_filtered.drop(columns=columns_present)

# Print the names of the columns that were dropped
print("\nColumns dropped:")
print(columns_present)
if columns_absent:
    print("Columns already absent (not dropped again):")
    print(columns_absent)

# Display the first few rows of the new dataset to verify
print("\nFirst few rows of the dataset after dropping columns:")
print(udslong_filtered.head())

In [None]:
import pandas as pd

# -4 = NA Value in NACC Data

# Boolean mask marking every cell that holds the -4 missing-value code
is_neg4 = udslong_filtered == -4

# Columns containing at least one -4
columns_with_neg4 = udslong_filtered.columns[is_neg4.any()]

# Number of rows, used as the denominator of the percentages
total_rows = udslong_filtered.shape[0]

# Per-column count of -4 values, restricted to the affected columns
neg4_counts = is_neg4[columns_with_neg4].sum()

# The same counts expressed as a percentage of all rows
neg4_percentages = (neg4_counts / total_rows) * 100

print("Percentage of -4 values in each feature:")
print(neg4_percentages)

# Two-column table (feature name, percentage) for nicer notebook rendering
neg4_percentages_df = (
    neg4_percentages
    .rename_axis('Feature')
    .reset_index(name='Percentage of -4s')
)

# Display the DataFrame
neg4_percentages_df

In [None]:
import pandas as pd

# Boolean mask marking every cell that holds the -4 missing-value code
is_neg4 = udslong_filtered == -4

# Columns containing at least one -4
columns_with_neg4 = udslong_filtered.columns[is_neg4.any()]

# Number of rows, used as the denominator of the percentages
total_rows = udslong_filtered.shape[0]

# Per-column count and percentage of -4 values
neg4_counts = is_neg4[columns_with_neg4].sum()
neg4_percentages = (neg4_counts / total_rows) * 100

# Columns in which more than half of the rows are -4
columns_greater_than_50_percent = neg4_percentages.loc[neg4_percentages.gt(50)]

# Report how many such columns there are and which ones
print(f"Number of columns with greater than 50% -4 values: {len(columns_greater_than_50_percent)}")
print("Columns with greater than 50% -4 values:")
print(columns_greater_than_50_percent)

# Two-column table (feature name, percentage) for nicer notebook rendering
columns_greater_than_50_percent_df = (
    columns_greater_than_50_percent
    .rename_axis('Feature')
    .reset_index(name='Percentage of -4s')
)

# Display the DataFrame
columns_greater_than_50_percent_df

In [None]:
import pandas as pd

# Dropping Columns with over 50% missing Data

# List of columns to drop
columns_to_drop = [
    'CVPACE', 'TRAUMBRF', 'TRAUMEXT', 'TRAUMCHR', 'NCOTHR', 'FOCLSYM',
    'FOCLSIGN', 'MMSEORDA', 'MMSEORLO', 'PENTAGON', 'NACCMMSE', 'LOGIMEM',
    'MEMUNITS', 'MEMTIME', 'DIGIF', 'DIGIFLEN', 'DIGIB', 'DIGIBLEN', 'BOSTON',
    'NPIQINFX', 'PSYCDISX', 'ABUSX', 'OTHSLEEX', 'ARTHTYPX', 'NACCOMSX',
    'NACCOMX', 'NACCFMSX', 'NACCFMX', 'NACCAMSX', 'NACCAMX', 'RACEX'
]

# Only drop columns that are still present, so that re-running this cell does
# not raise a KeyError (the cell re-binds udslong_filtered to its own output).
columns_present = [col for col in columns_to_drop if col in udslong_filtered.columns]
columns_absent = [col for col in columns_to_drop if col not in udslong_filtered.columns]
udslong_filtered = udslong_filtered.drop(columns=columns_present)

# Print the names of the columns that were dropped
print("\nColumns dropped:")
print(columns_present)
if columns_absent:
    print("Columns already absent (not dropped again):")
    print(columns_absent)

# Display the first few rows of the new dataset to verify
print("\nFirst few rows of the dataset after dropping columns:")
print(udslong_filtered.head())

# If you want to save the updated DataFrame to a CSV file, uncomment the following line
# udslong_filtered.to_csv('udslong_filtered_dropped.csv', index=True)

### Data Imputation Using KNN

In [None]:
import pandas as pd
import numpy as np

# Convert -4 values (the NACC "not available" code) to NaN so that the
# imputer treats them as missing. A new frame is bound instead of mutating in
# place, which keeps the cell safe to re-run.
udslong_filtered = udslong_filtered.replace(-4, np.nan)

# Display the first few rows of the updated DataFrame to verify
print("\nFirst few rows of the dataset after converting -4 to NA:")
print(udslong_filtered.head())

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

# Initialize the KNN imputer with the desired number of neighbors
knn_imputer = KNNImputer(n_neighbors=5)

# The diagnosis column is the prediction target of the models below. It is
# excluded from imputation so that the label neither influences the neighbour
# search (label leakage into the imputed features) nor gets altered itself.
label_column = 'NACCUDSD'
feature_columns = [col for col in udslong_filtered.columns if col != label_column]

# Apply the KNN imputer to the feature columns only
imputed_values = knn_imputer.fit_transform(udslong_filtered[feature_columns])
udslong_imputed = pd.DataFrame(imputed_values, columns=feature_columns, index=udslong_filtered.index)

# Re-attach the untouched label and restore the original column order
if label_column in udslong_filtered.columns:
    udslong_imputed[label_column] = udslong_filtered[label_column]
    udslong_imputed = udslong_imputed[udslong_filtered.columns]

# Display the first few rows of the imputed DataFrame to verify
print("\nFirst few rows of the dataset after KNN imputation:")
print(udslong_imputed.head())

In [None]:
# Write the imputed UDS table to disk, keeping the index as the first column
imputed_csv_name = 'udslong_imputed.csv'
udslong_imputed.to_csv(imputed_csv_name, index=True)

# Confirm that the file has been written
print("The imputed dataset has been saved as 'udslong_imputed.csv'.")

## MRI Clean

In [None]:
import pandas as pd

# Cortical regions whose measurements are kept; the order fixes the column
# order of the resulting table.
cortical_regions = [
    'CAUDALANTERIORCINGULATE', 'CAUDALMIDDLEFRONTAL', 'CUNEUS', 'ENTORHINAL',
    'FUSIFORM', 'INFERIORPARIETAL', 'INFERIORTEMPORAL', 'ISTHMUSCINGULATE',
    'LATERALOCCIPITAL', 'LATERALORBITOFRONTAL', 'LINGUAL', 'MEDIALORBITOFRONTAL',
    'MIDDLETEMPORAL', 'PARAHIPPOCAMPAL', 'PARACENTRAL', 'PARSOPERCULARIS',
    'PARSORBITALIS', 'PARSTRIANGULARIS', 'PERICALCARINE', 'POSTCENTRAL',
    'POSTERIORCINGULATE', 'PRECENTRAL', 'PRECUNEUS', 'ROSTRALANTERIORCINGULATE',
    'ROSTRALMIDDLEFRONTAL', 'SUPERIORFRONTAL', 'SUPERIORPARIETAL', 'SUPERIORTEMPORAL',
    'SUPRAMARGINAL', 'TRANSVERSETEMPORAL', 'INSULA',
]

# Column names follow '<hemisphere>_<region>_<measure>'. For every region the
# order is LH/RH gray-matter volume (GVOL) followed by LH/RH average
# thickness (AVGTH).
variables_to_keep = [
    f'{hemisphere}_{region}_{measure}'
    for region in cortical_regions
    for measure in ('GVOL', 'AVGTH')
    for hemisphere in ('LH', 'RH')
]

# NACCID is the DataFrame index rather than a column, so it is carried along
# automatically; the list of columns to select is therefore unchanged.
variables_to_keep_with_index = variables_to_keep

# Filter the mriscanlong_first_instance DataFrame.
# .copy() yields an independent frame, so adding the NACCUDSD column below
# neither triggers pandas' SettingWithCopyWarning nor depends on view/copy
# semantics of the column selection.
mriscanlong_first_instance_filtered = mriscanlong_first_instance[variables_to_keep_with_index].copy()

# Ensure NACCID is set as the index name for both DataFrames
udslong_first_instance.index.name = 'NACCID'
mriscanlong_first_instance_filtered.index.name = 'NACCID'

# Attach NACCUDSD from udslong_first_instance; the assignment aligns on the index
mriscanlong_first_instance_filtered['NACCUDSD'] = udslong_first_instance['NACCUDSD']

# Display the final DataFrame
print("Filtered MRI DataFrame with NACCUDSD:")
print(mriscanlong_first_instance_filtered)

In [None]:
# Save the cleaned MRI DataFrame (selected columns + NACCUDSD) to a CSV file
mriscanlong_first_instance_filtered.to_csv('mriscanclean.csv', index=True)

# Print a message to confirm the file has been saved
# (the message previously named the UDS file instead of the file written here)
print("The cleaned MRI dataset has been saved as 'mriscanclean.csv'.")

# Graph Neural Networks

In [None]:
# Install Pytorch Geometric Library
!pip install torch-geometric

## UDS Structured Graph Neural Network

In [None]:
# Download the imputed UDS dataset from Google Drive.
# The file ID is stored under its own name: binding it to `udslong_imputed`
# would overwrite the imputed DataFrame of the same name created in the
# "Data Imputation Using KNN" section.
udslong_imputed_file_id = 'GoogleDriveFileId'
downloaded = drive.CreateFile({'id': udslong_imputed_file_id})
downloaded.GetContentFile("GoogleDriveFileName.csv")

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# UDS feature groups (NACC variable names); each group becomes one fully
# connected cluster of nodes in the feature graph.
# NOTE(review): unlike the column selection in the "UDS Clean" section, these
# groups contain neither NACCUDSD nor the CDR columns -- confirm the graph's
# node set matches the columns of the dataset it is applied to.

# Demographics
input_demo_feats = [
    'SEX', 'HISPANIC', 'HISPOR', 'RACE', 'RACEX', 'PRIMLANG', 'EDUC',
    'MARISTAT', 'NACCLIVS', 'INDEPEND', 'RESIDENC', 'NACCNIHR', 'NACCAGE',
]
# Family history
input_fam_hist_feats = [
    'NACCFAM', 'NACCMOM', 'NACCDAD',
    'NACCAM', 'NACCAMX', 'NACCAMS', 'NACCAMSX',
    'NACCFM', 'NACCFMX', 'NACCFMS', 'NACCFMSX',
    'NACCOM', 'NACCOMX', 'NACCOMS', 'NACCOMSX',
    'NACCFADM', 'NACCFFTD',
]
# Patient health history and medications
input_patient_hist_feats = [
    'ANYMEDS', 'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK',
    'CVHATT', 'CVAFIB', 'CVANGIO', 'CVBYPASS', 'CVPACDEF', 'CVPACE', 'CVCHF', 'CVOTHR',
    'CBSTROKE', 'NACCSTYR', 'CBTIA', 'NACCTIYR',
    'SEIZURES', 'NACCTBI', 'TBI', 'TBIBRIEF', 'TRAUMBRF', 'TBIEXTEN', 'TRAUMEXT',
    'TBIWOLOS', 'TRAUMCHR', 'TBIYEAR',
    'NCOTHR', 'DIABETES', 'DIABTYPE', 'HYPERTEN', 'HYPERCHO', 'B12DEF', 'THYROID',
    'ARTHRIT', 'ARTHTYPE', 'ARTHTYPX', 'ARTHUPEX', 'ARTHLOEX', 'ARTHSPIN', 'ARTHUNK',
    'INCONTU', 'INCONTF', 'APNEA', 'RBD', 'INSOMN', 'OTHSLEEP', 'OTHSLEEX',
    'ALCOHOL', 'ABUSOTHR', 'ABUSX',
    'PTSD', 'BIPOLAR', 'SCHIZ', 'DEP2YRS', 'DEPOTHR', 'ANXIETY', 'OCD',
    'NPSYDEV', 'PSYCDIS', 'PSYCDISX',
    'NACCAAAS', 'NACCAANX', 'NACCAC', 'NACCACEI', 'NACCADEP', 'NACCAHTN', 'NACCAMD',
    'NACCANGI', 'NACCAPSY', 'NACCBETA',
    'NACCCCBS', 'NACCDBMD', 'NACCDIUR', 'NACCEMD', 'NACCEPMD', 'NACCHTNC',
    'NACCLIPL', 'NACCNSD', 'NACCPDMD', 'NACCVASD',
]
# Physical examination
input_physical_feats = [
    'HEIGHT', 'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE',
    'VISION', 'VISCORR', 'VISWCORR',
    'HEARING', 'HEARAID', 'HEARWAID',
    'NACCBMI', 'FOCLSYM', 'FOCLSIGN', 'NACCNREX', 'NORMEXAM',
]
# NPI-Q
input_npi_feats = [
    'NPIQINF', 'NPIQINFX',
    'DEL', 'DELSEV', 'HALL', 'HALLSEV', 'AGIT', 'AGITSEV',
    'DEPD', 'DEPDSEV', 'ANX', 'ANXSEV', 'ELAT', 'ELATSEV',
    'APA', 'APASEV', 'DISN', 'DISNSEV', 'IRR', 'IRRSEV',
    'MOT', 'MOTSEV', 'NITE', 'NITESEV', 'APP', 'APPSEV',
]
# GDS
input_gds_feats = [
    'NOGDS', 'SATIS', 'DROPACT', 'EMPTY', 'BORED', 'SPIRITS', 'AFRAID',
    'HAPPY', 'HELPLESS', 'STAYHOME', 'MEMPROB', 'WONDRFUL', 'WRTHLESS',
    'ENERGY', 'HOPELESS', 'BETTER', 'NACCGDS',
]
# FAQ
input_faq_feats = [
    'BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP',
    'EVENTS', 'PAYATTN', 'REMDATES', 'TRAVEL', 'DECSUB',
]
# Neuropsychological battery
input_np_feats = [
    'MMSEORDA', 'MMSEORLO', 'PENTAGON', 'NACCMMSE', 'LOGIMEM', 'MEMUNITS', 'MEMTIME',
    'DIGIF', 'DIGIFLEN', 'DIGIB', 'DIGIBLEN',
    'ANIMALS', 'VEG', 'TRAILA', 'TRAILARR', 'TRAILALI', 'TRAILB', 'TRAILBRR', 'TRAILBLI',
    'BOSTON', 'MOCATOTS',
]
# APOE genotype
input_gene_feats = ['NACCAPOE']

# All candidate features, in group order
all_feats = [
    *input_demo_feats, *input_fam_hist_feats, *input_patient_hist_feats,
    *input_physical_feats, *input_npi_feats, *input_gds_feats,
    *input_faq_feats, *input_np_feats, *input_gene_feats,
]

# Variables removed during cleaning (mostly-missing columns)
variables_to_remove = [
    'NPIQINFX', 'PSYCDISX', 'ABUSX', 'OTHSLEEX', 'ARTHTYPX', 'NACCOMSX',
    'NACCOMX', 'NACCFMSX', 'NACCFMX', 'NACCAMSX', 'NACCAMX', 'RACEX',
    'CVPACE', 'TRAUMBRF', 'TRAUMEXT', 'TRAUMCHR', 'NCOTHR', 'FOCLSYM',
    'FOCLSIGN', 'MMSEORDA', 'MMSEORLO', 'PENTAGON', 'NACCMMSE', 'LOGIMEM',
    'MEMUNITS', 'MEMTIME', 'DIGIF', 'DIGIFLEN', 'DIGIB', 'DIGIBLEN', 'BOSTON',
    'NACCSTYR', 'NACCTIYR', 'TBIYEAR',
]

# Features that remain after cleaning, original order preserved
removed_lookup = set(variables_to_remove)
filtered_feats = [feat for feat in all_feats if feat not in removed_lookup]

# Square all-zero adjacency matrix, one row/column per remaining feature
n_nodes = len(filtered_feats)
adj_matrix = np.zeros((n_nodes, n_nodes))

# Update the adjacency matrix: fully connected graph within each feature group
def update_adj_matrix(features, matrix, feature_order=None):
    """Connect every pair of features of one group in ``matrix`` (in place).

    Args:
        features: Feature names forming one group. Names that do not occur in
            ``feature_order`` are ignored.
        matrix: Square adjacency matrix, indexable as ``matrix[i][j]``, whose
            rows/columns follow ``feature_order``. Entries for every pair of
            group members (including each member with itself) are set to 1.
        feature_order: Feature names in row/column order. Defaults to the
            notebook-level ``filtered_feats`` list.

    Returns:
        None. ``matrix`` is modified in place.
    """
    if feature_order is None:
        feature_order = filtered_feats

    # Name -> position lookup built once instead of a list.index() scan per
    # feature. setdefault keeps the first position of a repeated name, which
    # is what list.index() would return.
    position_of = {}
    for position, name in enumerate(feature_order):
        position_of.setdefault(name, position)

    indices = [position_of[feat] for feat in features if feat in position_of]
    for i in indices:
        row = matrix[i]
        for j in indices:
            row[j] = 1

# Feature groups; the members of each group are connected to one another
feature_groups = [
    input_demo_feats,
    input_fam_hist_feats,
    input_patient_hist_feats,
    input_physical_feats,
    input_npi_feats,
    input_gds_feats,
    input_faq_feats,
    input_np_feats,
    input_gene_feats,
]

# Fill in one fully connected block per group
for group_features in feature_groups:
    update_adj_matrix(group_features, adj_matrix)

# Labelled view of the adjacency matrix (rows and columns named by feature)
adj_df = pd.DataFrame(data=adj_matrix, index=filtered_feats, columns=filtered_feats)

# Display the adjacency matrix
print("Adjacency Matrix:")
print(adj_df)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F

# Load the imputed UDS dataset
dataset_path = 'udslong_imputed.csv'
uds_df = pd.read_csv(dataset_path)

# Binarize the NACCUDSD column: values 2, 3 and 4 -> 1, everything else -> 0
uds_df['NACCUDSD'] = uds_df['NACCUDSD'].isin([2, 3, 4]).astype(int)

# Extract labels
labels = uds_df['NACCUDSD'].values

# Remove NACCUDSD and NACCID from the feature groups
# NOTE(review): the CDR columns kept by the cleaning section remain among the
# features here -- confirm that using them as predictors is intended.
features_df = uds_df.drop(columns=['NACCUDSD', 'NACCID'])

# Separate numeric and non-numeric columns
numeric_cols = features_df.select_dtypes(include=[np.number]).columns
non_numeric_cols = features_df.select_dtypes(exclude=[np.number]).columns

# Encode non-numeric features
if len(non_numeric_cols) > 0:
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword removed in 1.4; fall back to the old name on older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded_features = encoder.fit_transform(features_df[non_numeric_cols])
    # Combine encoded non-numeric features with numeric features
    features_combined = np.hstack((features_df[numeric_cols].values, encoded_features))
    # One-hot columns have no counterpart in the feature graph -> no name
    node_names = list(numeric_cols) + [None] * encoded_features.shape[1]
else:
    features_combined = features_df[numeric_cols].values
    node_names = list(numeric_cols)

# Normalize the features
# NOTE(review): the scaler is fitted on all patients before the
# cross-validation split, so test-fold statistics influence the scaling.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_combined)

# Reshape the features to (num_patients, num_regions, num_features):
# every feature column becomes one graph node carrying a single scalar.
num_patients = scaled_features.shape[0]
num_regions = scaled_features.shape[1]
num_features = 1

features = scaled_features.reshape(num_patients, num_regions, num_features)

# Structured adjacency: start from self-loops only, then copy in the
# within-group connections from `adj_df` (built in the previous cell) for
# every node that appears there. Previously the group structure was discarded
# in favour of a plain identity matrix, which made this model identical to the
# "unstructured" one below. Nodes unknown to `adj_df` stay isolated.
adj_matrix = np.eye(num_regions)
known_nodes = [(pos, name) for pos, name in enumerate(node_names) if name in adj_df.index]
if known_nodes:
    known_positions = [pos for pos, _ in known_nodes]
    known_names = [name for _, name in known_nodes]
    adj_matrix[np.ix_(known_positions, known_positions)] = adj_df.loc[known_names, known_names].to_numpy()
np.fill_diagonal(adj_matrix, 1)

# The graph topology is the same for every patient, so the edge list is built
# once and shared by all Data objects.
edge_index = torch.tensor(np.array(np.nonzero(adj_matrix)), dtype=torch.long)
print(f'Structured graph: {num_regions} nodes, {edge_index.shape[1]} directed edges (incl. self-loops)')

# Convert to PyTorch Geometric data: one graph per patient
data_list = []
for i in range(num_patients):
    x = torch.tensor(features[i], dtype=torch.float)
    y = torch.tensor([labels[i]], dtype=torch.long)
    data_list.append(Data(x=x, edge_index=edge_index, y=y))

# Define the GNN model with 4 layers
class GNN(torch.nn.Module):
    """Four stacked GCN layers, mean pooling per graph and a 2-way linear head.

    The input width is read from the notebook-level ``num_features``.
    ``forward`` returns log-probabilities of shape (num_graphs, 2).
    """

    def __init__(self):
        super().__init__()
        # Layer widths: num_features -> 64 -> 32 -> 16 -> 8
        widths = (num_features, 64, 32, 16, 8)
        self.conv1 = GCNConv(widths[0], widths[1])
        self.conv2 = GCNConv(widths[1], widths[2])
        self.conv3 = GCNConv(widths[2], widths[3])
        self.conv4 = GCNConv(widths[3], widths[4])
        self.fc = torch.nn.Linear(widths[4], 2)

    def forward(self, data):
        edge_index = data.edge_index
        hidden = data.x
        # Each graph convolution is followed by a ReLU
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, edge_index))
        # Aggregate node features to graph-level features
        pooled = global_mean_pool(hidden, data.batch)
        return F.log_softmax(self.fc(pooled), dim=1)

# Initialize model, loss, and optimizer
def initialize_model():
    """Create a fresh, untrained GNN with its optimizer and loss.

    Returns:
        (model, optimizer, criterion): a new ``GNN``, an Adam optimizer over
        its parameters (lr=0.01) and the loss function.
    """
    model = GNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # GNN.forward already returns log-probabilities (log_softmax), so the
    # matching loss is NLLLoss; CrossEntropyLoss expects raw logits and would
    # apply log_softmax a second time.
    criterion = torch.nn.NLLLoss()
    return model, optimizer, criterion

# Training loop
def train(model, optimizer, criterion, train_loader):
    """Run one pass over ``train_loader``, taking one optimizer step per batch.

    Args:
        model: Module called as ``model(batch)``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(output, batch.y)``.
        train_loader: Iterable of batches exposing a ``y`` attribute.
    """
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch), batch.y)
        batch_loss.backward()
        optimizer.step()

# Evaluation function
def test(model, loader):
    model.eval()
    correct = 0
    for data in loader:
        out = model(data)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)

# Perform 5-fold cross-validation
# Seed torch as well: the fold split is already seeded through KFold, but
# weight initialisation and batch shuffling draw from torch's RNG, so without
# this the reported accuracies change from run to run.
torch.manual_seed(42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []

for train_index, test_index in kf.split(data_list):
    train_data_list = [data_list[i] for i in train_index]
    test_data_list = [data_list[i] for i in test_index]

    train_loader = DataLoader(train_data_list, batch_size=20, shuffle=True)
    test_loader = DataLoader(test_data_list, batch_size=20, shuffle=False)

    # A fresh model per fold, so no weights carry over between folds
    model, optimizer, criterion = initialize_model()

    for epoch in range(10):
        train(model, optimizer, criterion, train_loader)

    test_acc = test(model, test_loader)
    fold_accuracies.append(test_acc)
    print(f'Test Accuracy for fold: {test_acc:.4f}')

# Summary over the folds
average_accuracy = np.mean(fold_accuracies)
std_accuracy = np.std(fold_accuracies)
print(f'Average Test Accuracy: {average_accuracy:.4f}')
print(f'Standard Deviation of Test Accuracy: {std_accuracy:.4f}')

## UDS Unstructured Graph Neural Network

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F

# Load the imputed UDS dataset
dataset_path = 'udslong_imputed.csv'
uds_df = pd.read_csv(dataset_path)

# Binarize the NACCUDSD column: values 2, 3 and 4 -> 1, everything else -> 0
uds_df['NACCUDSD'] = uds_df['NACCUDSD'].isin([2, 3, 4]).astype(int)

# Extract labels
labels = uds_df['NACCUDSD'].values

# Remove NACCUDSD and NACCID from the feature groups
# NOTE(review): the CDR columns kept by the cleaning section remain among the
# features here -- confirm that using them as predictors is intended.
features_df = uds_df.drop(columns=['NACCUDSD', 'NACCID'])

# Separate numeric and non-numeric columns
numeric_cols = features_df.select_dtypes(include=[np.number]).columns
non_numeric_cols = features_df.select_dtypes(exclude=[np.number]).columns

# Encode non-numeric features
if len(non_numeric_cols) > 0:
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword removed in 1.4; fall back to the old name on older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded_features = encoder.fit_transform(features_df[non_numeric_cols])
    # Combine encoded non-numeric features with numeric features
    features_combined = np.hstack((features_df[numeric_cols].values, encoded_features))
else:
    features_combined = features_df[numeric_cols].values

# Normalize the features
# NOTE(review): the scaler is fitted on all patients before the
# cross-validation split, so test-fold statistics influence the scaling.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_combined)

# Reshape the features to (num_patients, num_regions, num_features):
# every feature column becomes one graph node carrying a single scalar.
num_patients = scaled_features.shape[0]
num_regions = scaled_features.shape[1]
num_features = 1

features = scaled_features.reshape(num_patients, num_regions, num_features)

# Unstructured graph: identity adjacency, i.e. self-loops only
adj_matrix = np.eye(num_regions)

# The graph topology is the same for every patient, so the edge list is built
# once and shared by all Data objects instead of being recomputed per patient.
edge_index = torch.tensor(np.array(np.nonzero(adj_matrix)), dtype=torch.long)

# Convert to PyTorch Geometric data: one graph per patient
data_list = []
for i in range(num_patients):
    x = torch.tensor(features[i], dtype=torch.float)
    y = torch.tensor([labels[i]], dtype=torch.long)
    data_list.append(Data(x=x, edge_index=edge_index, y=y))

# Define the GNN model with 4 layers
class GNN(torch.nn.Module):
    """Four stacked GCN layers, mean pooling per graph and a 2-way linear head.

    The input width is read from the notebook-level ``num_features``.
    ``forward`` returns log-probabilities of shape (num_graphs, 2).
    """

    def __init__(self):
        super().__init__()
        # Layer widths: num_features -> 64 -> 32 -> 16 -> 8
        widths = (num_features, 64, 32, 16, 8)
        self.conv1 = GCNConv(widths[0], widths[1])
        self.conv2 = GCNConv(widths[1], widths[2])
        self.conv3 = GCNConv(widths[2], widths[3])
        self.conv4 = GCNConv(widths[3], widths[4])
        self.fc = torch.nn.Linear(widths[4], 2)

    def forward(self, data):
        edge_index = data.edge_index
        hidden = data.x
        # Each graph convolution is followed by a ReLU
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, edge_index))
        # Aggregate node features to graph-level features
        pooled = global_mean_pool(hidden, data.batch)
        return F.log_softmax(self.fc(pooled), dim=1)

# Initialize model, loss, and optimizer
def initialize_model():
    """Create a fresh, untrained GNN with its optimizer and loss.

    Returns:
        (model, optimizer, criterion): a new ``GNN``, an Adam optimizer over
        its parameters (lr=0.01) and the loss function.
    """
    model = GNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # GNN.forward already returns log-probabilities (log_softmax), so the
    # matching loss is NLLLoss; CrossEntropyLoss expects raw logits and would
    # apply log_softmax a second time.
    criterion = torch.nn.NLLLoss()
    return model, optimizer, criterion

# Training loop
def train(model, optimizer, criterion, train_loader):
    """Run one pass over ``train_loader``, taking one optimizer step per batch.

    Args:
        model: Module called as ``model(batch)``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(output, batch.y)``.
        train_loader: Iterable of batches exposing a ``y`` attribute.
    """
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch), batch.y)
        batch_loss.backward()
        optimizer.step()

# Evaluation function
def test(model, loader):
    model.eval()
    correct = 0
    for data in loader:
        out = model(data)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)

# Perform 5-fold cross-validation
# Seed torch as well: the fold split is already seeded through KFold, but
# weight initialisation and batch shuffling draw from torch's RNG, so without
# this the reported accuracies change from run to run.
torch.manual_seed(42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []

for train_index, test_index in kf.split(data_list):
    train_data_list = [data_list[i] for i in train_index]
    test_data_list = [data_list[i] for i in test_index]

    train_loader = DataLoader(train_data_list, batch_size=20, shuffle=True)
    test_loader = DataLoader(test_data_list, batch_size=20, shuffle=False)

    # A fresh model per fold, so no weights carry over between folds
    model, optimizer, criterion = initialize_model()

    for epoch in range(10):
        train(model, optimizer, criterion, train_loader)

    test_acc = test(model, test_loader)
    fold_accuracies.append(test_acc)
    print(f'Test Accuracy for fold: {test_acc:.4f}')

# Summary over the folds
average_accuracy = np.mean(fold_accuracies)
std_accuracy = np.std(fold_accuracies)
print(f'Average Test Accuracy: {average_accuracy:.4f}')
print(f'Standard Deviation of Test Accuracy: {std_accuracy:.4f}')

## MRI Structured Graph Neural Network

In [None]:
# Fetch the two MRI inputs from Google Drive by file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
# NOTE: `mri_adjacency_matrix` and `mriscanclean` hold file-ID strings here;
# the next cell re-binds `mriscanclean` to the DataFrame read from disk.

# Adjacency matrix of the brain-region graph
mri_adjacency_matrix = 'GoogleDriveFileID'
downloaded = drive.CreateFile(dict(id=mri_adjacency_matrix))
downloaded.GetContentFile("GoogleDriveFileName.csv")

# Cleaned MRI table
mriscanclean = 'GoogleDriveFileID'
downloaded = drive.CreateFile(dict(id=mriscanclean))
downloaded.GetContentFile("GoogleDriveFileName.csv")

In [None]:
# Read both MRI inputs; first column -> index, first row -> column names
mri_csv_options = dict(index_col=0, header=0)
mriadjacencymatrix = pd.read_csv('combined_adjacency_matrix.csv', **mri_csv_options)
mriscanclean = pd.read_csv('mriscanclean.csv', **mri_csv_options)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load the MRI dataset
mriscanclean = pd.read_csv('mriscanclean.csv', index_col=0)

# Preprocess the MRI dataset: standardise gray-matter volume and average
# thickness columns separately.
# NOTE(review): both scalers are fitted on all patients before the
# cross-validation split, so test-fold statistics influence the scaling.
vol_columns = [col for col in mriscanclean.columns if 'GVOL' in col]
thick_columns = [col for col in mriscanclean.columns if 'AVGTH' in col]
scaler_vol = StandardScaler()
scaler_thick = StandardScaler()
mriscanclean[vol_columns] = scaler_vol.fit_transform(mriscanclean[vol_columns])
mriscanclean[thick_columns] = scaler_thick.fit_transform(mriscanclean[thick_columns])

num_patients = mriscanclean.shape[0]
# One node per volume column (62 with the cleaned MRI table written above:
# 31 regions for each hemisphere). Derived from the data so that a changed
# column selection cannot silently disagree with a hard-coded count.
num_nodes = len(vol_columns)
num_features = 2  # gray volume and average thickness
if len(thick_columns) != num_nodes:
    raise ValueError(
        f'Expected one thickness column per volume column, got '
        f'{len(thick_columns)} thickness vs {num_nodes} volume columns'
    )

# (num_patients, num_nodes, 2) tensor built in one step instead of a per-row
# loop: feature 0 = volume, feature 1 = thickness. Node k corresponds to the
# k-th volume column and the k-th thickness column.
mri_patient_features = np.stack(
    [
        mriscanclean[vol_columns].to_numpy(dtype=float),
        mriscanclean[thick_columns].to_numpy(dtype=float),
    ],
    axis=-1,
)
# Binary target: NACCUDSD in {2, 3, 4} -> 1, everything else -> 0
mri_labels = mriscanclean['NACCUDSD'].isin([2, 3, 4]).astype(int).values

# Load the adjacency matrix
# NOTE(review): its row/column order is assumed to match the node order
# defined by vol_columns -- confirm against how the matrix was produced.
mri_adj_matrix = pd.read_csv('combined_adjacency_matrix.csv', index_col=0, header=0).values
if mri_adj_matrix.shape != (num_nodes, num_nodes):
    raise ValueError(
        f'Adjacency matrix has shape {mri_adj_matrix.shape}, '
        f'expected ({num_nodes}, {num_nodes})'
    )

class GNN(torch.nn.Module):
    """Four stacked GCN layers, mean pooling per graph and a sigmoid output.

    Args:
        in_channels: Number of input features per node.

    ``forward`` returns one value in (0, 1) per graph, shape (num_graphs, 1).
    """

    def __init__(self, in_channels):
        super().__init__()
        # Layer widths: in_channels -> 64 -> 32 -> 16 -> 8 -> 1
        widths = (in_channels, 64, 32, 16, 8)
        self.conv1 = GCNConv(widths[0], widths[1])
        self.conv2 = GCNConv(widths[1], widths[2])
        self.conv3 = GCNConv(widths[2], widths[3])
        self.conv4 = GCNConv(widths[3], widths[4])
        self.fc = torch.nn.Linear(widths[4], 1)

    def forward(self, data):
        edge_index = data.edge_index
        hidden = data.x
        # Each graph convolution is followed by a ReLU
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, edge_index))
        # Global mean pooling: one feature vector per graph
        pooled = global_mean_pool(hidden, data.batch)
        # Sigmoid squashes the single output into (0, 1)
        return torch.sigmoid(self.fc(pooled))

class BrainDataset(Dataset):
    """Per-patient graph dataset in which every patient shares one connectivity pattern.

    Args:
        patient_features: Indexable collection; item `idx` becomes the node
            feature matrix of graph `idx`.
        adjacency_matrix: Array whose non-zero entries define the edges. Only
            the positions are used; the values are not kept as edge weights.
        labels: Indexable collection of per-patient targets.
    """

    def __init__(self, patient_features, adjacency_matrix, labels):
        self.patient_features = patient_features
        self.adjacency_matrix = adjacency_matrix
        self.labels = labels
        # The edge list is the same for every patient, so derive it once here
        # instead of re-scanning the adjacency matrix on every item access.
        self._edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)

    def __len__(self):
        """Number of patients in the dataset."""
        return len(self.patient_features)

    def __getitem__(self, idx):
        """Build the graph for patient `idx`: float features, shared edges, float label."""
        x = torch.tensor(self.patient_features[idx], dtype=torch.float)
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return Data(x=x, edge_index=self._edge_index, y=y)

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for train_index, test_index in kf.split(mri_labels):
    # Slice this fold's patients out of the feature tensor and the label vector.
    mri_train_features = mri_patient_features[train_index]
    mri_test_features = mri_patient_features[test_index]
    mri_train_labels = mri_labels[train_index]
    mri_test_labels = mri_labels[test_index]

    # Only the training loader reshuffles; evaluation keeps the fold order.
    mri_train_dataset = BrainDataset(mri_train_features, mri_adj_matrix, mri_train_labels)
    mri_test_dataset = BrainDataset(mri_test_features, mri_adj_matrix, mri_test_labels)
    mri_train_loader = DataLoader(mri_train_dataset, batch_size=20, shuffle=True)
    mri_test_loader = DataLoader(mri_test_dataset, batch_size=20, shuffle=False)

    # A fresh model, optimizer and loss per fold, so nothing carries over between folds.
    model = GNN(in_channels=2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()

    # Training: 20 passes over the fold's training split.
    model.train()
    for epoch in range(20):
        for data in mri_train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            # Labels are reshaped to a column to line up with the model output.
            loss = criterion(model(data), data.y.view(-1, 1))
            loss.backward()
            optimizer.step()

    # Evaluation on the held-out split, without gradient tracking.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data in mri_test_loader:
            data = data.to(device)
            out = model(data)
            # Threshold the sigmoid scores at 0.5 to obtain hard 0/1 predictions.
            preds = (out > 0.5).float()
            all_preds += list(preds.cpu().numpy())
            all_labels += list(data.y.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    accuracies.append(accuracy)
    print(f'Fold Test Accuracy: {accuracy:.4f}')

# Print cross-validation results
mean_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
print(f'5-Fold Cross-Validation Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}')

## MRI Unstructured Graph Neural Network

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load the MRI dataset
mriscanclean = pd.read_csv('mriscanclean.csv', index_col=0)

# Preprocess the MRI dataset: the two feature groups are picked by column-name fragment
vol_columns = [name for name in mriscanclean.columns if 'GVOL' in name]
thick_columns = [name for name in mriscanclean.columns if 'AVGTH' in name]
# Each group gets its own scaler and is standardised in place.
scaler_vol, scaler_thick = StandardScaler(), StandardScaler()
mriscanclean[vol_columns] = scaler_vol.fit_transform(mriscanclean[vol_columns])
mriscanclean[thick_columns] = scaler_thick.fit_transform(mriscanclean[thick_columns])

# Node-feature tensor laid out as (patient, region, feature).
num_patients = len(mriscanclean)
num_nodes = 62  # 31 regions for each hemisphere
num_features = 2  # gray volume and average thickness
mri_patient_features = np.zeros((num_patients, num_nodes, num_features))
# Channel 0 holds the volume columns, channel 1 the thickness columns. The
# whole-column assignment still raises if a group does not have `num_nodes` columns.
mri_patient_features[:, :, 0] = mriscanclean[vol_columns].values
mri_patient_features[:, :, 1] = mriscanclean[thick_columns].values

# Binary target: NACCUDSD codes 2, 3 and 4 become 1, every other value 0.
mri_labels = mriscanclean['NACCUDSD'].map(lambda code: 1 if code in (2, 3, 4) else 0).values

# Create an identity adjacency matrix for MRI
mri_adj_matrix = np.eye(num_nodes)  # Identity matrix for MRI

class GNN(torch.nn.Module):
    """Graph-level binary classifier: four GCN layers, mean pooling, one linear head.

    Args:
        in_channels: Number of input features carried by each node.
    """

    def __init__(self, in_channels):
        super().__init__()
        # Channel widths shrink layer by layer: in_channels -> 64 -> 32 -> 16 -> 8.
        self.conv1 = GCNConv(in_channels, 64)
        self.conv2 = GCNConv(64, 32)
        self.conv3 = GCNConv(32, 16)
        self.conv4 = GCNConv(16, 8)
        # Maps the pooled 8-dimensional graph embedding to a single output unit.
        self.fc = torch.nn.Linear(8, 1)

    def forward(self, data):
        """Return one sigmoid-activated score per graph in `data`.

        `data` must expose `x`, `edge_index` and `batch`, which are read below.
        """
        hidden = data.x
        # Every graph convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, data.edge_index))
        # Average the node embeddings within each graph, then apply the linear head.
        pooled = global_mean_pool(hidden, data.batch)
        return torch.sigmoid(self.fc(pooled))

class BrainDataset(Dataset):
    """Per-patient graph dataset in which every patient shares one connectivity pattern.

    Args:
        patient_features: Indexable collection; item `idx` becomes the node
            feature matrix of graph `idx`.
        adjacency_matrix: Array whose non-zero entries define the edges. Only
            the positions are used; the values are not kept as edge weights.
        labels: Indexable collection of per-patient targets.
    """

    def __init__(self, patient_features, adjacency_matrix, labels):
        self.patient_features = patient_features
        self.adjacency_matrix = adjacency_matrix
        self.labels = labels
        # The edge list is the same for every patient, so derive it once here
        # instead of re-scanning the adjacency matrix on every item access.
        self._edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)

    def __len__(self):
        """Number of patients in the dataset."""
        return len(self.patient_features)

    def __getitem__(self, idx):
        """Build the graph for patient `idx`: float features, shared edges, float label."""
        x = torch.tensor(self.patient_features[idx], dtype=torch.float)
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return Data(x=x, edge_index=self._edge_index, y=y)

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for train_index, test_index in kf.split(mri_labels):
    # Slice this fold's patients out of the feature tensor and the label vector.
    mri_train_features = mri_patient_features[train_index]
    mri_test_features = mri_patient_features[test_index]
    mri_train_labels = mri_labels[train_index]
    mri_test_labels = mri_labels[test_index]

    # Only the training loader reshuffles; evaluation keeps the fold order.
    mri_train_dataset = BrainDataset(mri_train_features, mri_adj_matrix, mri_train_labels)
    mri_test_dataset = BrainDataset(mri_test_features, mri_adj_matrix, mri_test_labels)
    mri_train_loader = DataLoader(mri_train_dataset, batch_size=20, shuffle=True)
    mri_test_loader = DataLoader(mri_test_dataset, batch_size=20, shuffle=False)

    # A fresh model, optimizer and loss per fold, so nothing carries over between folds.
    model = GNN(in_channels=2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()

    # Training: 20 passes over the fold's training split.
    model.train()
    for epoch in range(20):
        for data in mri_train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            # Labels are reshaped to a column to line up with the model output.
            loss = criterion(model(data), data.y.view(-1, 1))
            loss.backward()
            optimizer.step()

    # Evaluation on the held-out split, without gradient tracking.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data in mri_test_loader:
            data = data.to(device)
            out = model(data)
            # Threshold the sigmoid scores at 0.5 to obtain hard 0/1 predictions.
            preds = (out > 0.5).float()
            all_preds += list(preds.cpu().numpy())
            all_labels += list(data.y.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    accuracies.append(accuracy)
    print(f'Fold Test Accuracy: {accuracy:.4f}')

# Print cross-validation results
mean_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
print(f'5-Fold Cross-Validation Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}')

## UDS + MRI - Determining Best Combination of Weights

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader, Dataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score

# Load and preprocess UDS dataset
uds_df = pd.read_csv('udslong_imputed.csv')
# Collapse the diagnosis code to a binary target: codes 2, 3 and 4 -> 1, anything else -> 0.
uds_df['NACCUDSD'] = uds_df['NACCUDSD'].apply(lambda x: 1 if x in [2, 3, 4] else 0)
uds_labels = uds_df['NACCUDSD'].values
# Neither the target nor the patient identifier may be used as a feature.
uds_features_df = uds_df.drop(columns=['NACCUDSD', 'NACCID'])

numeric_cols = uds_features_df.select_dtypes(include=[np.number]).columns
non_numeric_cols = uds_features_df.select_dtypes(exclude=[np.number]).columns

if len(non_numeric_cols) > 0:
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword, so try the current spelling first and fall back on older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded_features = encoder.fit_transform(uds_features_df[non_numeric_cols])
    # Numeric columns first, one-hot indicator columns appended after them.
    uds_features_combined = np.hstack((uds_features_df[numeric_cols].values, encoded_features))
else:
    uds_features_combined = uds_features_df[numeric_cols].values

scaler = StandardScaler()
scaled_uds_features = scaler.fit_transform(uds_features_combined)
# One node per feature column; the identity matrix gives each node only a self-connection.
uds_adj_matrix = np.eye(scaled_uds_features.shape[1])

# Load and preprocess MRI dataset
mriscanclean = pd.read_csv('mriscanclean.csv', index_col=0)
# The two feature groups are picked by column-name fragment.
vol_columns = [name for name in mriscanclean.columns if 'GVOL' in name]
thick_columns = [name for name in mriscanclean.columns if 'AVGTH' in name]
# Each group gets its own scaler and is standardised in place.
scaler_vol, scaler_thick = StandardScaler(), StandardScaler()
mriscanclean[vol_columns] = scaler_vol.fit_transform(mriscanclean[vol_columns])
mriscanclean[thick_columns] = scaler_thick.fit_transform(mriscanclean[thick_columns])

# Node-feature tensor laid out as (patient, node, feature).
num_patients = len(mriscanclean)
num_nodes = 62
num_features = 2
mri_patient_features = np.zeros((num_patients, num_nodes, num_features))
# Channel 0 holds the 'GVOL' columns, channel 1 the 'AVGTH' columns. The
# whole-column assignment still raises if a group does not have `num_nodes` columns.
mri_patient_features[:, :, 0] = mriscanclean[vol_columns].values
mri_patient_features[:, :, 1] = mriscanclean[thick_columns].values

# Binary target: NACCUDSD codes 2, 3 and 4 become 1, every other value 0.
mri_labels = mriscanclean['NACCUDSD'].map(lambda code: 1 if code in (2, 3, 4) else 0).values
mri_adj_matrix = pd.read_csv('combined_adjacency_matrix.csv', index_col=0, header=0).values

class GNN(torch.nn.Module):
    """Graph-level binary classifier: four GCN layers, mean pooling, one linear head.

    Args:
        in_channels: Number of input features carried by each node.
    """

    def __init__(self, in_channels):
        super().__init__()
        # Channel widths shrink layer by layer: in_channels -> 64 -> 32 -> 16 -> 8.
        self.conv1 = GCNConv(in_channels, 64)
        self.conv2 = GCNConv(64, 32)
        self.conv3 = GCNConv(32, 16)
        self.conv4 = GCNConv(16, 8)
        # Maps the pooled 8-dimensional graph embedding to a single output unit.
        self.fc = torch.nn.Linear(8, 1)

    def forward(self, data):
        """Return one sigmoid-activated score per graph in `data`.

        `data` must expose `x`, `edge_index` and `batch`, which are read below.
        """
        hidden = data.x
        # Every graph convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, data.edge_index))
        # Average the node embeddings within each graph, then apply the linear head.
        pooled = global_mean_pool(hidden, data.batch)
        return torch.sigmoid(self.fc(pooled))

class BrainDataset(Dataset):
    """Per-patient graph dataset in which every patient shares one connectivity pattern.

    Args:
        patient_features: Indexable collection; item `idx` becomes the node
            feature matrix of graph `idx`.
        adjacency_matrix: Array whose non-zero entries define the edges. Only
            the positions are used; the values are not kept as edge weights.
        labels: Indexable collection of per-patient targets.
    """

    def __init__(self, patient_features, adjacency_matrix, labels):
        self.patient_features = patient_features
        self.adjacency_matrix = adjacency_matrix
        self.labels = labels
        # The edge list is the same for every patient, so derive it once here
        # instead of re-scanning the adjacency matrix on every item access.
        self._edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)

    def __len__(self):
        """Number of patients in the dataset."""
        return len(self.patient_features)

    def __getitem__(self, idx):
        """Build the graph for patient `idx`: float features, shared edges, float label."""
        x = torch.tensor(self.patient_features[idx], dtype=torch.float)
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return Data(x=x, edge_index=self._edge_index, y=y)

def initialize_model(in_channels):
    """Create a fresh GNN with its Adam optimizer (lr=0.01) and a BCE loss.

    Args:
        in_channels: Number of input features per node, forwarded to `GNN`.

    Returns:
        Tuple `(model, optimizer, criterion)`.
    """
    net = GNN(in_channels=in_channels)
    adam = torch.optim.Adam(net.parameters(), lr=0.01)
    return net, adam, torch.nn.BCELoss()

def train(model, optimizer, criterion, loader):
    """Run one optimisation pass (one epoch) of `model` over `loader`.

    Args:
        model: Module called as `model(batch)`; put into training mode here.
        optimizer: Optimizer that owns the model's parameters.
        criterion: Loss called as `criterion(output, target)`.
        loader: Iterable of batches exposing `.to(device)` and `.y`.
    """
    model.train()
    # Batches come off the loader wherever it created them, so move each one
    # to the device holding the model's weights; otherwise a model on the GPU
    # fails on its first forward pass. A parameter-free model is left alone.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    for data in loader:
        if device is not None:
            data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        # Targets are reshaped to a column before being compared with the output.
        loss = criterion(out, data.y.view(-1, 1))
        loss.backward()
        optimizer.step()

def test(model, loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for data in loader:
            out = model(data)
            preds.extend(out.cpu().numpy())
    return np.array(preds).flatten()

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_weights = [(i / 10, 1 - i / 10) for i in range(1, 10)]
best_combined_accuracy = 0
best_weights = (0, 0)

# Both tables are sliced with the same fold indices and their predictions are
# averaged row by row, so they must at least contain the same number of rows.
if len(uds_labels) != len(mri_labels):
    raise ValueError(
        f'UDS ({len(uds_labels)}) and MRI ({len(mri_labels)}) row counts differ; '
        'the two tables must be aligned patient-for-patient before ensembling.'
    )

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The UDS edge list is identical for every patient: build it once instead of
# re-scanning the adjacency matrix for each sample.
uds_edge_index = torch.tensor(np.array(np.nonzero(uds_adj_matrix)), dtype=torch.long)

# Per-fold accuracies, keyed by (uds_weight, mri_weight).
fold_accuracies = {weights: [] for weights in all_weights}

# Training does not depend on the ensemble weights, so each fold's two models
# are trained once and every weight pair is scored on the same predictions.
# This avoids retraining for each pair and compares all pairs on identical models.
for train_index, test_index in kf.split(uds_labels):
    # Prepare UDS data: each feature column becomes a node with one scalar feature.
    uds_train_features, uds_test_features = scaled_uds_features[train_index], scaled_uds_features[test_index]
    uds_train_labels, uds_test_labels = uds_labels[train_index], uds_labels[test_index]
    uds_train_data_list = [Data(x=torch.tensor(features.reshape(-1, 1), dtype=torch.float),
                                edge_index=uds_edge_index,
                                y=torch.tensor([label], dtype=torch.float))
                           for features, label in zip(uds_train_features, uds_train_labels)]
    uds_test_data_list = [Data(x=torch.tensor(features.reshape(-1, 1), dtype=torch.float),
                               edge_index=uds_edge_index,
                               y=torch.tensor([label], dtype=torch.float))
                          for features, label in zip(uds_test_features, uds_test_labels)]
    uds_train_loader = DataLoader(uds_train_data_list, batch_size=20, shuffle=True)
    uds_test_loader = DataLoader(uds_test_data_list, batch_size=20, shuffle=False)

    # Prepare MRI data
    mri_train_features, mri_test_features = mri_patient_features[train_index], mri_patient_features[test_index]
    mri_train_labels, mri_test_labels = mri_labels[train_index], mri_labels[test_index]
    mri_train_dataset = BrainDataset(mri_train_features, mri_adj_matrix, mri_train_labels)
    mri_test_dataset = BrainDataset(mri_test_features, mri_adj_matrix, mri_test_labels)
    mri_train_loader = DataLoader(mri_train_dataset, batch_size=20, shuffle=True)
    mri_test_loader = DataLoader(mri_test_dataset, batch_size=20, shuffle=False)

    # Initialize models
    uds_model, uds_optimizer, uds_criterion = initialize_model(1)
    mri_model, mri_optimizer, mri_criterion = initialize_model(2)
    uds_model.to(device)
    mri_model.to(device)

    # Train UDS model
    for epoch in range(10):
        train(uds_model, uds_optimizer, uds_criterion, uds_train_loader)

    # Train MRI model
    for epoch in range(20):
        train(mri_model, mri_optimizer, mri_criterion, mri_train_loader)

    # Get predictions for UDS and MRI
    uds_preds = test(uds_model, uds_test_loader)
    mri_preds = test(mri_model, mri_test_loader)

    # Score every weight pair on this fold's predictions.
    for uds_weight, mri_weight in all_weights:
        combined_preds = (uds_preds * uds_weight) + (mri_preds * mri_weight)
        combined_preds_binary = (combined_preds > 0.5).astype(int)
        combined_accuracy = accuracy_score(uds_test_labels, combined_preds_binary)
        fold_accuracies[(uds_weight, mri_weight)].append(combined_accuracy)

# Summarise each weight pair across folds and keep the best mean accuracy.
# NOTE(review): the weights are picked on the same folds that produce the
# reported accuracy, so the "best" figure is an optimistic estimate.
for uds_weight, mri_weight in all_weights:
    combined_accuracies = fold_accuracies[(uds_weight, mri_weight)]
    mean_combined_accuracy = np.mean(combined_accuracies)
    std_combined_accuracy = np.std(combined_accuracies)
    print(f'Weights: UDS {uds_weight:.1f}, MRI {mri_weight:.1f} - Combined 5-Fold Accuracy: {mean_combined_accuracy:.4f} ± {std_combined_accuracy:.4f}')

    if mean_combined_accuracy > best_combined_accuracy:
        best_combined_accuracy = mean_combined_accuracy
        best_weights = (uds_weight, mri_weight)

print(f'Best Weights: UDS {best_weights[0]:.1f}, MRI {best_weights[1]:.1f} - Best Combined 5-Fold Accuracy: {best_combined_accuracy:.4f}')

## UDS + MRI Structured Graph Neural Network

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load the datasets
uds_df = pd.read_csv('udslong_imputed.csv')
mriscanclean = pd.read_csv('mriscanclean.csv', index_col=0)

# Preprocess the UDS dataset
# Collapse the diagnosis code to a binary target: codes 2, 3 and 4 -> 1, anything else -> 0.
uds_df['NACCUDSD'] = uds_df['NACCUDSD'].apply(lambda x: 1 if x in [2, 3, 4] else 0)
uds_labels = uds_df['NACCUDSD'].values
# Remove the target and the patient identifier from the features. Left in,
# NACCID would be carried into the graph as an extra node with no signal;
# the weight-search cell drops it as well.
uds_features = uds_df.drop(columns=['NACCUDSD']).drop(columns=['NACCID'], errors='ignore')
# Values that cannot be parsed as numbers become NaN and are then replaced by 0.
uds_features = uds_features.apply(pd.to_numeric, errors='coerce').fillna(0)
scaler_uds = StandardScaler()
scaled_uds_features = scaler_uds.fit_transform(uds_features.values)
# One node per feature column; the identity matrix gives each node only a self-connection.
uds_adj_matrix = np.eye(scaled_uds_features.shape[1])

# Preprocess the MRI dataset
vol_columns = [col for col in mriscanclean.columns if 'GVOL' in col]
thick_columns = [col for col in mriscanclean.columns if 'AVGTH' in col]
scaler_vol = StandardScaler()
scaler_thick = StandardScaler()
mriscanclean[vol_columns] = scaler_vol.fit_transform(mriscanclean[vol_columns])
mriscanclean[thick_columns] = scaler_thick.fit_transform(mriscanclean[thick_columns])
num_patients = mriscanclean.shape[0]
num_nodes = 62
num_features = 2
mri_patient_features = np.zeros((num_patients, num_nodes, num_features))
# Fill both channels for all patients at once instead of row by row; the
# assignment still raises if a column group does not have `num_nodes` columns.
mri_patient_features[:, :, 0] = mriscanclean[vol_columns].values
mri_patient_features[:, :, 1] = mriscanclean[thick_columns].values
mri_labels = mriscanclean['NACCUDSD'].apply(lambda x: 1 if x in [2, 3, 4] else 0).values
mri_adj_matrix = pd.read_csv('combined_adjacency_matrix.csv', index_col=0, header=0).values

class GNN(torch.nn.Module):
    """Graph-level binary classifier: four GCN layers, mean pooling, one linear head.

    Args:
        in_channels: Number of input features carried by each node.
    """

    def __init__(self, in_channels):
        super().__init__()
        # Channel widths shrink layer by layer: in_channels -> 64 -> 32 -> 16 -> 8.
        self.conv1 = GCNConv(in_channels, 64)
        self.conv2 = GCNConv(64, 32)
        self.conv3 = GCNConv(32, 16)
        self.conv4 = GCNConv(16, 8)
        # Maps the pooled 8-dimensional graph embedding to a single output unit.
        self.fc = torch.nn.Linear(8, 1)

    def forward(self, data):
        """Return one sigmoid-activated score per graph in `data`.

        `data` must expose `x`, `edge_index` and `batch`, which are read below.
        """
        hidden = data.x
        # Every graph convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, data.edge_index))
        # Average the node embeddings within each graph, then apply the linear head.
        pooled = global_mean_pool(hidden, data.batch)
        return torch.sigmoid(self.fc(pooled))

class BrainDataset(Dataset):
    """Per-patient graph dataset in which every patient shares one connectivity pattern.

    Args:
        patient_features: Indexable collection; item `idx` becomes the node
            feature matrix of graph `idx`.
        adjacency_matrix: Array whose non-zero entries define the edges. Only
            the positions are used; the values are not kept as edge weights.
        labels: Indexable collection of per-patient targets.
    """

    def __init__(self, patient_features, adjacency_matrix, labels):
        self.patient_features = patient_features
        self.adjacency_matrix = adjacency_matrix
        self.labels = labels
        # The edge list is the same for every patient, so derive it once here
        # instead of re-scanning the adjacency matrix on every item access.
        self._edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)

    def __len__(self):
        """Number of patients in the dataset."""
        return len(self.patient_features)

    def __getitem__(self, idx):
        """Build the graph for patient `idx`: float features, shared edges, float label."""
        x = torch.tensor(self.patient_features[idx], dtype=torch.float)
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return Data(x=x, edge_index=self._edge_index, y=y)

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
combined_accuracies = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both tables are sliced with the same fold indices and their predictions are
# averaged row by row, so they must at least contain the same number of rows.
if len(uds_labels) != len(mri_labels):
    raise ValueError(
        f'UDS ({len(uds_labels)}) and MRI ({len(mri_labels)}) row counts differ; '
        'the two tables must be aligned patient-for-patient before ensembling.'
    )

# The UDS edge list is identical for every patient: build it once instead of
# re-scanning the adjacency matrix for each sample in each fold.
uds_edge_index = torch.tensor(np.array(np.nonzero(uds_adj_matrix)), dtype=torch.long)

for train_index, test_index in kf.split(uds_labels):
    # Prepare UDS data: each feature column becomes a node with one scalar feature.
    uds_train_features, uds_test_features = scaled_uds_features[train_index], scaled_uds_features[test_index]
    uds_train_labels, uds_test_labels = uds_labels[train_index], uds_labels[test_index]
    uds_train_data_list = [Data(x=torch.tensor(features.reshape(-1, 1), dtype=torch.float),
                                edge_index=uds_edge_index,
                                y=torch.tensor([label], dtype=torch.float))
                           for features, label in zip(uds_train_features, uds_train_labels)]
    uds_test_data_list = [Data(x=torch.tensor(features.reshape(-1, 1), dtype=torch.float),
                               edge_index=uds_edge_index,
                               y=torch.tensor([label], dtype=torch.float))
                          for features, label in zip(uds_test_features, uds_test_labels)]
    uds_train_loader = DataLoader(uds_train_data_list, batch_size=20, shuffle=True)
    uds_test_loader = DataLoader(uds_test_data_list, batch_size=20, shuffle=False)

    # Prepare MRI data
    mri_train_features, mri_test_features = mri_patient_features[train_index], mri_patient_features[test_index]
    mri_train_labels, mri_test_labels = mri_labels[train_index], mri_labels[test_index]
    mri_train_dataset = BrainDataset(mri_train_features, mri_adj_matrix, mri_train_labels)
    mri_test_dataset = BrainDataset(mri_test_features, mri_adj_matrix, mri_test_labels)
    mri_train_loader = DataLoader(mri_train_dataset, batch_size=20, shuffle=True)
    mri_test_loader = DataLoader(mri_test_dataset, batch_size=20, shuffle=False)

    # Train UDS GNN (20 epochs, one scalar feature per node)
    uds_model = GNN(in_channels=1).to(device)
    optimizer = torch.optim.Adam(uds_model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()
    uds_model.train()
    for epoch in range(20):
        for data in uds_train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = uds_model(data)
            loss = criterion(out, data.y.view(-1, 1))
            loss.backward()
            optimizer.step()

    # Train MRI GNN (10 epochs, two features per node); `optimizer` and
    # `criterion` are rebound to the MRI model from here on.
    mri_model = GNN(in_channels=2).to(device)
    optimizer = torch.optim.Adam(mri_model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()
    mri_model.train()
    for epoch in range(10):
        for data in mri_train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = mri_model(data)
            loss = criterion(out, data.y.view(-1, 1))
            loss.backward()
            optimizer.step()

    # Get predictions for UDS and MRI
    uds_model.eval()
    mri_model.eval()
    uds_preds = []
    mri_preds = []
    with torch.no_grad():
        for data in uds_test_loader:
            data = data.to(device)
            out = uds_model(data)
            uds_preds.extend(out.cpu().numpy())

        for data in mri_test_loader:
            data = data.to(device)
            out = mri_model(data)
            mri_preds.extend(out.cpu().numpy())

    uds_preds = np.array(uds_preds).flatten()
    mri_preds = np.array(mri_preds).flatten()

    # Weigh UDS predictions higher
    combined_preds = (uds_preds * 0.7) + (mri_preds * 0.3)
    combined_preds_binary = (combined_preds > 0.5).astype(int)
    # The ensemble is scored against the UDS table's labels.
    combined_accuracy = accuracy_score(uds_test_labels, combined_preds_binary)
    combined_accuracies.append(combined_accuracy)
    print(f'Fold Test Accuracy: {combined_accuracy:.4f}')

# Print combined 5-fold cross-validation accuracy
mean_combined_accuracy = np.mean(combined_accuracies)
std_combined_accuracy = np.std(combined_accuracies)
print(f'Combined 5-Fold Cross-Validation Accuracy: {mean_combined_accuracy:.4f} ± {std_combined_accuracy:.4f}')

## UDS + MRI Unstructured Graph Neural Network

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load the datasets
uds_df = pd.read_csv('udslong_imputed.csv')
mriscanclean = pd.read_csv('mriscanclean.csv', index_col=0)

# Preprocess the UDS dataset
# Collapse the diagnosis code to a binary target: codes 2, 3 and 4 -> 1, anything else -> 0.
uds_df['NACCUDSD'] = uds_df['NACCUDSD'].apply(lambda x: 1 if x in [2, 3, 4] else 0)
uds_labels = uds_df['NACCUDSD'].values
# Remove the target and the patient identifier from the features. Left in,
# NACCID would be carried into the graph as an extra node with no signal;
# the weight-search cell drops it as well.
uds_features = uds_df.drop(columns=['NACCUDSD']).drop(columns=['NACCID'], errors='ignore')
# Values that cannot be parsed as numbers become NaN and are then replaced by 0.
uds_features = uds_features.apply(pd.to_numeric, errors='coerce').fillna(0)
scaler_uds = StandardScaler()
scaled_uds_features = scaler_uds.fit_transform(uds_features.values)
# One node per feature column; the identity matrix gives each node only a self-connection.
uds_adj_matrix = np.eye(scaled_uds_features.shape[1])

# Preprocess the MRI dataset
vol_columns = [col for col in mriscanclean.columns if 'GVOL' in col]
thick_columns = [col for col in mriscanclean.columns if 'AVGTH' in col]
scaler_vol = StandardScaler()
scaler_thick = StandardScaler()
mriscanclean[vol_columns] = scaler_vol.fit_transform(mriscanclean[vol_columns])
mriscanclean[thick_columns] = scaler_thick.fit_transform(mriscanclean[thick_columns])
num_patients = mriscanclean.shape[0]
num_nodes = 62
num_features = 2
mri_patient_features = np.zeros((num_patients, num_nodes, num_features))
# Fill both channels for all patients at once instead of row by row; the
# assignment still raises if a column group does not have `num_nodes` columns.
mri_patient_features[:, :, 0] = mriscanclean[vol_columns].values
mri_patient_features[:, :, 1] = mriscanclean[thick_columns].values
mri_labels = mriscanclean['NACCUDSD'].apply(lambda x: 1 if x in [2, 3, 4] else 0).values
# Unstructured variant: identity adjacency (self-connections only), matching
# the MRI-only unstructured cell. This cell previously loaded
# 'combined_adjacency_matrix.csv', which made it the structured setup again.
# NOTE(review): confirm this is the intended baseline before comparing results.
mri_adj_matrix = np.eye(num_nodes)

class GNN(torch.nn.Module):
    """Graph-level binary classifier: four GCN layers, mean pooling, one linear head.

    Args:
        in_channels: Number of input features carried by each node.
    """

    def __init__(self, in_channels):
        super().__init__()
        # Channel widths shrink layer by layer: in_channels -> 64 -> 32 -> 16 -> 8.
        self.conv1 = GCNConv(in_channels, 64)
        self.conv2 = GCNConv(64, 32)
        self.conv3 = GCNConv(32, 16)
        self.conv4 = GCNConv(16, 8)
        # Maps the pooled 8-dimensional graph embedding to a single output unit.
        self.fc = torch.nn.Linear(8, 1)

    def forward(self, data):
        """Return one sigmoid-activated score per graph in `data`.

        `data` must expose `x`, `edge_index` and `batch`, which are read below.
        """
        hidden = data.x
        # Every graph convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = F.relu(conv(hidden, data.edge_index))
        # Average the node embeddings within each graph, then apply the linear head.
        pooled = global_mean_pool(hidden, data.batch)
        return torch.sigmoid(self.fc(pooled))

class BrainDataset(Dataset):
    """Per-patient graph dataset in which every patient shares one connectivity pattern.

    Args:
        patient_features: Indexable collection; item `idx` becomes the node
            feature matrix of graph `idx`.
        adjacency_matrix: Array whose non-zero entries define the edges. Only
            the positions are used; the values are not kept as edge weights.
        labels: Indexable collection of per-patient targets.
    """

    def __init__(self, patient_features, adjacency_matrix, labels):
        self.patient_features = patient_features
        self.adjacency_matrix = adjacency_matrix
        self.labels = labels
        # The edge list is the same for every patient, so derive it once here
        # instead of re-scanning the adjacency matrix on every item access.
        self._edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)

    def __len__(self):
        """Number of patients in the dataset."""
        return len(self.patient_features)

    def __getitem__(self, idx):
        """Build the graph for patient `idx`: float features, shared edges, float label."""
        x = torch.tensor(self.patient_features[idx], dtype=torch.float)
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return Data(x=x, edge_index=self._edge_index, y=y)

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
combined_accuracies = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both tables are sliced with the same fold indices and their predictions are
# averaged row by row, so they must at least contain the same number of rows.
if len(uds_labels) != len(mri_labels):
    raise ValueError(
        f'UDS ({len(uds_labels)}) and MRI ({len(mri_labels)}) row counts differ; '
        'the two tables must be aligned patient-for-patient before ensembling.'
    )

# The UDS edge list is identical for every patient: build it once instead of
# re-scanning the adjacency matrix for each sample in each fold.
uds_edge_index = torch.tensor(np.array(np.nonzero(uds_adj_matrix)), dtype=torch.long)

for train_index, test_index in kf.split(uds_labels):
    # Prepare UDS data: each feature column becomes a node with one scalar feature.
    uds_train_features, uds_test_features = scaled_uds_features[train_index], scaled_uds_features[test_index]
    uds_train_labels, uds_test_labels = uds_labels[train_index], uds_labels[test_index]
    uds_train_data_list = [Data(x=torch.tensor(features.reshape(-1, 1), dtype=torch.float),
                                edge_index=uds_edge_index,
                                y=torch.tensor([label], dtype=torch.float))
                           for features, label in zip(uds_train_features, uds_train_labels)]
    uds_test_data_list = [Data(x=torch.tensor(features.reshape(-1, 1), dtype=torch.float),
                               edge_index=uds_edge_index,
                               y=torch.tensor([label], dtype=torch.float))
                          for features, label in zip(uds_test_features, uds_test_labels)]
    uds_train_loader = DataLoader(uds_train_data_list, batch_size=20, shuffle=True)
    uds_test_loader = DataLoader(uds_test_data_list, batch_size=20, shuffle=False)

    # Prepare MRI data
    mri_train_features, mri_test_features = mri_patient_features[train_index], mri_patient_features[test_index]
    mri_train_labels, mri_test_labels = mri_labels[train_index], mri_labels[test_index]
    mri_train_dataset = BrainDataset(mri_train_features, mri_adj_matrix, mri_train_labels)
    mri_test_dataset = BrainDataset(mri_test_features, mri_adj_matrix, mri_test_labels)
    mri_train_loader = DataLoader(mri_train_dataset, batch_size=20, shuffle=True)
    mri_test_loader = DataLoader(mri_test_dataset, batch_size=20, shuffle=False)

    # Train UDS GNN (10 epochs, one scalar feature per node)
    uds_model = GNN(in_channels=1).to(device)
    optimizer = torch.optim.Adam(uds_model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()
    uds_model.train()
    for epoch in range(10):
        for data in uds_train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = uds_model(data)
            loss = criterion(out, data.y.view(-1, 1))
            loss.backward()
            optimizer.step()

    # Train MRI GNN (20 epochs, two features per node); `optimizer` and
    # `criterion` are rebound to the MRI model from here on.
    mri_model = GNN(in_channels=2).to(device)
    optimizer = torch.optim.Adam(mri_model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()
    mri_model.train()
    for epoch in range(20):
        for data in mri_train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = mri_model(data)
            loss = criterion(out, data.y.view(-1, 1))
            loss.backward()
            optimizer.step()

    # Get predictions for UDS and MRI
    uds_model.eval()
    mri_model.eval()
    uds_preds = []
    mri_preds = []
    with torch.no_grad():
        for data in uds_test_loader:
            data = data.to(device)
            out = uds_model(data)
            uds_preds.extend(out.cpu().numpy())

        for data in mri_test_loader:
            data = data.to(device)
            out = mri_model(data)
            mri_preds.extend(out.cpu().numpy())

    uds_preds = np.array(uds_preds).flatten()
    mri_preds = np.array(mri_preds).flatten()

    # Weigh UDS predictions higher
    combined_preds = (uds_preds * 0.7) + (mri_preds * 0.3)
    combined_preds_binary = (combined_preds > 0.5).astype(int)
    # The ensemble is scored against the UDS table's labels.
    combined_accuracy = accuracy_score(uds_test_labels, combined_preds_binary)
    combined_accuracies.append(combined_accuracy)
    print(f'Fold Test Accuracy: {combined_accuracy:.4f}')

# Print combined 5-fold cross-validation accuracy
mean_combined_accuracy = np.mean(combined_accuracies)
std_combined_accuracy = np.std(combined_accuracies)
print(f'Combined 5-Fold Cross-Validation Accuracy: {mean_combined_accuracy:.4f} ± {std_combined_accuracy:.4f}')