In [None]:
# Mount Google Drive at /content/drive; the paths used by the later cells all
# live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install nilearn


Collecting nilearn
  Downloading nilearn-0.12.0-py3-none-any.whl.metadata (9.9 kB)
Downloading nilearn-0.12.0-py3-none-any.whl (10.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.6/10.6 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nilearn
Successfully installed nilearn-0.12.0


In [None]:
import sys
import os
# Make modules stored in the Drive root importable from this notebook.
sys.path.append('/content/drive/MyDrive/')

# Root folder for everything this notebook downloads or generates.
main_path = '/content/drive/MyDrive/ABIDE'
# exist_ok replaces the separate exists() check and keeps the cell re-runnable.
os.makedirs(main_path, exist_ok=True)


In [None]:
# download subject_IDs.txt
phenotypic_path = main_path + '/phenotypic_image_quality'
if not os.path.exists(phenotypic_path):
  os.makedirs(phenotypic_path)
!gdown 1-71vU67genAR0h6QwJdBzkeAfnOSfqHE --output /content/drive/MyDrive/ABIDE/phenotypic_image_quality/



Downloading...
From: https://drive.google.com/uc?id=1-71vU67genAR0h6QwJdBzkeAfnOSfqHE
To: /content/drive/MyDrive/ABIDE/phenotypic_image_quality/subject_IDs.txt
  0% 0.00/5.23k [00:00<?, ?B/s]100% 5.23k/5.23k [00:00<00:00, 17.0MB/s]


In [None]:
# Prepare AAL data

from nilearn import datasets
from nilearn import connectome
import scipy.io as sio
import numpy as np
import shutil

def get_ids(data_folder,num_subjects=None):
    """
    Read the subject IDs stored in ``subject_IDs.txt`` inside ``data_folder``.

        data_folder  : directory that contains subject_IDs.txt
        num_subjects : if given, keep only the first ``num_subjects`` IDs

    return:
        subject_IDs    : 1-D numpy array of subject IDs (strings)
    """
    subject_IDs = np.genfromtxt(os.path.join(data_folder, 'subject_IDs.txt'), dtype=str)
    # genfromtxt collapses a file with a single entry to a 0-d array, which can
    # be neither sliced nor iterated; force a 1-D array in every case.
    subject_IDs = np.atleast_1d(subject_IDs)
    if num_subjects is not None:
        subject_IDs = subject_IDs[:num_subjects]
    return subject_IDs

def fetch_filenames(data_folder,subject_IDs, file_type):
    """
    Find, for every subject, the file of the requested derivative in data_folder.

        data_folder  : folder that directly contains the downloaded files
        subject_IDs  : list of short subject IDs in string format
        file_type    : derivative name, e.g. 'rois_aal', 'rois_cc200', 'func_preproc'
    returns:
        filenames    : list of file names relative to data_folder (same length
                       as subject_IDs); 'N/A' where no file matched

    Side effect: the working directory is changed to data_folder. This is kept
    because existing callers use the returned relative names afterwards.
    """
    import glob
    # Build the suffix from the derivative name instead of a fixed table, so any
    # atlas works. As in the original table, 'rois_*' derivatives end in .1D and
    # 'func_preproc' ends in .nii.gz; other names are presumed to be NIfTI too.
    extension = '.1D' if file_type.startswith('rois') else '.nii.gz'
    suffix = '_' + file_type + extension

    # Changing directory once is enough; it does not depend on the subject.
    os.chdir(data_folder)

    filenames = []
    for subject in subject_IDs:
        matches = glob.glob('*' + subject + suffix)
        # 'N/A' marks a subject without a matching file
        filenames.append(matches[0] if matches else 'N/A')
    return filenames

def get_timeseries(data_folder,subject_list, atlas_name):
    """
    Load the ROI time series file of every subject from its own sub-folder.

        data_folder  : folder holding one sub-folder per subject ID
        subject_list : list of short subject IDs in string format
        atlas_name   : atlas suffix of the files to read, e.g. aal, cc200

    returns:
        time_series  : list with one array per subject, as produced by np.loadtxt
    """
    wanted_suffix = '_rois_' + atlas_name + '.1D'
    loaded = []
    for subject in subject_list:
        folder = os.path.join(data_folder, subject)
        candidates = [name for name in os.listdir(folder) if name.endswith(wanted_suffix)]
        # Only the first matching file of the folder is used
        first_match = candidates[0]
        print(first_match)
        path = os.path.join(folder, first_match)
        print("Reading timeseries file %s" % path)
        loaded.append(np.loadtxt(path, skiprows=0))
    return loaded

def subject_connectivity(timeseries, subject, atlas_name, kind, save_path, save=True):
    """
    Estimate the connectivity matrix of one subject and optionally save it.

        timeseries   : timeseries table for subject (timepoints x regions)
        subject      : the subject ID; also used as the name of the saved file
        atlas_name   : name of the parcellation atlas used (not read by this function)
        kind         : the kind of connectivity to be used; one of
                       'tangent', 'partial correlation', 'correlation'
        save_path    : folder in which <subject>.mat is written
        save         : save the connectivity matrix to a file

    returns:
        connectivity : connectivity matrix (regions x regions)

    raises:
        ValueError   : if kind is not one of the supported measures
    """
    supported_kinds = ('tangent', 'partial correlation', 'correlation')
    # Reject unknown kinds up front; otherwise `connectivity` would never be
    # assigned and the function would fail later with an UnboundLocalError.
    if kind not in supported_kinds:
        raise ValueError("Unsupported connectivity kind %r; expected one of %s"
                         % (kind, ', '.join(supported_kinds)))

    print("Estimating %s matrix for subject %s" % (kind, subject))

    conn_measure = connectome.ConnectivityMeasure(kind=kind)
    # fit_transform works on a list of subjects; wrap the single subject and unwrap the result
    connectivity = conn_measure.fit_transform([timeseries])[0]

    if save:
        subject_file = os.path.join(save_path, subject + '.mat')
        sio.savemat(subject_file, {'connectivity': connectivity})

    return connectivity



AAL_path = main_path + '/AAL/original/'
data_folder = AAL_path + 'ABIDE_pcp/cpac/filt_noglobal'
os.makedirs(AAL_path, exist_ok=True)
pipeline = 'cpac'
num_subjects = 871  # Number of subjects
files = ['rois_aal']
filemapping = {'func_preproc': 'func_preproc.nii.gz',
               'rois_aal': 'rois_aal.1D'}

# Fetch the requested derivative with AAL_path as nilearn's data directory
abide = datasets.fetch_abide_pcp(data_dir=AAL_path, n_subjects=num_subjects, pipeline=pipeline,
                 band_pass_filtering=True, global_signal_regression=False, derivatives=files)


subject_IDs = get_ids(phenotypic_path,num_subjects)
subject_IDs = subject_IDs.tolist()

# Sort the downloaded files into one sub-folder per subject
for s, fname in zip(subject_IDs, fetch_filenames(data_folder, subject_IDs, files[0])):
    subject_folder = os.path.join(data_folder, s)
    if not os.path.exists(subject_folder):
        os.mkdir(subject_folder)

    # fetch_filenames reports 'N/A' when nothing matched directly inside
    # data_folder (for instance because an earlier run of this cell already
    # moved the file). There is nothing to move then; trying anyway would
    # build a bogus 'N/A...' path and crash.
    if fname == 'N/A':
        continue

    # Get the base filename for each subject
    base = fname.split(files[0])[0]

    # Move each subject file to the subject folder
    for fl in files:
        if not os.path.exists(os.path.join(subject_folder, base + filemapping[fl])):
            # Absolute source path: does not depend on the current working directory
            shutil.move(os.path.join(data_folder, base + filemapping[fl]), subject_folder)

time_series = get_timeseries(data_folder,subject_IDs, 'aal')

# Compute and save connectivity matrices
for i in range(len(subject_IDs)):
    subject_connectivity(time_series[i], subject_IDs[i], 'aal', 'correlation',AAL_path)


Pitt_0050003_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal/50003/Pitt_0050003_rois_aal.1D
Pitt_0050004_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal/50004/Pitt_0050004_rois_aal.1D
Pitt_0050005_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal/50005/Pitt_0050005_rois_aal.1D
Pitt_0050006_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal/50006/Pitt_0050006_rois_aal.1D
Pitt_0050007_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal/50007/Pitt_0050007_rois_aal.1D
Pitt_0050008_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal/50008/Pitt_0050008_rois_aal.1D
Pitt_0050010_rois_aal.1D
Reading timeseries file /content/drive/MyDrive/ABIDE/AAL/original/ABI

In [None]:
################### download labels, collection sites, age, gender, FIQS .mat files #################################
import csv


# Path of the phenotypic CSV inside the ABIDE_pcp folder under AAL_path
# (presumably placed there by the fetch_abide_pcp call of the AAL cell — verify).
phenotype = os.path.join(AAL_path, 'ABIDE_pcp/Phenotypic_V1_0b_preprocessed1.csv')

def get_subject_score(phenotype,subject_list, score):
    """
    Look up one phenotypic column for the given subjects.

        phenotype    : path to the phenotypic CSV file (needs a SUB_ID column)
        subject_list : subject IDs (strings) to look up
        score        : name of the CSV column to read

    returns:
        scores_dict  : dict mapping subject ID -> raw string value of `score`.
                       Keys follow the order of subject_list, so
                       list(scores_dict.values()) lines up with subject_list.
                       IDs that do not occur in the CSV are left out.
    """
    # Set membership is O(1) per row; the list scan was O(len(subject_list))
    wanted = set(subject_list)
    found = {}

    with open(phenotype, newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            if row['SUB_ID'] in wanted:
                found[row['SUB_ID']] = row[score]

    # Re-key in subject_list order rather than CSV row order: callers turn the
    # values into positional lists that are paired with subject_list.
    scores_dict = {sid: found[sid] for sid in subject_list if sid in found}
    return scores_dict

# One lookup per phenotypic column, in the same order as before
(ages_dist, genders_dist, labels_dist, sites_dist,
 FIQS_dist, NUM_dist, PEC_dist, RAT_dist) = [
    get_subject_score(phenotype, subject_IDs, score=column)
    for column in ('AGE_AT_SCAN', 'SEX', 'DX_GROUP', 'SITE_ID',
                   'FIQ', 'func_num_fd', 'func_perc_fd', 'qc_anat_rater_2')
]

# Positional lists of the raw (string) values
(ages, genders, labels, sites,
 FIQS_all, NUM_all, PEC_all, RAT_all) = [
    list(score_dict.values())
    for score_dict in (ages_dist, genders_dist, labels_dist, sites_dist,
                       FIQS_dist, NUM_dist, PEC_dist, RAT_dist)
]

# Output arrays: a 2 x N indicator matrix for the labels, N x 1 columns for the rest
labels_array = np.zeros((2,num_subjects))
NUM = np.zeros([num_subjects, 1], dtype=np.int64)
PEC = np.zeros([num_subjects, 1], dtype=np.float64)
RAT = np.zeros([num_subjects, 1], dtype=np.int64)
FIQS = np.zeros([num_subjects, 1], dtype=np.float64)

# Numeric codes for the rater verdicts; any other value leaves the entry at 0
rater_codes = {'OK': 1, 'maybe': 2, 'fail': 3}

for i in range(num_subjects):

    NUM[i] = NUM_all[i]
    PEC[i] = PEC_all[i]

    # An empty FIQ field is stored as the sentinel -9999
    FIQS[i] = -9999 if FIQS_all[i] == '' else FIQS_all[i]

    # Row 0 is set for DX_GROUP '1', row 1 for everything else
    labels_array[0 if labels[i] == '1' else 1, i] = 1

    if RAT_all[i] in rater_codes:
        RAT[i] = rater_codes[RAT_all[i]]

# Write every variable to its own .mat file inside phenotypic_path
for mat_name, mat_key, mat_value in (
        ('ages.mat', 'ages', ages),
        ('genders.mat', 'genders', genders),
        ('sites.mat', 'sites', sites),
        ('FIQS.mat', 'FIQS', FIQS),
        ('NUM.mat', 'NUM', NUM),
        ('PEC.mat', 'PEC', PEC),
        ('RAT.mat', 'RAT', RAT),
        ('ABIDE_label_871.mat', 'label', labels_array)):
    sio.savemat(os.path.join(phenotypic_path, mat_name), {mat_key: mat_value})


In [None]:
import pandas as pd

# Collect the extracted phenotypic values into one table, one row per subject
df = pd.DataFrame({
    'SUB_ID': subject_IDs,
    'AGE_AT_SCAN': ages,
    'SEX': genders,
    'DX_GROUP': labels,
    'SITE_ID': sites,
    'FIQ': FIQS.flatten(),
    'func_num_fd': NUM.flatten(),
    'func_perc_fd': PEC.flatten(),
    'qc_anat_rater_2': [r[0] for r in RAT.tolist()],
    'func_mean_fd': [0.0] * len(subject_IDs)  # placeholder if missing
})

# Save next to the other phenotypic outputs. A bare relative file name lands in
# the current working directory, which at this point is inside ABIDE_pcp
# (fetch_filenames changed into it) and is removed by the clean-up cell that
# follows, so the file would be lost.
df.to_csv(os.path.join(phenotypic_path, 'Phenotypic_V1_0b_reconstructed.csv'), index=False)


In [None]:
# Re-create an all-zero 2 x num_subjects label array; this overwrites the
# array filled in by the phenotype cell.
labels_array = np.zeros((2,num_subjects))
!pwd
# Change into the AAL output folder before deleting: the recorded output of the
# first pwd shows the working directory was inside ABIDE_pcp at this point.
os.chdir("/content/drive/MyDrive/ABIDE/AAL/original/")
!pwd
# Delete the ABIDE_pcp folder located in the new working directory.
!rm -rf ABIDE_pcp

/content/drive/MyDrive/ABIDE/AAL/original/ABIDE_pcp/cpac/filt_noglobal
/content/drive/MyDrive/ABIDE/AAL/original


In [None]:
# Prepare CC200 data

from nilearn import datasets
from nilearn import connectome
import scipy.io as sio
import numpy as np
import shutil

def get_ids(data_folder,num_subjects=None):
    """
    Read the subject IDs stored in ``subject_IDs.txt`` inside ``data_folder``.

        data_folder  : directory that contains subject_IDs.txt
        num_subjects : if given, keep only the first ``num_subjects`` IDs

    return:
        subject_IDs    : 1-D numpy array of subject IDs (strings)
    """
    subject_IDs = np.genfromtxt(os.path.join(data_folder, 'subject_IDs.txt'), dtype=str)
    # genfromtxt collapses a file with a single entry to a 0-d array, which can
    # be neither sliced nor iterated; force a 1-D array in every case.
    subject_IDs = np.atleast_1d(subject_IDs)
    if num_subjects is not None:
        subject_IDs = subject_IDs[:num_subjects]
    return subject_IDs

def fetch_filenames(data_folder,subject_IDs, file_type):
    """
    Find, for every subject, the file of the requested derivative in data_folder.

        data_folder  : folder that directly contains the downloaded files
        subject_IDs  : list of short subject IDs in string format
        file_type    : derivative name, e.g. 'rois_cc200', 'rois_aal', 'func_preproc'
    returns:
        filenames    : list of file names relative to data_folder (same length
                       as subject_IDs); 'N/A' where no file matched

    Side effect: the working directory is changed to data_folder. This is kept
    because existing callers use the returned relative names afterwards.
    """
    import glob
    # Build the suffix from the derivative name instead of a fixed table, so any
    # atlas works. As in the original table, 'rois_*' derivatives end in .1D and
    # 'func_preproc' ends in .nii.gz; other names are presumed to be NIfTI too.
    extension = '.1D' if file_type.startswith('rois') else '.nii.gz'
    suffix = '_' + file_type + extension

    # Changing directory once is enough; it does not depend on the subject.
    os.chdir(data_folder)

    filenames = []
    for subject in subject_IDs:
        matches = glob.glob('*' + subject + suffix)
        # 'N/A' marks a subject without a matching file
        filenames.append(matches[0] if matches else 'N/A')
    return filenames

def get_timeseries(data_folder,subject_list, atlas_name):
    """
    Load the ROI time series file of every subject from its own sub-folder.

        data_folder  : folder holding one sub-folder per subject ID
        subject_list : list of short subject IDs in string format
        atlas_name   : atlas suffix of the files to read, e.g. aal, cc200

    returns:
        time_series  : list with one array per subject, as produced by np.loadtxt
    """
    wanted_suffix = '_rois_' + atlas_name + '.1D'
    loaded = []
    for subject in subject_list:
        folder = os.path.join(data_folder, subject)
        candidates = [name for name in os.listdir(folder) if name.endswith(wanted_suffix)]
        # Only the first matching file of the folder is used
        first_match = candidates[0]
        print(first_match)
        path = os.path.join(folder, first_match)
        print("Reading timeseries file %s" % path)
        loaded.append(np.loadtxt(path, skiprows=0))
    return loaded

def subject_connectivity(timeseries, subject, atlas_name, kind, save_path, save=True):
    """
    Estimate the connectivity matrix of one subject and optionally save it.

        timeseries   : timeseries table for subject (timepoints x regions)
        subject      : the subject ID; also used as the name of the saved file
        atlas_name   : name of the parcellation atlas used (not read by this function)
        kind         : the kind of connectivity to be used; one of
                       'tangent', 'partial correlation', 'correlation'
        save_path    : folder in which <subject>.mat is written
        save         : save the connectivity matrix to a file

    returns:
        connectivity : connectivity matrix (regions x regions)

    raises:
        ValueError   : if kind is not one of the supported measures
    """
    supported_kinds = ('tangent', 'partial correlation', 'correlation')
    # Reject unknown kinds up front; otherwise `connectivity` would never be
    # assigned and the function would fail later with an UnboundLocalError.
    if kind not in supported_kinds:
        raise ValueError("Unsupported connectivity kind %r; expected one of %s"
                         % (kind, ', '.join(supported_kinds)))

    print("Estimating %s matrix for subject %s" % (kind, subject))

    conn_measure = connectome.ConnectivityMeasure(kind=kind)
    # fit_transform works on a list of subjects; wrap the single subject and unwrap the result
    connectivity = conn_measure.fit_transform([timeseries])[0]

    if save:
        subject_file = os.path.join(save_path, subject + '.mat')
        sio.savemat(subject_file, {'connectivity': connectivity})

    return connectivity



CC200_path = main_path + '/CC200/original/'
data_folder = CC200_path + 'ABIDE_pcp/cpac/filt_noglobal'
os.makedirs(CC200_path, exist_ok=True)
pipeline = 'cpac'
num_subjects = 871  # Number of subjects
files = ['rois_cc200']
filemapping = {'func_preproc': 'func_preproc.nii.gz',
               'rois_cc200': 'rois_cc200.1D'}

# Fetch the requested derivative with CC200_path as nilearn's data directory
abide = datasets.fetch_abide_pcp(data_dir=CC200_path, n_subjects=num_subjects, pipeline=pipeline,
                 band_pass_filtering=True, global_signal_regression=False, derivatives=files)


subject_IDs = get_ids(phenotypic_path,num_subjects)
subject_IDs = subject_IDs.tolist()

# Sort the downloaded files into one sub-folder per subject
for s, fname in zip(subject_IDs, fetch_filenames(data_folder, subject_IDs, files[0])):
    subject_folder = os.path.join(data_folder, s)
    if not os.path.exists(subject_folder):
        os.mkdir(subject_folder)

    # fetch_filenames reports 'N/A' when nothing matched directly inside
    # data_folder (for instance because an earlier run of this cell already
    # moved the file). There is nothing to move then; trying anyway would
    # build a bogus 'N/A...' path and crash.
    if fname == 'N/A':
        continue

    # Get the base filename for each subject
    base = fname.split(files[0])[0]

    # Move each subject file to the subject folder
    for fl in files:
        if not os.path.exists(os.path.join(subject_folder, base + filemapping[fl])):
            # Absolute source path: does not depend on the current working directory
            shutil.move(os.path.join(data_folder, base + filemapping[fl]), subject_folder)

time_series = get_timeseries(data_folder,subject_IDs, 'cc200')

# Compute and save connectivity matrices
for i in range(len(subject_IDs)):
    subject_connectivity(time_series[i], subject_IDs[i], 'cc200', 'correlation',CC200_path)

KeyboardInterrupt: 

In [None]:

# Keep only the subjects whose timeseries is a 2-D numpy array
valid_subjects, valid_time_series = [], []

for sid, ts in zip(subject_IDs, time_series):
    # Anything that is not a 2-D array is reported and left out
    if not isinstance(ts, np.ndarray) or ts.ndim != 2:
        print(f"❌ Skipping {sid}")
        continue
    valid_subjects.append(sid)
    valid_time_series.append(ts)


❌ Skipping 50038
❌ Skipping 50039
❌ Skipping 50040
❌ Skipping 50041
❌ Skipping 50042
❌ Skipping 50043
❌ Skipping 50044
❌ Skipping 50045
❌ Skipping 50046
❌ Skipping 50047
❌ Skipping 50048
❌ Skipping 50049
❌ Skipping 50050
❌ Skipping 50051
❌ Skipping 50052
❌ Skipping 50053
❌ Skipping 50054
❌ Skipping 50056
❌ Skipping 50057


In [None]:
# Positions in time_series whose entry is not a 2-D numpy array
bad_shapes = [
    position
    for position, series in enumerate(time_series)
    if not (isinstance(series, np.ndarray) and series.ndim == 2)
]
print(f"Invalid timeseries count: {len(bad_shapes)}")
print("Bad subjects:", [subject_IDs[position] for position in bad_shapes])


Invalid timeseries count: 19
Bad subjects: ['50038', '50039', '50040', '50041', '50042', '50043', '50044', '50045', '50046', '50047', '50048', '50049', '50050', '50051', '50052', '50053', '50054', '50056', '50057']


In [None]:

!pwd
# Change into the CC200 output folder before deleting: the recorded output of
# the first pwd shows the working directory was inside ABIDE_pcp at this point.
os.chdir("/content/drive/MyDrive/ABIDE/CC200/original/")
!pwd
# Delete the ABIDE_pcp folder located in the new working directory.
!rm -rf ABIDE_pcp

/content/drive/MyDrive/ABIDE/CC200/original/ABIDE_pcp/cpac/filt_noglobal
/content/drive/MyDrive/ABIDE/CC200/original


In [None]:
# download sMRI data
!gdown 1E18xuQ-BbspSuuFIhJ2AiI4RsPKwqTCy --output /content/drive/MyDrive/ABIDE/

Downloading...
From (original): https://drive.google.com/uc?id=1E18xuQ-BbspSuuFIhJ2AiI4RsPKwqTCy
From (redirected): https://drive.google.com/uc?id=1E18xuQ-BbspSuuFIhJ2AiI4RsPKwqTCy&confirm=t&uuid=af33b768-8cd9-4de5-8ec2-1969a3d79ddb
To: /content/drive/MyDrive/ABIDE/freesurfer_stats.zip
100% 39.6M/39.6M [00:00<00:00, 41.9MB/s]


In [None]:
# mkdir paths
# Resulting layout:
#   save_models/<feature set>/<with_phe|without_phe>/Max_voting/<model>

save_model_path = main_path + '/save_models'
os.makedirs(save_model_path, exist_ok=True)

foldernames = ['AAL','CC200','sMRI','CC200_sMRI']
modelnames = ['SVM','FCN','AUTO','GCN','EV_GCN']

for i in range(len(foldernames)):
  features_path = save_model_path + '/' + foldernames[i]
  with_phe_path = features_path + '/with_phe'
  without_phe_path = features_path + '/without_phe'
  with_phe_maxvoting_path = with_phe_path + '/Max_voting'
  without_phe_maxvoting_path = without_phe_path + '/Max_voting'
  # makedirs also creates the missing parents (features_path and the
  # with_phe / without_phe folders), and exist_ok makes the cell re-runnable
  # without a separate exists() check for every level.
  os.makedirs(with_phe_maxvoting_path, exist_ok=True)
  os.makedirs(without_phe_maxvoting_path, exist_ok=True)

  for j in range(len(modelnames)):
    with_path = with_phe_maxvoting_path + '/' + modelnames[j]
    without_path = without_phe_maxvoting_path + '/' + modelnames[j]
    os.makedirs(with_path, exist_ok=True)
    os.makedirs(without_path, exist_ok=True)
