In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
import numpy as np

from IPython.core.interactiveshell import InteractiveShell 
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by pandas/numpy calls further down — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


In [53]:
# Load the phenotypic table for one site and keep only the subject ID and the
# diagnosis label (DX).

site = "OHSU"
pheno_csv = f"{site}_phenotypic.csv"
phenotype_path = os.path.join("fMRI/ADHD200_CC200_TCs_filtfix", site, pheno_csv)

# Only these columns are carried forward. The remaining phenotypic columns
# (Full4 IQ, ADHD Index, Inattentive, Hyper/Impulsive, Verbal IQ, Performance IQ,
# Med Status) use -999 as a placeholder (missing value / no medication) and
# would need cleaning, e.g. mean imputation, before being added to this list.
pheno_columns = ["ScanDir ID", "DX"]

phenotype_data = (
    pd.read_csv(phenotype_path)
    # Store IDs as text, left-padded with zeros to 7 characters, so they can be
    # joined against the subject folder names used as keys later in the notebook.
    .assign(**{"ScanDir ID": lambda frame: frame["ScanDir ID"].astype(str).str.zfill(7)})
    .loc[:, pheno_columns]
)

# Preview the first rows
phenotype_data.head()

# Total number of missing cells in the retained columns
phenotype_data.isna().sum().sum()

Unnamed: 0,ScanDir ID,DX
0,1084283,1
1,1084884,0
2,1108916,1
3,1206380,3
4,1340333,1


0

In [54]:
# Build one functional-connectivity (FC) matrix per subject from the CC200
# time-course files found under the site folder.

# Base folder where all the fMRI data for all the subjects are stored
base_folder = "fMRI/ADHD200_CC200_TCs_filtfix/OHSU/"

# Resting-state runs that may exist for a subject; any subset can be present.
rest_runs = (1, 2, 3)

# Maps subject ID (the subject's folder name) -> FC matrix
fc_data_dict = {}

# Sorted so that the processing order is deterministic across runs
for subject_id in sorted(os.listdir(base_folder)):
    subject_path = os.path.join(base_folder, subject_id)

    # Skip anything in the base folder that is not a subject directory
    if not os.path.isdir(subject_path):
        continue

    merged_data = []  # One (timepoints x regions) array per run that exists

    for run in rest_runs:
        run_file = f"sfnwmrda{subject_id}_session_1_rest_{run}_cc200_TCs.1D"
        run_path = os.path.join(subject_path, run_file)

        if not os.path.exists(run_path):
            continue

        # sep=r"\s+" is the documented equivalent of the deprecated
        # delim_whitespace=True (deprecated since pandas 2.2).
        # The first line and the first two columns are dropped — presumably the
        # header row and non-signal columns; confirm against the .1D file layout.
        run_table = pd.read_csv(run_path, sep=r"\s+", header=None, skiprows=1)
        merged_data.append(run_table.iloc[:, 2:].astype(float).to_numpy())

    # Subjects without any readable run are left out of the dictionary
    if not merged_data:
        continue

    # Concatenate all available runs along the time axis.
    # NOTE(review): runs are stacked as-is; offsets between runs would influence
    # the correlations — confirm whether per-run standardisation is wanted.
    merged_time_series = np.vstack(merged_data)

    # Pearson correlation between columns (rowvar=False -> columns are variables)
    fc_matrix = np.corrcoef(merged_time_series, rowvar=False)

    fc_data_dict[subject_id] = fc_matrix

# Check if data loaded correctly
print(f"Total subjects processed: {len(fc_data_dict)}")
if fc_data_dict:
    # Shape of the first subject's matrix; guarded so an empty result does not raise
    print(f"Example FC Matrix Shape: {next(iter(fc_data_dict.values())).shape}")


Total subjects processed: 79
Example FC Matrix Shape: (190, 190)


In [55]:
from sklearn.decomposition import PCA

# Flatten every subject's FC matrix into a single feature vector.
# A correlation matrix is symmetric with a constant diagonal, so only the
# entries strictly above the diagonal (k = 1) are kept: one per region pair.
subject_ids = list(fc_data_dict.keys())
flat_fmri_fc_vector = [
    matrix[np.triu_indices_from(matrix, k=1)]
    for matrix in fc_data_dict.values()
]

# One row per subject, indexed by subject ID.
# An n x n matrix yields n * (n - 1) / 2 features, i.e. 17955 (columns 0..17954)
# for the 190 x 190 matrices reported by the previous cell.
fc_feature_df = pd.DataFrame(flat_fmri_fc_vector, index=subject_ids)
fc_feature_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17945,17946,17947,17948,17949,17950,17951,17952,17953,17954
1084283,0.341092,0.268884,-0.290252,0.383650,0.615438,0.293986,0.239813,-0.108189,-0.010355,0.232697,...,0.188755,0.099702,0.077119,-0.142898,0.590536,0.232785,-0.084554,0.210098,-0.074171,-0.317937
1084884,0.228740,0.061611,0.069351,0.064131,0.216411,0.503539,0.264585,-0.053375,-0.006811,-0.078176,...,0.135871,-0.016520,-0.027005,-0.054042,0.361930,0.228253,-0.241722,0.569298,-0.374399,-0.494692
1108916,0.072884,-0.106039,0.103567,0.102353,0.414985,0.224137,0.118194,0.169617,-0.031277,0.276406,...,-0.201062,-0.441171,-0.000463,-0.126529,0.266353,0.288772,0.384909,-0.260658,0.127295,0.220257
1206380,0.098052,-0.194208,-0.238640,-0.019649,0.257082,0.510956,0.020324,-0.116368,0.125104,0.375901,...,-0.280965,0.067343,-0.214139,-0.391294,0.302516,-0.157779,-0.069655,-0.242535,-0.170920,0.022130
1340333,-0.301657,-0.025613,-0.019527,0.255277,0.517001,0.204825,0.134649,-0.173590,-0.163768,0.401576,...,0.099017,-0.168565,-0.066549,-0.269201,0.111330,0.185296,0.062994,0.586618,0.357226,0.027452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7333005,-0.235531,-0.447666,0.264827,0.090926,0.343994,0.539620,0.113118,-0.124002,-0.217284,0.650775,...,-0.274396,-0.013863,-0.225283,-0.340900,-0.286652,-0.199520,-0.086667,0.155519,-0.175030,0.408474
8064456,0.113598,-0.522540,-0.067078,-0.296668,0.616401,0.675200,-0.190205,-0.410642,-0.295001,0.663566,...,-0.155357,0.055121,-0.330598,0.067680,-0.167212,-0.426618,-0.444294,-0.391214,-0.112769,0.548685
8218392,0.273383,-0.222953,0.073789,-0.110592,-0.075583,0.504437,-0.039752,0.041270,0.015576,0.016216,...,0.245281,-0.344099,-0.267408,-0.110248,0.050609,-0.115607,-0.299394,0.655694,0.274929,0.511874
8720244,-0.070568,-0.301601,0.069675,-0.089429,-0.004588,0.626955,0.108315,-0.065251,-0.293787,0.603951,...,-0.167860,0.070659,-0.136792,-0.012118,-0.076633,-0.485414,-0.412479,0.032433,-0.192588,0.661537


In [56]:
# Expose the subject ID (currently the DataFrame index) as a "ScanDir ID" column
# so it can serve as the merge key. The guard keeps the cell idempotent:
# re-running it would otherwise insert a second, meaningless ID column.
if "ScanDir ID" not in fc_feature_df.columns:
    fc_feature_df = fc_feature_df.rename_axis("ScanDir ID").reset_index()

# Merge the FC feature data with the cleaned phenotypic data.
# how="inner" keeps only subjects present in both tables.
merged_FC = phenotype_data.merge(fc_feature_df, on="ScanDir ID", how="inner")

# An inner join drops unmatched subjects without any message; surface them so
# a shrinking sample size does not go unnoticed.
unmatched_ids = sorted(set(fc_feature_df["ScanDir ID"]) - set(merged_FC["ScanDir ID"]))
if unmatched_ids:
    print(f"Subjects with FC data but no phenotype row (dropped by merge): {unmatched_ids}")

merged_FC



Unnamed: 0,ScanDir ID,DX,0,1,2,3,4,5,6,7,...,17945,17946,17947,17948,17949,17950,17951,17952,17953,17954
0,1084283,1,0.341092,0.268884,-0.290252,0.383650,0.615438,0.293986,0.239813,-0.108189,...,0.188755,0.099702,0.077119,-0.142898,0.590536,0.232785,-0.084554,0.210098,-0.074171,-0.317937
1,1084884,0,0.228740,0.061611,0.069351,0.064131,0.216411,0.503539,0.264585,-0.053375,...,0.135871,-0.016520,-0.027005,-0.054042,0.361930,0.228253,-0.241722,0.569298,-0.374399,-0.494692
2,1108916,1,0.072884,-0.106039,0.103567,0.102353,0.414985,0.224137,0.118194,0.169617,...,-0.201062,-0.441171,-0.000463,-0.126529,0.266353,0.288772,0.384909,-0.260658,0.127295,0.220257
3,1206380,3,0.098052,-0.194208,-0.238640,-0.019649,0.257082,0.510956,0.020324,-0.116368,...,-0.280965,0.067343,-0.214139,-0.391294,0.302516,-0.157779,-0.069655,-0.242535,-0.170920,0.022130
4,1340333,1,-0.301657,-0.025613,-0.019527,0.255277,0.517001,0.204825,0.134649,-0.173590,...,0.099017,-0.168565,-0.066549,-0.269201,0.111330,0.185296,0.062994,0.586618,0.357226,0.027452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,7333005,1,-0.235531,-0.447666,0.264827,0.090926,0.343994,0.539620,0.113118,-0.124002,...,-0.274396,-0.013863,-0.225283,-0.340900,-0.286652,-0.199520,-0.086667,0.155519,-0.175030,0.408474
75,8064456,0,0.113598,-0.522540,-0.067078,-0.296668,0.616401,0.675200,-0.190205,-0.410642,...,-0.155357,0.055121,-0.330598,0.067680,-0.167212,-0.426618,-0.444294,-0.391214,-0.112769,0.548685
76,8218392,0,0.273383,-0.222953,0.073789,-0.110592,-0.075583,0.504437,-0.039752,0.041270,...,0.245281,-0.344099,-0.267408,-0.110248,0.050609,-0.115607,-0.299394,0.655694,0.274929,0.511874
77,8720244,3,-0.070568,-0.301601,0.069675,-0.089429,-0.004588,0.626955,0.108315,-0.065251,...,-0.167860,0.070659,-0.136792,-0.012118,-0.076633,-0.485414,-0.412479,0.032433,-0.192588,0.661537


In [57]:
# Total number of missing cells across the merged table
merged_FC.isnull().sum().sum()

0

In [58]:
# Write the merged phenotype + FC feature table to disk.
# index=True also writes the row index as the first (unnamed) CSV column.
output_csv = "OHSU_FC.csv"
merged_FC.to_csv(output_csv, index=True)