## Importing and Preprocessing data

In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
import numpy as np

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings("ignore")


### Importing necessary files and cleaning the data

In [136]:
# Loading the phenotypic data using other columns also 

site = "Pittsburgh"
pheno_csv = site + "_phenotypic.csv"

phenotype_path = os.path.join("fMRI/ADHD200_CC200_TCs_filtfix", site, pheno_csv)
phenotype_data = pd.read_csv(phenotype_path)

phenotype_data["ScanDir ID"] = phenotype_data["ScanDir ID"].astype(str).str.zfill(7)

pheno_columns = ["ScanDir ID", "DX"]

phenotype_data = phenotype_data[pheno_columns]

# # Cleaning the Full4IQ ,it has missing values (-999)
# phenotype_data["Full4 IQ"] = phenotype_data["Full4 IQ"].replace(-999, np.nan)
# full4_iq_mean = phenotype_data["Full4 IQ"].mean(skipna=True)
# phenotype_data["Full4 IQ"].fillna(full4_iq_mean, inplace=True)

# # Cleaning the ADHD Index ,it has missing values (-999)
# phenotype_data["ADHD Index"] = phenotype_data["ADHD Index"].replace(-999, np.nan)
# full4_iq_mean = phenotype_data["ADHD Index"].mean(skipna=True)
# phenotype_data["ADHD Index"].fillna(full4_iq_mean, inplace=True)

# Cleaning the Inattentive it has missing values (-999)
# phenotype_data["Inattentive"] = phenotype_data["Inattentive"].replace(-999, np.nan)
# full4_iq_mean = phenotype_data["Inattentive"].mean(skipna=True)
# phenotype_data["Inattentive"].fillna(full4_iq_mean, inplace=True)

# # Cleaning the Hyper/Impulsive it has missing values (-999)
# phenotype_data["Hyper/Impulsive"] = phenotype_data["Hyper/Impulsive"].replace(-999, np.nan)
# full4_iq_mean = phenotype_data["Hyper/Impulsive"].mean(skipna=True)
# phenotype_data["Hyper/Impulsive"].fillna(full4_iq_mean, inplace=True)

# # Cleaning the Verbal IQ as it has missing values (-999)
# phenotype_data["Verbal IQ"] = phenotype_data["Verbal IQ"].replace(-999, np.nan)
# full4_iq_mean = phenotype_data["Verbal IQ"].mean(skipna=True)
# phenotype_data["Verbal IQ"].fillna(full4_iq_mean, inplace=True)

# # Cleaning the Performance IQ as it has missing values (-999)
# phenotype_data["Performance IQ"] = phenotype_data["Performance IQ"].replace(-999, np.nan)
# full4_iq_mean = phenotype_data["Performance IQ"].mean(skipna=True)
# phenotype_data["Performance IQ"].fillna(full4_iq_mean, inplace=True)

# # Replace "No Medication" (previously -999) with 2 (Not on Medication)
# phenotype_data["Med Status"] = phenotype_data["Med Status"].replace(-999, 2)

# # Convert Med Status column to integer type
# phenotype_data["Med Status"] = phenotype_data["Med Status"].astype(int)

phenotype_data.head()

phenotype_data.isna().sum().sum()

Unnamed: 0,ScanDir ID,DX
0,16058,0
1,16046,0
2,16060,0
3,16048,0
4,16029,0


0

#### Reading the .1D files and converting into Functional Connectivity Matrix

In [137]:


# Base folder where all the fMRI data for all the subjects are stored
base_folder = "fMRI/ADHD200_CC200_TCs_filtfix/Pittsburgh/"

# List to store fMRI data to all subjects
fc_data_dict = {}


# Loop through each subject folders to read each data
for subject_id in sorted(os.listdir(base_folder)):
    subject_path = os.path.join(base_folder, subject_id)

    # Ensure that subject_path is a directory or folder not a file
    if os.path.isdir(subject_path):

        # Constructing file path for expected .1D filename
        fmri_file_path = os.path.join(subject_path, f"sfnwmrda{subject_id}_session_1_rest_1_cc200_TCs.1D")

        # if the frmi_file_path exists:
        if os.path.exists(fmri_file_path):

            # Read the fMRI .1D files 
            fmri_data = pd.read_csv(fmri_file_path, delim_whitespace = True, 
                                    header = None, skiprows = 1)
            # Removing the first 2 columns are those are not necessary
            fmri_data = fmri_data.iloc[:, 2:]
            # Converting all values to float
            fmri_data = fmri_data.astype(float)
            # Convert the loaded dataframe to array for correlation calculation
            fmri_data_array = fmri_data.to_numpy()

            # Compute the Functional Connectivity Matrix by columns which are the brain atlas or brain region
            fc_matrix = np.corrcoef(fmri_data_array, rowvar = False)

            
            # Store the FC matrix with subject ID
            fc_data_dict[subject_id] = fc_matrix


# Check if data loaded correctly
print(f"Total subjects processed: {len(fc_data_dict)}")
print(f"Example FC Matrix Shape: {list(fc_data_dict.values())[0].shape}")  # Check first subject

Total subjects processed: 89
Example FC Matrix Shape: (190, 190)


In [119]:
fc_data_dict

{'1056121': array([[ 1.        ,  0.00287646, -0.12377184, ...,  0.17905459,
          0.16025789, -0.04817031],
        [ 0.00287646,  1.        ,  0.06673373, ...,  0.71563313,
          0.2317126 , -0.19534362],
        [-0.12377184,  0.06673373,  1.        , ..., -0.26027228,
          0.13358731,  0.57158703],
        ...,
        [ 0.17905459,  0.71563313, -0.26027228, ...,  1.        ,
          0.38138422, -0.47633652],
        [ 0.16025789,  0.2317126 ,  0.13358731, ...,  0.38138422,
          1.        , -0.0884433 ],
        [-0.04817031, -0.19534362,  0.57158703, ..., -0.47633652,
         -0.0884433 ,  1.        ]]),
 '1113498': array([[ 1.        ,  0.0952132 ,  0.04759913, ...,  0.22489836,
          0.15854925, -0.05637741],
        [ 0.0952132 ,  1.        ,  0.04341985, ...,  0.53837166,
          0.56651326, -0.32804684],
        [ 0.04759913,  0.04341985,  1.        , ..., -0.42080864,
         -0.01126368,  0.59702479],
        ...,
        [ 0.22489836,  0.5383716

### Flattening the FC matrix

In [138]:
from sklearn.decomposition import PCA

flat_fmri_fc_vector = []
subject_ids = list(fc_data_dict.keys())

for subject_id, fc_matrix in fc_data_dict.items(): # setting the loop to iterate in the fc matrix dictionary
    
    # Numpy function 'triu_indices_from' gets the values only from the upper triangle of the matrix where the diagonal, k = 1, so everything from above the diagonal (1)
    upper_triangle = fc_matrix[np.triu_indices_from(fc_matrix, k = 1)]
    flat_fmri_fc_vector.append(upper_triangle)

# Convert the flat_fmri_fc_vector to dataframe with subject id as index
fc_feature_df = pd.DataFrame(flat_fmri_fc_vector, index = subject_ids)
## Now each subject has 17954 FC features 
fc_feature_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17945,17946,17947,17948,17949,17950,17951,17952,17953,17954
0016001,-0.271380,-0.229712,-0.159428,-0.015803,-0.007047,0.438837,-0.295184,-0.284776,0.173853,0.257705,...,0.122539,0.007074,0.357883,-0.287511,0.112575,-0.091203,0.242224,0.167068,0.172556,0.066265
0016002,0.102775,0.036137,-0.039993,-0.147257,-0.494666,0.102741,-0.003982,-0.400279,0.197002,0.492709,...,0.144761,-0.086066,-0.090605,0.007019,0.406925,0.588005,-0.024898,0.578380,-0.124206,0.022494
0016003,0.115221,-0.105213,-0.225292,0.105765,0.583594,0.308611,0.249311,-0.499982,0.018582,0.088514,...,-0.173453,0.107741,0.121655,-0.068339,-0.290526,0.304405,0.370959,-0.038797,-0.113554,0.278835
0016004,0.066628,-0.052701,-0.048548,-0.220683,0.099986,0.258407,-0.164431,-0.107788,-0.083767,0.405188,...,-0.573151,0.062314,-0.062325,-0.676332,0.093098,-0.033308,0.519854,0.410201,-0.338034,-0.063752
0016005,-0.105104,-0.077175,-0.398102,0.126833,-0.101346,0.552562,0.038706,-0.203850,0.000545,0.079449,...,0.041093,-0.238271,-0.232460,0.312837,0.119412,0.107547,0.205567,0.210789,-0.001464,0.136800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0016085,0.256857,-0.263169,-0.131365,0.027932,-0.014822,0.014771,0.056938,0.143062,-0.010371,-0.127087,...,0.382815,0.361586,0.330860,-0.423798,-0.199007,0.186894,0.100700,0.259376,-0.844362,-0.166029
0016086,-0.065779,-0.238404,-0.044128,-0.063448,0.210347,0.104097,-0.210507,-0.016304,-0.135253,0.047660,...,0.088123,0.360265,-0.481756,-0.242735,0.276279,-0.164117,-0.029655,-0.072654,-0.121360,0.049881
0016087,-0.072115,0.206860,0.039156,0.013089,0.416107,-0.069638,0.175367,-0.458874,-0.023596,0.269694,...,0.177148,0.178696,-0.496740,-0.008995,0.214969,-0.049876,-0.141787,-0.017702,-0.412796,-0.301296
0016088,-0.287742,0.281000,-0.165171,0.542147,0.275327,0.065973,0.142446,-0.270427,0.296728,0.069016,...,-0.088667,0.005470,-0.478561,-0.179401,-0.191569,0.050178,0.125581,0.517929,-0.433515,-0.277586


In [139]:
# Convert FC feature DataFrame index to a column for merging
fc_feature_df.reset_index(inplace=True)
fc_feature_df.rename(columns={'index': 'ScanDir ID'}, inplace=True)

# Merge the FC feature data with the cleaned phenotypic data
merged_FC = phenotype_data.merge(fc_feature_df, on="ScanDir ID", how="inner")

merged_FC


Unnamed: 0,ScanDir ID,DX,0,1,2,3,4,5,6,7,...,17945,17946,17947,17948,17949,17950,17951,17952,17953,17954
0,0016058,0,-0.086884,-0.079423,-0.270457,-0.189188,0.057157,0.194664,-0.222744,-0.239224,...,0.058549,-0.074088,0.061168,-0.016989,0.113791,0.080253,-0.011341,0.281022,-0.467041,-0.059684
1,0016046,0,0.253228,0.467968,0.063277,-0.089031,-0.010471,-0.192401,-0.080552,-0.089233,...,-0.137583,-0.129354,-0.015914,-0.277484,-0.288646,-0.428793,0.022420,0.419929,-0.132033,-0.028046
2,0016060,0,-0.190415,-0.225667,-0.152934,0.398291,-0.242932,0.556966,-0.074823,-0.529901,...,0.028201,0.185488,0.141302,-0.305005,0.342502,-0.252229,0.061328,0.182419,-0.426385,-0.001272
3,0016048,0,-0.394063,-0.536707,0.252266,0.132145,0.563672,0.347688,-0.175214,0.120545,...,-0.449084,-0.058262,-0.383245,-0.570550,0.350058,0.146359,0.161988,0.246260,-0.348514,0.337946
4,0016029,0,-0.012174,-0.092407,0.123172,-0.140781,-0.131120,0.517431,0.162108,-0.201654,...,0.383495,-0.023956,-0.259123,-0.094618,0.396415,0.076773,-0.194761,0.584967,-0.107169,0.246905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0016015,0,0.200471,-0.289968,-0.355810,0.174296,0.289214,0.517340,-0.096766,-0.750230,...,-0.611890,-0.187230,0.534160,-0.127970,0.243588,-0.381667,0.449399,0.126014,0.200945,0.253149
85,0016035,0,0.132492,0.326378,-0.353937,0.044580,0.223150,0.366049,0.150565,-0.380507,...,0.063877,-0.212205,-0.098665,-0.211867,-0.079297,-0.180064,0.249533,0.562484,-0.263476,-0.193807
86,0016023,0,0.215981,-0.123490,-0.566731,-0.521121,0.354481,-0.086376,-0.396509,0.045017,...,-0.343868,-0.051493,0.073463,-0.271316,0.277479,-0.550973,0.011630,0.181785,-0.052048,0.104093
87,0016054,0,-0.146560,0.112630,-0.656629,-0.002024,0.033685,0.396811,-0.331365,-0.575389,...,-0.109509,-0.222417,0.231068,0.140239,0.357375,0.516564,0.774589,0.337606,0.220451,0.604926


In [140]:
merged_FC.isna().sum().sum()
merged_FC

377

Unnamed: 0,ScanDir ID,DX,0,1,2,3,4,5,6,7,...,17945,17946,17947,17948,17949,17950,17951,17952,17953,17954
0,0016058,0,-0.086884,-0.079423,-0.270457,-0.189188,0.057157,0.194664,-0.222744,-0.239224,...,0.058549,-0.074088,0.061168,-0.016989,0.113791,0.080253,-0.011341,0.281022,-0.467041,-0.059684
1,0016046,0,0.253228,0.467968,0.063277,-0.089031,-0.010471,-0.192401,-0.080552,-0.089233,...,-0.137583,-0.129354,-0.015914,-0.277484,-0.288646,-0.428793,0.022420,0.419929,-0.132033,-0.028046
2,0016060,0,-0.190415,-0.225667,-0.152934,0.398291,-0.242932,0.556966,-0.074823,-0.529901,...,0.028201,0.185488,0.141302,-0.305005,0.342502,-0.252229,0.061328,0.182419,-0.426385,-0.001272
3,0016048,0,-0.394063,-0.536707,0.252266,0.132145,0.563672,0.347688,-0.175214,0.120545,...,-0.449084,-0.058262,-0.383245,-0.570550,0.350058,0.146359,0.161988,0.246260,-0.348514,0.337946
4,0016029,0,-0.012174,-0.092407,0.123172,-0.140781,-0.131120,0.517431,0.162108,-0.201654,...,0.383495,-0.023956,-0.259123,-0.094618,0.396415,0.076773,-0.194761,0.584967,-0.107169,0.246905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0016015,0,0.200471,-0.289968,-0.355810,0.174296,0.289214,0.517340,-0.096766,-0.750230,...,-0.611890,-0.187230,0.534160,-0.127970,0.243588,-0.381667,0.449399,0.126014,0.200945,0.253149
85,0016035,0,0.132492,0.326378,-0.353937,0.044580,0.223150,0.366049,0.150565,-0.380507,...,0.063877,-0.212205,-0.098665,-0.211867,-0.079297,-0.180064,0.249533,0.562484,-0.263476,-0.193807
86,0016023,0,0.215981,-0.123490,-0.566731,-0.521121,0.354481,-0.086376,-0.396509,0.045017,...,-0.343868,-0.051493,0.073463,-0.271316,0.277479,-0.550973,0.011630,0.181785,-0.052048,0.104093
87,0016054,0,-0.146560,0.112630,-0.656629,-0.002024,0.033685,0.396811,-0.331365,-0.575389,...,-0.109509,-0.222417,0.231068,0.140239,0.357375,0.516564,0.774589,0.337606,0.220451,0.604926


In [135]:
merged_FC.to_csv("Peking_3_FC.csv", index = True)