In [1]:
import numpy as np
import os, csv, time

# Functions for data reading

In [2]:
# Machine-specific locations: adjust both paths before running the notebook.
# `path` is the root of the folder structure in which the csv files should be,
# i.e. the folder in which 'ABIDE_I' and 'ABIDE_II' are located.
# Alternative root that was used before:
# path = r"C:\Users\s169940\Downloads\ABIDEgithubtest\ABIDE"
path = r"C:\Users\s169940\OneDrive - TU Eindhoven\Documents\PhD\data\ABIDE\timeSeries\ABIDE"
pathFD = r"C:\Users\s169940\Downloads\ABIDEgithubtest\supportingFiles\supportingFiles\FD.csv" # location of the FD.csv file (its 'ID', 'mean FD' and 'FDs' columns are read further below)

In [3]:
def list_depth(lst):
    """Return the nesting depth of a list.

    Anything that is not a list has depth 0, an empty list has depth 1, and a
    non-empty list is one level deeper than its most deeply nested element.
    """
    if not isinstance(lst, list):
        return 0
    if not lst:
        return 1
    deepest = 0
    for element in lst:
        deepest = max(deepest, list_depth(element))
    return 1 + deepest

In [4]:
#This way of data loading only works if each csv file is placed in a folder named after the site (and sample) it comes from.
#The site folders should in turn be placed in a folder corresponding to their ABIDE version.
#The motivation behind this is that it matches the file structure as it was downloaded from ABIDE.

def obtainCSVs(path):
    """Recursively walk the folders below ``path`` and collect csv file paths.

    Entries are visited in sorted name order. ``os.listdir`` itself returns
    names in arbitrary order, and the caller unpacks the result positionally,
    so a deterministic order ('ABIDE_I' before 'ABIDE_II') is required.

    Returns:
        * the path (a string) of the first csv file found directly in ``path``
          that is not an 'FD.csv' file; the function returns as soon as such a
          file is seen and whatever was collected before is discarded, or
        * otherwise a list with one entry per sub-folder, each entry being the
          result of calling this function on that sub-folder.
    """
    csvFiles = []
    for item in sorted(os.listdir(path)):
        itemPath = os.path.join(path, item)
        if os.path.isdir(itemPath):
            csvFiles.append(obtainCSVs(itemPath))
        elif '.csv' in itemPath and 'FD.csv' not in itemPath:
            # A folder that holds a csv file is represented by that file alone.
            return itemPath
    return csvFiles

def selectSites(csvsA1, csvsA2):
    """Keep only the csv paths that belong to whitelisted sites.

    A path is kept when the name of a whitelisted site occurs in it. The result
    is grouped by site, in whitelist order. Nothing is filtered out if only
    whitelisted files are present in the folder. The whitelist was made based
    on the whole ABIDE dataset, where filtering was necessary according to TR.

    Returns the tuple (whitelisted ABIDE I paths, whitelisted ABIDE II paths).
    """
    whitelistedA1 = [
        "Carnegie_Mellon_University",
        "NYU_Langone_Medical_Center",
        "San_Diego_State_University",
        "Stanford_University",
        "Trinity_Centre_for_Health_Sciences",
        "University_of_Michigan_Sample_1",
        "University_of_Michigan_Sample_2",
        "Yale_Child_Study_Center",
    ]
    whitelistedA2 = [
        "Erasmus_University_Medical_Center_Rotterdam",
        "Georgetown_University",
        "NYU_Langone_Medical_Center_Sample_1",
        "NYU_Langone_Medical_Center_Sample_2",
        "San_Diego_State_University",
        "Stanford_University",
        "Trinity_Centre_for_Health_Sciences",
        "University_of_California_Davis",
        "University_of_Miami",
        "University_of_Utah_School_of_Medicine",
    ]

    def keepWhitelisted(paths, sites):
        # outer loop over the sites so the result is ordered site by site
        return [p for site in sites for p in paths if site in p]

    return keepWhitelisted(csvsA1, whitelistedA1), keepWhitelisted(csvsA2, whitelistedA2)


def isColumnEmpty(table, i):
    """Tell whether column ``i`` of ``table`` holds only empty strings.

    The first row is assumed to contain the column names and is ignored.
    """
    dataCells = np.array(table)[1:, i]
    return all(dataCells == '')

def makeTable(path):
    """Read a csv file into a table and derive an identifier from its path.

    Parameters:
        path: location of the csv file (read with ISO-8859-1 encoding).

    Returns:
        tab: 2-D numpy array of strings, row 0 being the header. Columns whose
            data cells are all empty strings are left out; this is only done
            when the file has at least one data row.
        identifier: "<version>, <name>" when the path contains 'ABIDE_II' or
            'ABIDE_I', otherwise the file name without its extension.

    Raises:
        ValueError: if the file contains no rows at all.
    """
    table = []
    with open(path, encoding='ISO-8859-1') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue  # blank line: would make the table ragged
            if len(row) == 1:
                # the whole line was read as a single field: split it manually
                row = row[0].split(',')
            table.append(row)
    if not table:
        raise ValueError("no rows found in " + path)

    # Convert once and test all columns together instead of converting the
    # whole table again for every single column.
    fullTab = np.array(table)
    if len(table) > 1:
        emptyColumns = (fullTab[1:] == '').all(axis=0)
        tab = fullTab[:, ~emptyColumns]  # take out empty fields
    else:
        tab = fullTab  # header only: there is no data to judge the columns by

    if "ABIDE_II" in path:
        version = "ABIDE_II"
        # text between 'ABIDEII_' and '.csv'
        identifier = path[path.find("ABIDEII_") + len("ABIDEII_"):path.find(".csv")]
    elif "ABIDE_I" in path:
        version = "ABIDE_I"
        # text between the last 'phenotypic_' and the last '.', located on the reversed path
        reversedPath = path[::-1]
        identifier = reversedPath[reversedPath.find('.') + 1:reversedPath.find('cipytonehp') - 1][::-1]
    else:
        # no version marker in the path: fall back to the bare file name
        version = ""
        identifier = os.path.splitext(os.path.basename(path))[0]
    if version != "":
        identifier = version + ", " + identifier
    return tab, identifier

def findColumn(tab, title):
    """Return the index of the column named ``title`` in table ``tab``.

    Row 0 of ``tab`` is the title row. When the title is absent, a message is
    printed and ``np.nan`` is returned instead of an index.
    """
    if title not in tab[0]:
        print(tab[1][0], "does not contain field ", title)
        return np.nan
    matches = np.where(tab[0] == title)[0]
    return int(matches.item())
def obtainColumn(tab, colInd):
    """Return column ``colInd`` of ``tab`` as a list with one entry per row."""
    column = []
    for row in tab:
        column.append(row[colInd])
    return column

def EmptyField(column):
    """Check whether a column only holds entries that mark it as not filled out.

    The first entry (the column title) is ignored; '' and '-9999' count as
    not specified. Returns True when no other entry is present.
    """
    notspecified = ('', '-9999')
    for item in column[1:]:
        if item not in notspecified:
            return False
    return True

# Data loading

In [5]:
start = time.time()

# obtain csv file paths
obtainedCSVs = obtainCSVs(path)
if list_depth(obtainedCSVs) == 3:
    # it can happen that the 'ABIDE' directory is nested in itself: unwrap one level
    obtainedCSVs = obtainedCSVs[0]

# Exactly two entries are expected here; if the depth is other than 2, this unpacking will cause issues.
csvsA1, csvsA2 = obtainedCSVs
# select the whitelisted sites
csvsA1, csvsA2 = selectSites(csvsA1, csvsA2)

end = time.time() - start
print("Elapsed time [s]: ", end)

Elapsed time [s]:  0.07402443885803223


# Exclusions

In [6]:
# IDs to be excluded of CMU_B because of different TR
IDs = [
    50643, 50644, 50645, 50648, 50650, 50651, 50652,
    50655, 50657, 50658, 50661, 50667, 50669,
]

# excluded based on psychoactive medication
excludeIDs = [
    28861, 28871, 28897, 29495, 29875, 29883, 29885, 30000, 50207,
    50286, 50287, 50292, 50299, 50305, 50307, 50317, 50325, 50404,
    50647, 50650, 50959, 51163, 51170, 51174,
]

# based on visual inspections (i.e. distortions and artifacts, but some were also excluded based on motion)
newExclusions = [
    50296, 50303, 50308, 50653, 29880, 29887, 29888, 29151,
    29152, 29153, 29155, 29156, 29158, 29161, 29167, 29168, 29169, 29171,
    29172, 29174, 29175, 29176, 28901, 51167, 51176,
]

# not properly normalized to reference space
alignmentExclusions = [50642, 50646, 50656, 50665, 50666, 50572, 50603, 50605, 50561]

# PIQ, VIQ, and FIQ under 70
IQexclusions = [50606, 50626]

excludeMotion = True
if excludeMotion:
    # mean FD > 0.5 mm
    excludedMotion = [
        50952, 50185, 50192, 51161, 51166, 51195, 50242, 51136, 50279, 50281,
        50296, 50303, 50304, 50306, 50308, 50309, 50311, 50313, 50323, 50354, 50359,
        50376, 50383, 50615, 50618, 29873, 29878, 29880, 29886, 29887, 29888, 29889, 29890, 29893,
        29894, 29897, 29900, 29903, 29910, 29914, 29917, 28756, 28773, 28777, 28781,
        28784, 28799, 28812, 28818, 28819, 28823, 28831, 28832, 28834, 28839, 28840,
        30177, 29097, 29098, 29100, 29102, 29110, 29126, 29134, 29999, 30240, 30241,
        29503, 29506, 29510, 29514,
    ]
    excludeIDs = excludeIDs + excludedMotion

# Merge all exclusion lists into one sorted array without duplicates.
excludeIDs = np.unique([*excludeIDs, *IDs, *newExclusions, *IQexclusions, *alignmentExclusions])

# Create tables (list of lists)

In [7]:
def _readTables(csvPaths):
    """Read every csv file in csvPaths with makeTable; returns (tables, identifiers)."""
    tables, identifiers = [], []
    for csvPath in csvPaths:
        table, identifier = makeTable(csvPath)
        tables.append(table)
        identifiers.append(identifier)
    return tables, identifiers


def _removeExcludedRows(tabs, excludedIDs):
    """Take the excluded participants out of every table in tabs.

    The list ``tabs`` is updated in place: a table from which rows were removed
    is replaced by a new 2-D array, so every table keeps the same container
    type whether or not something was removed. All rows of a table are removed
    in a single pass instead of rebuilding the table once per excluded row.

    Returns the removed SUB_IDs (as ints), per table from the bottom row up.
    """
    removed = []
    for tabind, tab in enumerate(tabs):
        IDcol = obtainColumn(tab, findColumn(tab, 'SUB_ID'))
        # row 0 is the header row, so only indices >= 1 are candidates
        dropRows = [i for i in range(len(IDcol) - 1, 0, -1) if int(IDcol[i]) in excludedIDs]
        if dropRows:
            removed.extend(int(IDcol[i]) for i in dropRows)
            tabs[tabind] = np.delete(tab, dropRows, axis=0)
    return removed


# read csv files into tables, separately for A1 and A2
tabsA1, identA1 = _readTables(csvsA1)
tabsA2, identA2 = _readTables(csvsA2)

# plain ints in a set: constant-time membership tests
_excludedIDSet = {int(subID) for subID in excludeIDs}

# take out excluded participants (A1 first, then A2)
exclusions = _removeExcludedRows(tabsA1, _excludedIDSet) + _removeExcludedRows(tabsA2, _excludedIDSet)

# check if excluded IDs are still present; nothing is printed when the removal worked
for tab in tabsA1 + tabsA2:
    for item in obtainColumn(tab, findColumn(tab, 'SUB_ID'))[1:]:
        if int(item) in _excludedIDSet:
            print("item:", item)

# Check for empty columns 

In [8]:
# Take out the columns (header included) in which every entry is marked as empty.
# A1 is processed first, then A2; the lists are modified in place.
for siteTabs in (tabsA1, tabsA2):
    for tabind in range(len(siteTabs)):
        headers = siteTabs[tabind][0]
        # walk the columns from right to left so a deleted column does not shift the indices still to come
        for i in reversed(range(len(headers))):
            col = obtainColumn(siteTabs[tabind], i)
            if EmptyField(col):
                siteTabs[tabind] = [np.delete(row, i) for row in siteTabs[tabind]]

# Check if all empty columns are out; nothing is printed when the removal worked.
for siteTabs in (tabsA1, tabsA2):
    for tab in siteTabs:
        headers = tab[0]
        for i in range(len(headers)):
            col = obtainColumn(tab, i)
            if EmptyField(col):
                print(tab[1][0])
                print(headers[i])
                print(i, len(headers))

# Class with phenotypic information of participants

In [9]:
class participant:
    """Phenotypic information of a single participant.

    An object is built from a mapping of phenotypic column names to values,
    in which None stands for "not specified". Column names that are absent
    from the mapping are treated as not specified as well.
    """

    def __init__(self, dictIn):
        """Fill the attributes from ``dictIn`` (column name -> value).

        It is possible to extend or change the fields that are read here.
        AGE_AT_SCAN, PIQ, FIQ and VIQ are converted to float when present;
        all other values are stored as given.
        """
        get = dictIn.get  # absent columns read as None instead of raising KeyError

        age = get("AGE_AT_SCAN")
        if age is None:
            age = get("AGE_AT_SCAN ")  # sometimes there is a space in the column title
        self.age = self._toFloat(age)
        self.site = get("SITE_ID")
        self.partnum = get("SUB_ID")
        self.label = get("DX_GROUP")
        # NOTE(review): any label other than 1, including a missing one, is reported as "HC"
        self.diagnosis = "ASD" if self.label == 1 else "HC"
        self.sex = get("SEX")

        self.PIQ = self._toFloat(get("PIQ"))
        self.FIQ = self._toFloat(get("FIQ"))
        self.VIQ = self._toFloat(get("VIQ"))
        self.IQtypes = {
            "FIQ": get("FIQ_TEST_TYPE"),
            "VIQ": get("VIQ_TEST_TYPE"),
            "PIQ": get("PIQ_TEST_TYPE"),
        }

        ADOSfields = ['ADOS_MODULE', 'ADOS_RSRCH_RELIABLE', 'ADOS_G_TOTAL', 'ADOS_TOTAL', 'ADOS_G_COMM',
                      'ADOS_COMM', 'ADOS_G_SOCIAL', 'ADOS_SOCIAL', 'ADOS_G_STEREO_BEHAV', 'ADOS_STEREO_BEHAV',
                      'ADOS_G_CREATIVITY', 'ADOS_CREATIVITY', 'ADOS_2_SOCAFFECT', 'ADOS_GOTHAM_SOCAFFECT',
                      'ADOS_2_RRB', 'ADOS_GOTHAM_RRB', 'ADOS_2_TOTAL', 'ADOS_GOTHAM_TOTAL', 'ADOS_2_SEVERITY_TOTAL',
                      'ADOS_GOTHAM_SEVERITY']
        self.ADOS = {field: get(field) for field in ADOSfields}

        ADIRfields = ["ADI_R_SOCIAL_TOTAL_A", "ADI_R_VERBAL_TOTAL_BV", "ADI_R_RRB_TOTAL_C",
                      "ADI_R_ONSET_TOTAL_D", "ADI_R_RSRCH_RELIABLE"]
        self.ADIR = {field: get(field) for field in ADIRfields}
        if self.ADIR['ADI_R_RRB_TOTAL_C'] is None:
            # alternative column title for the same field
            self.ADIR['ADI_R_RRB_TOTAL_C'] = get("ADI_RRB_TOTAL_C")

        self.DSM_IV_TR = get("DSM_IV_TR")
        if self.DSM_IV_TR is None:
            self.DSM_IV_TR = get("PDD_DSM_IV_TR")
        self.currentMedStatus = get("CURRENT_MED_STATUS")
        self.medName = get("MEDICATION_NAME")
        self.offStimulantsAtScan = get("OFF_STIMULANTS_AT_SCAN")

        comorbidity = get("COMORBIDITY")
        if comorbidity is None:
            comorbidity = get("NONASD_PSYDX_LABEL")
        if comorbidity in ('none', 'None'):
            comorbidity = None  # an explicit "none" is stored like a missing value
        self.comorbidity = comorbidity

        self.eyeStatus = get("EYE_STATUS_AT_SCAN")  # 1=open, 2=closed
        self.handedness = get("HANDEDNESS_CATEGORY")  # 1=right handed, 2=left handed, 3=mixed handed
        self.setAbideVersion()

    @staticmethod
    def _toFloat(value):
        """Convert value to float, leaving None (not specified) untouched."""
        return None if value is None else float(value)

    def setAbideVersion(self):
        """Set abideVersion to 2 if the site ID contains 'ABIDEII', otherwise to 1.

        Without a site ID the version cannot be derived and is set to None.
        """
        if self.site is None:
            self.abideVersion = None
        elif 'ABIDEII' in self.site:
            self.abideVersion = 2
        else:
            self.abideVersion = 1

    def __str__(self):
        """Return the participant number followed by the site ID."""
        return f"{self.partnum} {self.site}"
    

# Create objects

In [10]:
tabs = tabsA1 + tabsA2
# It is possible to extend or change these based on the fields in the phenotypic files. Beware that names can differ between
# ABIDE I and ABIDE II and sometimes the names have a typo, e.g. 'AGE_AT_SCAN ' should be 'AGE_AT_SCAN'
fieldsOfInterest = ['SITE_ID', 'SUB_ID', 'DX_GROUP', 'AGE_AT_SCAN', 'AGE_AT_SCAN ', 'SEX', 'FIQ', 'VIQ', 'PIQ',
                    'FIQ_TEST_TYPE', 'VIQ_TEST_TYPE', 'PIQ_TEST_TYPE', 'ADOS_MODULE', 'ADOS_RSRCH_RELIABLE',
                    'ADI_R_SOCIAL_TOTAL_A', 'ADI_R_VERBAL_TOTAL_BV', 'ADI_R_RRB_TOTAL_C', 'ADI_RRB_TOTAL_C',
                    'ADI_R_ONSET_TOTAL_D',
                    'ADI_R_RSRCH_RELIABLE', 'DSM_IV_TR', 'CURRENT_MED_STATUS', 'MEDICATION_NAME',
                    'OFF_STIMULANTS_AT_SCAN', 'COMORBIDITY', 'NONASD_PSYDX_LABEL', 'PDD_DSM_IV_TR',
                    'ADOS_G_TOTAL', 'ADOS_TOTAL', 'ADOS_G_COMM', 'ADOS_COMM', 'ADOS_G_SOCIAL',
                    'ADOS_SOCIAL', 'ADOS_G_STEREO_BEHAV', 'ADOS_STEREO_BEHAV', 'ADOS_G_CREATIVITY',
                    'ADOS_CREATIVITY', 'ADOS_2_SOCAFFECT', 'ADOS_GOTHAM_SOCAFFECT', 'ADOS_2_RRB',
                    'ADOS_GOTHAM_RRB', 'ADOS_2_TOTAL', 'ADOS_GOTHAM_TOTAL', 'ADOS_2_SEVERITY_TOTAL',
                    'ADOS_GOTHAM_SEVERITY', 'HANDEDNESS_CATEGORY', 'EYE_STATUS_AT_SCAN']


def _parseFieldValue(rawValue):
    """Convert one raw table cell (a string) into the value stored on a participant.

    Digit-only strings become ints, '' and '-9999' are assumed to mean "not
    specified" and become None, and everything else is kept as a string.
    isdecimal is used rather than isnumeric, because isnumeric also accepts
    characters such as superscripts or fractions that cannot be converted.
    """
    if rawValue.isdecimal():
        return int(rawValue)
    if rawValue in ('', '-9999'):
        return None
    return rawValue


participants = []
for tab in tabs:  # loop through sites
    header = tab[0]  # header row
    # Map each column title to its index once per table; the first occurrence of a title wins.
    columnOf = {}
    for colInd, columnTitle in enumerate(header):
        columnOf.setdefault(str(columnTitle), colInd)
    for row in tab[1:]:  # skip header row for values
        itemsForObject = {}  # values to initialize the object with
        for field in fieldsOfInterest:
            colInd = columnOf.get(field)
            # if the table has no such column, the value does not exist: set None
            itemsForObject[field] = None if colInd is None else _parseFieldValue(str(row[colInd]))
        participants.append(participant(itemsForObject))

print("Number of participants: ", len(participants))  # check length
# participants is now a list of objects containing patient metadata

## read FD information
rowEntries = []
with open(pathFD, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    columnTitles = next(csvreader, None)  # the first row holds the column titles
    if columnTitles is None:
        raise ValueError(f"FD file {pathFD} is empty")
    idCol = columnTitles.index('ID')
    meanFDCol = columnTitles.index('mean FD')
    FDsCol = columnTitles.index('FDs')
    for row in csvreader:
        if not row:
            continue  # skip blank lines
        row[idCol] = int(row[idCol])  # ID is an int
        row[meanFDCol] = float(row[meanFDCol])  # mean FD is a float
        row[FDsCol] = np.array([float(value) for value in row[FDsCol].split(';')])  # parse string to numpy float array
        rowEntries.append(row)

## insert FD information into objects of participants
# Index the FD rows by ID once instead of scanning all rows for every participant;
# as before, the first row with a given ID is the one that is used.
FDrowByID = {}
for row in rowEntries:
    FDrowByID.setdefault(row[idCol], row)

missingFD = [p.partnum for p in participants if p.partnum not in FDrowByID]
if missingFD:
    raise KeyError(f"no FD entry found for participant ID(s): {missingFD}")

for p in participants:
    FDrow = FDrowByID[p.partnum]
    p.FDs = FDrow[FDsCol]  # kept in case all FDs are needed
    p.meanFD = FDrow[meanFDCol]

Number of participants:  900


In [11]:
# `participants` is now a list of objects with fields containing phenotypic data.
# Example usage: print the site and participant number of every 50th participant.
for idx in range(0, len(participants), 50):
    p = participants[idx]
    print(p.site, p.partnum)

CMU 50649
NYU 50998
NYU 51054
NYU 51105
SDSU 50195
STANFORD 51196
TRINITY 51142
UM_1 50340
UM_2 50405
YALE 50604
ABIDEII-EMC_1 29911
ABIDEII-GU_1 28747
ABIDEII-NYU_1 29181
ABIDEII-NYU_1 29231
ABIDEII-SDSU_1 28876
ABIDEII-SU_2 30179
ABIDEII-TCD_1 29120
ABIDEII-U_MIA_1 30233
