In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Activate seaborn's theme, overriding the 'image.cmap' rc entry with 'coolwarm'.
sns.set(rc={'image.cmap': 'coolwarm'})

from numba import jit,prange

import time
import os

# Font-size presets shared by the plotting defaults below.
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

# Apply every text-size default in one rcParams update
# (each key is the "<group>.<option>" pair of the matching plt.rc call).
plt.rcParams.update({
    'font.size': MEDIUM_SIZE,         # default text size
    'axes.titlesize': MEDIUM_SIZE,    # axes title
    'axes.labelsize': BIGGER_SIZE,    # x and y axis labels
    'xtick.labelsize': MEDIUM_SIZE,   # x tick labels
    'ytick.labelsize': MEDIUM_SIZE,   # y tick labels
    'legend.fontsize': MEDIUM_SIZE,   # legend entries
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})

In [2]:
def remove_triangle(df):
    """Flatten a square symmetric matrix into its off-diagonal half.

    The diagonal and the redundant triangle are discarded by *position*, not
    by value, so an ``n x n`` input always yields ``n * (n - 1) / 2`` entries.
    Genuine NaN entries and off-diagonal values equal to 1 are therefore kept
    in place, which keeps feature columns aligned when the vectors of several
    matrices are stacked.

    Parameters
    ----------
    df : pandas.DataFrame or array-like of shape (n, n)
        Symmetric matrix, e.g. the result of ``DataFrame.corr``. The input is
        not modified.

    Returns
    -------
    numpy.ndarray of shape (1, n * (n - 1) // 2)
        Float values ``df[j, i]`` for every pair ``i < j``, ordered by ``i``
        and then by ``j``.

    Raises
    ------
    ValueError
        If the input is not a square 2-D matrix or cannot be cast to float.
    """
    matrix = np.asarray(df, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError(
            'remove_triangle expects a square 2-D matrix, got shape %s'
            % (matrix.shape,)
        )

    # Index pairs (i, j) with i < j in row-major order; reading matrix[j, i]
    # walks the strict lower triangle column by column.
    first, second = np.triu_indices(matrix.shape[0], k=1)
    flat = matrix[second, first]
    return flat.reshape((1, len(flat)))

# Ingestion

In [3]:
# Root folder with one sub-folder per dataset (institution), and the folder
# that receives the aggregated CSV files.
path = r'/Users/rodrigo/Post-Grad/CC400/ADHD200_CC400_TCs_filtfix'
output_dir = r'/Users/rodrigo/Post-Grad/CC400'

files = os.listdir(path)
files.sort()

# Per-dataset / per-run frames are collected here and concatenated once at
# the end; calling pd.concat inside the loops re-copies all previous rows on
# every iteration.
phenotypic_frames = []
corr_frames = []

# Dataset LOOP
# NOTE(review): the positional slice is kept from the original code. It
# presumably skips a leading hidden file and one trailing non-dataset entry;
# confirm against the actual folder listing.
for dataset in files[1:-1]:
    print(dataset)
    dataset_path = os.path.join(path, dataset)
    start = time.time()

    # Subject LOOP
    # Hidden entries (e.g. '.DS_Store') are skipped by name rather than by
    # dropping the first sorted entry, which discards a real subject whenever
    # the folder contains no hidden file.
    for subject in sorted(os.listdir(dataset_path)):
        if subject.startswith('.'):
            continue
        print(subject)
        subject_path = os.path.join(dataset_path, subject)

        # Saving phenotypic data
        if subject.endswith('csv'):
            if subject.endswith('phenotypic.csv'):
                institution = subject[:-15]  # strip the '_phenotypic.csv' suffix
                df_aux = pd.read_csv(subject_path)
                df_aux[institution] = institution
                phenotypic_frames.append(
                    df_aux.set_index([institution, 'ScanDir ID'])
                )
            # Other CSV files in the dataset folder are not used.
            continue

        # Everything else is expected to be a subject folder holding run files.
        if not os.path.isdir(subject_path):
            continue

        # Run LOOP (sorted so the row order does not depend on the filesystem)
        for run in sorted(os.listdir(subject_path)):
            if run.startswith('.'):
                continue
            teste = pd.read_csv(os.path.join(subject_path, run), sep='\t')
            if teste.shape[0] == 0:
                continue
            # Correlate numeric columns only: pandas >= 2.0 raises on text
            # columns instead of silently dropping them as older versions did.
            corr = teste.select_dtypes(include='number').corr(method='pearson')
            corr_frames.append(
                pd.DataFrame(
                    remove_triangle(corr),
                    index=pd.MultiIndex.from_tuples(
                        [(dataset, subject, run)],
                        names=['Institution', 'Subject', 'Run'],
                    ),
                )
            )

    end = time.time()
    print((end - start)/60)

if phenotypic_frames:
    # Reversed so the last dataset comes first, matching the order obtained
    # when each table was prepended to the accumulated frame.
    phenotypic = pd.concat(phenotypic_frames[::-1], ignore_index=False)
    phenotypic.index.names = ['Institution', 'Subject']
else:
    phenotypic = pd.DataFrame()

# One row per (Institution, Subject, Run); columns are the flattened
# correlation values returned by remove_triangle.
df = pd.concat(corr_frames) if corr_frames else pd.DataFrame()

phenotypic.to_csv(os.path.join(output_dir, 'phenotypic.csv'), header=True)
df.to_csv(os.path.join(output_dir, 'corr_matrices.csv'), header=True)

KKI
1018959
1019436
1043241
1266183
1535233
1541812
1577042
1594156
1623716
1638334
1652369
1686265
1692275
1735881
1779922
1842819
1846346
1873761
1962503
1988015
1996183
2014113
2018106
2026113
2081148
2104012
2138826
2299519
2344857
2360428
2371032
2554127
2558999
2572285
2601925
2618929
2621228
2640795
2641332
2703289
2740232
2768273
2822304
2903997
2917777
2930625
3103809
3119327
3154996
3160561
3170319
3310328
3434578
3486975
3519022
3611827
3699991
3713230
3813783
3884955
3902469
3912996
3917422
3972472
3972956
4104523
4154182
4275075
4362730
4601682
5216908
6346605
6453038
7129258
7415617
7774305
8083695
8263351
8337695
8432725
8628223
8658218
9922944
KKI_motion.csv
KKI_phenotypic.csv
0.11487748225529988
NYU
0010002
0010003
0010004
0010005
0010006
0010007
0010008
0010009
0010010
0010011
0010012
0010013
0010014
0010015
0010016
0010017
0010018
0010019
0010020
0010021
0010022
0010023
0010024
0010025
0010026
0010027
0010028
0010029
0010030
0010031
0010032
0010033
0010034
0010035
00

## Validation 

In [None]:
# Display the combined phenotypic table for a visual sanity check.
phenotypic

In [None]:
# Display the correlation-feature table for a visual sanity check.
# Optional clean-up, currently disabled: drop every column that contains a NaN.
#df = df.dropna(axis=1)
df