In [1]:
import numpy as np
import pandas as pd
import os

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## load brain data

In [3]:
# Directory that holds the per-dataset archives of brain volumes.
indir = '/mmfs1/data/liacz/Documents/Bilingualism_CVAE/data/array_brains'
# indir = '/mmfs1/data/liacz/Documents/Bilingualism_CVAE/data/array_brains/unaligned'

# Collect the pieces of every archive in lists and concatenate once at the end.
# Concatenating onto the growing (N, 64, 64, 64) array inside the loop copied
# all previously loaded volumes again for each new file.
# The empty seed arrays stay as the first chunk so the resulting dtypes, and the
# shapes obtained when no file matches, are the same as with the incremental version.
data_chunks = [np.empty([0,64,64,64])]
subs_chunks = [np.empty([0,])]
ds_chunks = [np.empty([0,])]
for dat in os.listdir(indir):
    if dat.startswith('Anat-Bilingual-64iso'):
        arr = np.load(os.path.join(indir,dat))
        file_subs = arr['subs']
        data_chunks.append(arr['data'])
        subs_chunks.append(file_subs)
        # Dataset label: second-to-last '-'-separated token of the file name
        # (text before the first '.'), repeated once per subject in this file.
        ds_chunks.append([dat.split('.')[0].split('-')[-2]]*len(file_subs))

BRAIN_data = np.concatenate(data_chunks, axis=0)
BRAIN_subs = np.concatenate(subs_chunks, axis=0)
BRAIN_ds = np.concatenate(ds_chunks, axis=0)

In [4]:
# Shape of the stacked volume array: (number of loaded volumes, 64, 64, 64).
BRAIN_data.shape

(2499, 64, 64, 64)

## load demographic information

In [6]:
# Folder with one legend CSV per dataset, and the path of the merged legend file.
indir = os.path.expanduser('~/Documents/Bilingualism_CVAE/data/legend')
outdir = os.path.expanduser('~/Documents/Bilingualism_CVAE/data/legend.csv')  # a file path, despite the name
fns = [fn for fn in os.listdir(indir) if fn.endswith('.csv')]

# Read every CSV first, then stack them with a single pd.concat call.
# DataFrame.append is deprecated and no longer exists in pandas 2.0, and growing
# a frame row-block by row-block re-copies all earlier rows on each iteration.
frames = []
for fn in fns:
    df_temp = pd.read_csv(os.path.join(indir,fn))
    print(len(df_temp))
    frames.append(df_temp)
# pd.concat raises on an empty list; keep the empty-frame starting point in that case.
legend = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Append a dataset-specific suffix to participant_id
# (presumably so the ids match the subject names stored with the brain arrays - confirm).
for ds_name, suffix in (('HCPaging', '_V1_MR'), ('HCPdev', '_V1_MR'), ('Hernandez', '.mgz')):
    ds_rows = legend['ds'] == ds_name
    legend.loc[ds_rows, 'participant_id'] = legend.loc[ds_rows, 'participant_id'] + suffix

# Normalise the dataset label spelling 'ping' -> 'Ping'.
legend.loc[legend['ds']=='ping', 'ds'] = 'Ping'

# Composite "<dataset>_<participant>" key used to match legend rows to brain volumes.
legend['ds_par_id'] = legend['ds'] +'_'+ legend['participant_id']
legend.to_csv(outdir, index=False)

92
652
363
724
1493


  legend = legend.append(df_temp, ignore_index=True)
  legend = legend.append(df_temp, ignore_index=True)
  legend = legend.append(df_temp, ignore_index=True)
  legend = legend.append(df_temp, ignore_index=True)
  legend = legend.append(df_temp, ignore_index=True)


## align two datasets

In [7]:
# Composite "<dataset>_<subject>" key for every brain volume, built the same way
# as the legend's ds_par_id column so the two can be compared directly.
BRAIN_ds_subs = [ds_name + '_' + sub_id for ds_name, sub_id in zip(BRAIN_ds, BRAIN_subs)]
legend_subs = legend['ds_par_id']

In [8]:
# Number of legend keys, followed by a few spot checks at fixed index labels.
print(len(legend_subs))
for row_label in (3300, 1800, 900):
    print(legend_subs[row_label])

3324
Ping_sub-P0358
HCPaging_HCA9640889_V1_MR
Hernandez_WL_025.mgz


In [9]:
# Spot-check one composite "<dataset>_<subject>" key built from the brain arrays.
BRAIN_ds_subs[700]

'Ping_sub-P1693'

In [10]:
# BRAIN_ds_subs

In [11]:
# Boolean masks marking the entries that occur in BOTH sources:
#   subs_legend[i] - legend row i has a matching brain volume
#   subs_BRAIN[j]  - brain volume j has a matching legend row
# Sets give constant-time lookups; the list version scanned a whole list per
# element and rebuilt list(legend_subs) for every single brain entry.
brain_keys = set(BRAIN_ds_subs)
legend_keys = set(legend_subs)
subs_legend = [sub in brain_keys for sub in legend_subs]
subs_BRAIN = [sub in legend_keys for sub in BRAIN_ds_subs]

In [12]:
# Mask lengths: one entry per legend row, one entry per brain volume.
print(len(subs_legend), len(subs_BRAIN), sep='\n')

3324
2499


In [13]:
# Keep only the volumes (and their subject / dataset labels) that have a legend row.
# The same mask is applied to all three arrays so they stay row-aligned with each other.
keep_brain = np.asarray(subs_BRAIN, dtype=bool)
BRAIN_data = BRAIN_data[keep_brain]
BRAIN_subs = BRAIN_subs[keep_brain]
BRAIN_ds = BRAIN_ds[keep_brain]

In [14]:
# Keep only the legend rows whose ds_par_id also occurs among the brain-array keys.
# NOTE(review): this only filters by membership; the remaining rows keep the legend's
# own order, which nothing here makes match the order of BRAIN_data - confirm before
# pairing legend rows and volumes by position.
legend = legend[subs_legend]

In [15]:
# Row counts after filtering: brain volumes first, legend rows second.
print(len(BRAIN_data), len(legend), sep='\n')

2497
2497


## save data

In [16]:
# Persist the filtered arrays as .npy files under <array_brains>/arr_combine.
outdir = os.path.expanduser('~/Documents/Bilingualism_CVAE/data/array_brains')
combine_dir = os.path.join(outdir, "arr_combine")
# np.save does not create missing folders; make sure the target exists so a
# run on a fresh checkout does not fail with FileNotFoundError.
os.makedirs(combine_dir, exist_ok=True)
np.save(os.path.join(combine_dir, "BRAIN_data"), BRAIN_data)
np.save(os.path.join(combine_dir, "BRAIN_subs"), BRAIN_subs)
np.save(os.path.join(combine_dir, "BRAIN_ds"), BRAIN_ds)

In [17]:
# Number of legend rows that remain after filtering.
len(legend)

2497

In [18]:
# Write the filtered legend to legend.csv, without the index column.
legend_csv_path = os.path.expanduser('~/Documents/Bilingualism_CVAE/data/legend.csv')
legend.to_csv(legend_csv_path, index=False)

## create bilingual and monolingual data

In [19]:
#BRAIN_data = np.array(BRAIN_data)
# Quick sanity report: array shapes, number of volumes, and the value range of the data.
nsubs = len(BRAIN_data)
print([a.shape for a in (BRAIN_ds, BRAIN_subs, BRAIN_data)])
print(nsubs)
value_range = (BRAIN_data.min(), BRAIN_data.max())
print(value_range)

[(2497,), (2497,), (2497, 64, 64, 64)]
2497
(0.0, 1.0)


In [20]:
# Reload the legend from disk.
# NOTE(review): presumably this is the same legend.csv written earlier through the
# '~'-based path - confirm that both paths point to one file.
df = pd.read_csv('/mmfs1/data/liacz/Documents/Bilingualism_CVAE/data/legend.csv')

# Composite keys of the participants flagged bilingual (1.0) and monolingual (0.0).
# Sets give constant-time lookups; calling .to_list() inside the comprehension
# rebuilt the whole list and scanned it linearly for every brain volume.
bi_keys = set(df.loc[df['bilingualism'].values == 1.0, 'ds_par_id'])
mo_keys = set(df.loc[df['bilingualism'].values == 0.0, 'ds_par_id'])

BRAIN_ds_subs = [ds_name + '_' + sub_id for ds_name, sub_id in zip(BRAIN_ds, BRAIN_subs)]

# Final masks: one bool per brain volume, in the order of BRAIN_data.
BI_subs = [sub in bi_keys for sub in BRAIN_ds_subs]
MO_subs = [sub in mo_keys for sub in BRAIN_ds_subs]

In [21]:
# Group sizes: bilingual count first, monolingual count second.
for group_mask in (BI_subs, MO_subs):
    print(group_mask.count(True))

853
1644


In [22]:
# Volumes of the monolingual participants (rows of BRAIN_data where MO_subs is True).
mono_mask = np.asarray(MO_subs, dtype=bool)
TD_subs = BRAIN_data[mono_mask]
print(TD_subs.shape)

(1644, 64, 64, 64)


In [23]:
# Volumes of the bilingual participants (rows of BRAIN_data where BI_subs is True).
bi_mask = np.asarray(BI_subs, dtype=bool)
DX_subs = BRAIN_data[bi_mask]
print(DX_subs.shape)

(853, 64, 64, 64)


In [24]:
# Store the monolingual (TD_subs) and bilingual (DX_subs) volume arrays as .npy files.
outdir = os.path.expanduser('~/Documents/Bilingualism_CVAE/data/array_brains')
for rel_name, group_data in (("arr_combine/TD_subs", TD_subs), ("arr_combine/DX_subs", DX_subs)):
    np.save(os.path.join(outdir, rel_name), group_data)