# Duplicate T1w Images

Here we come up with a strategy for removing duplicate T1w images.

In [1]:
from bids import BIDSLayout

layout = BIDSLayout('/cbica/projects/RBC/HRC/working/BIDS', validate=False)



In [2]:
layout

BIDS Layout: .../projects/RBC/HRC/working/BIDS | Subjects: 608 | Sessions: 905 | Runs: 751

Let's figure out how to track T1s.

In [3]:
df = layout.to_df()

In [7]:
df.head()

entity,path,datatype,extension,reconstruction,run,session,subject,suffix,task
0,/cbica/projects/RBC/HRC/working/BIDS/dataset_d...,,json,,,,,description,
1,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,anat,json,refaced,1.0,1.0,10001.0,T1w,
2,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,anat,nii.gz,refaced,1.0,1.0,10001.0,T1w,
3,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,dwi,bval,,1.0,1.0,10001.0,dwi,
4,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,dwi,bvec,,1.0,1.0,10001.0,dwi,


In [8]:
# Restrict the layout table to the T1w NIfTI images, keeping only the columns
# needed to identify each file (the now-constant extension column is dropped).
t1w_columns = ['path', 'extension', 'suffix', 'run', 'subject', 'session']
t1w_niftis = df[t1w_columns].query('suffix == "T1w" & extension == "nii.gz"')
df_filtered = t1w_niftis.drop('extension', axis=1)

# Within every subject/session pair, order the images from highest to lowest run.
grp = df_filtered.groupby(['subject', 'session'])
grp_sorted = grp.apply(lambda session_rows: session_rows.sort_values(["run"], ascending = False))

If we arrange these by run:

In [10]:
grp_sorted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,entity,path,suffix,run,subject,session
subject,session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10001,1,2,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,T1w,1,10001,1
10001,2,10,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,T1w,1,10001,2
10612,1,18,/cbica/projects/RBC/HRC/working/BIDS/sub-10612...,T1w,1,10612,1
10615,1,26,/cbica/projects/RBC/HRC/working/BIDS/sub-10615...,T1w,1,10615,1
10615,2,34,/cbica/projects/RBC/HRC/working/BIDS/sub-10615...,T1w,1,10615,2


Now, we can check which subjects have more than one run of T1w:

In [11]:
import qgrid

qgrid_widget = qgrid.show_grid(df_filtered, show_toolbar=True)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [12]:
# Number the T1w images within each subject/session pair (1-based) so that
# sessions holding more than one image are easy to spot in the grid.
within_session_count = df_filtered.groupby(['subject', 'session']).cumcount() + 1
df2 = df_filtered.assign(cs=within_session_count)

qgrid_widget = qgrid.show_grid(df2, show_toolbar=True)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [13]:
# Keep exactly one T1w per subject/session: the one with the highest run number.
# Sorting by run first makes the choice independent of the row order returned by
# the layout (in path order, run-10 would come before run-2). The stable sort
# preserves the existing order among ties, rows without a run number are placed
# first so that a numbered run is preferred over them, and the final sort_index
# restores the original row order of the table.
keeps = (
    df2.sort_values('run', kind='mergesort', na_position='first')
    .groupby(['subject', 'session'])
    .tail(1)
    .sort_index()
)

In [14]:
qgrid_widget = qgrid.show_grid(keeps, show_toolbar=True)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

Let's write this out to a file, just in case:

In [15]:
keeps.to_csv('t1s_to_keep.csv', index=False)

And now we get the list of T1s that we don't want, i.e. every T1 that is not in the list of keepers:

In [35]:
# Every T1w that was not selected as a keeper is slated for removal.
is_kept = df2.index.isin(keeps.index)
removes = df2.loc[~is_kept]

In [36]:
removes.shape

(116, 6)

In [38]:
qgrid_widget = qgrid.show_grid(removes, show_toolbar=True)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

Now we need to gather these files and their jsons:

In [41]:
niftis_to_remove = list(removes['path'].values)

In [43]:
niftis_to_remove[:10]

['/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-2_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10620/ses-1/anat/sub-10620_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10634/ses-1/anat/sub-10634_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10646/ses-1/anat/sub-10646_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10648/ses-1/anat/sub-10648_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10648/ses-1/anat/sub-10648_ses-1_rec-refaced_run-2_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10650/ses-1/anat/sub-10650_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/HRC/working/BID

In [45]:
# Build the matching JSON sidecar path for each image by replacing
# '.nii.gz' with '.json' in its path.
jsons_to_remove = [nifti_path.replace('.nii.gz', '.json') for nifti_path in niftis_to_remove]
jsons_to_remove[:10]

['/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-2_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10620/ses-1/anat/sub-10620_ses-1_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10634/ses-1/anat/sub-10634_ses-1_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10646/ses-1/anat/sub-10646_ses-1_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10648/ses-1/anat/sub-10648_ses-1_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10648/ses-1/anat/sub-10648_ses-1_rec-refaced_run-2_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10650/ses-1/anat/sub-10650_ses-1_rec-refaced_run-1_T1w.json',
 '/cbica/projects/RBC/HRC/working/BIDS/sub-10654/ses-1/

In [51]:
files_to_remove = list(zip(niftis_to_remove, jsons_to_remove))
files_to_remove[:5]

[('/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-1_T1w.nii.gz',
  '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-1_T1w.json'),
 ('/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-2_T1w.nii.gz',
  '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-2_T1w.json'),
 ('/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-1_T1w.nii.gz',
  '/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-1_T1w.json'),
 ('/cbica/projects/RBC/HRC/working/BIDS/sub-10620/ses-1/anat/sub-10620_ses-1_rec-refaced_run-1_T1w.nii.gz',
  '/cbica/projects/RBC/HRC/working/BIDS/sub-10620/ses-1/anat/sub-10620_ses-1_rec-refaced_run-1_T1w.json'),
 ('/cbica/projects/RBC/HRC/working/BIDS/sub-10634/ses-1/anat/sub-10634_ses-1_rec-refaced_run-1_T1w.nii.gz',
  '/cbica/projects/RBC/HRC/worki

In [71]:
len(files_to_remove)

116

Now we remove:

In [55]:
import os

In [57]:
# Delete each image together with its JSON sidecar, but only when both files
# are present; otherwise report the pair and leave it untouched.
for acq in files_to_remove:
    nifti_path, json_path = acq
    both_present = os.path.exists(nifti_path) and os.path.exists(json_path)

    if not both_present:
        print("Could not find files!")
        print(acq)
        continue

    os.remove(nifti_path)
    os.remove(json_path)

Now let's update the BIDS data:

In [58]:
layout2 = BIDSLayout('/cbica/projects/RBC/HRC/working/BIDS', validate=False)

df_updated = layout2.to_df()



In [69]:
df_updated.query('subject == "10618" & suffix == "T1w"')

entity,path,datatype,extension,reconstruction,run,session,subject,suffix,task
49,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,json,refaced,3,1,10618,T1w,
50,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,nii.gz,refaced,3,1,10618,T1w,
57,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,json,refaced,2,2,10618,T1w,
58,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,nii.gz,refaced,2,2,10618,T1w,


We can see that this subject has two sessions, each with only one T1w.

Last, we rename the T1s so that they're named `run-1`.

In [73]:
to_rename = df_updated.query('suffix == "T1w" & run > 1')
to_rename.head()

entity,path,datatype,extension,reconstruction,run,session,subject,suffix,task
49,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,json,refaced,3,1,10618,T1w,
50,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,nii.gz,refaced,3,1,10618,T1w,
57,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,json,refaced,2,2,10618,T1w,
58,/cbica/projects/RBC/HRC/working/BIDS/sub-10618...,anat,nii.gz,refaced,2,2,10618,T1w,
65,/cbica/projects/RBC/HRC/working/BIDS/sub-10620...,anat,json,refaced,2,1,10620,T1w,


In [75]:
to_rename.sample()

entity,path,datatype,extension,reconstruction,run,session,subject,suffix,task
261,/cbica/projects/RBC/HRC/working/BIDS/sub-10650...,anat,json,refaced,2,1,10650,T1w,


Now we just rename each of these.

In [92]:
def rename(row, test=True):
    """Rename a file so that its BIDS ``run`` entity becomes ``run-1``.

    Parameters
    ----------
    row : mapping
        Must provide ``row['path']`` (the full path of the file) and
        ``row['run']`` (the run number currently encoded in the file name).
    test : bool, default True
        If True, only print the planned rename. If False, rename the file
        on disk.

    Raises
    ------
    FileExistsError
        If ``test`` is False and the ``run-1`` target already exists, so an
        existing file is never silently overwritten.
    """
    path = row['path']
    run = row['run']

    # The table may hold the run as a float (e.g. 2.0) while the file name
    # spells it as an integer; normalise so the label actually matches.
    if isinstance(run, float) and run.is_integer():
        run = int(run)
    old_label = 'run-' + str(run)

    # Only the file name is edited, never the directories above it. Entities
    # are separated by underscores, so compare whole entities: a plain
    # substring replace would also turn 'run-21' into 'run-11' for run 2.
    directory, filename = os.path.split(path)
    stem, dot, extension = filename.partition('.')
    entities = ['run-1' if entity == old_label else entity
                for entity in stem.split('_')]
    new_name = os.path.join(directory, '_'.join(entities) + dot + extension)

    if test:
        print("renaming:{} --> {}".format(path, new_name))
        return

    if new_name == path:
        # The run label was not found in the file name; nothing to rename.
        return

    if os.path.exists(new_name):
        # os.rename would replace the existing file on POSIX systems.
        raise FileExistsError(
            "refusing to overwrite existing file: {}".format(new_name))

    os.rename(path, new_name)

In [96]:
# Dry run on the first ten rows only (test=True prints the planned renames).
# iterrows is used because we don't want to return anything;
# otherwise the apply-based equivalent would be:
# to_rename.iloc[:10,].apply(lambda row: rename(row, test=True), axis=1)

for i, row in to_rename.iloc[:10,].iterrows():
    rename(row, test=True)

renaming:/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-3_T1w.json --> /cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-1_T1w.json
renaming:/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-3_T1w.nii.gz --> /cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-1/anat/sub-10618_ses-1_rec-refaced_run-1_T1w.nii.gz
renaming:/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-2_T1w.json --> /cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-1_T1w.json
renaming:/cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-2_T1w.nii.gz --> /cbica/projects/RBC/HRC/working/BIDS/sub-10618/ses-2/anat/sub-10618_ses-2_rec-refaced_run-1_T1w.nii.gz
renaming:/cbica/projects/RBC/HRC/working/BIDS/sub-10620/ses-1/anat/sub-10620_ses-1_rec-refaced_run-2_T1w.json --> /cbica/projects/RB

In [97]:
# Call rename with test=False for every row of to_rename, i.e. perform the
# renames on disk rather than just printing them.
for i, row in to_rename.iterrows():
    rename(row, test=False)

And the final check:

In [98]:
layout3 = BIDSLayout('/cbica/projects/RBC/HRC/working/BIDS', validate=False)

df_final = layout3.to_df()



In [99]:
# all T1s
df_final.query('suffix == "T1w"')

entity,path,datatype,extension,reconstruction,run,session,subject,suffix,task
1,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,anat,json,refaced,1,1,10001,T1w,
2,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,anat,nii.gz,refaced,1,1,10001,T1w,
9,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,anat,json,refaced,1,2,10001,T1w,
10,/cbica/projects/RBC/HRC/working/BIDS/sub-10001...,anat,nii.gz,refaced,1,2,10001,T1w,
17,/cbica/projects/RBC/HRC/working/BIDS/sub-10612...,anat,json,refaced,1,1,10612,T1w,
...,...,...,...,...,...,...,...,...,...
6819,/cbica/projects/RBC/HRC/working/BIDS/sub-21858...,anat,nii.gz,refaced,1,2,21858,T1w,
6826,/cbica/projects/RBC/HRC/working/BIDS/sub-21913...,anat,json,refaced,1,2,21913,T1w,
6827,/cbica/projects/RBC/HRC/working/BIDS/sub-21913...,anat,nii.gz,refaced,1,2,21913,T1w,
6834,/cbica/projects/RBC/HRC/working/BIDS/sub-21927...,anat,json,refaced,1,2,21927,T1w,


In [100]:
# this should be empty
df_final.query('suffix == "T1w" & run > 1')

entity,path,datatype,extension,reconstruction,run,session,subject,suffix,task
