# Notebook to organize and filter validation.tsv files during CuBIDS curation

In [7]:
import pandas as pd
import os

# Directory that holds the validation TSV read below; the summary table built
# by this cell is written back into the same directory.
validate_output_path = '~/Documents/GIT/grmpy_opendata/curation/04_cubids_curation'
validate_output = pd.read_csv(os.path.join(validate_output_path, 'v2_validation.tsv'), sep='\t')

# Keep only rows whose 'location' contains 'nii' (presumably NIfTI image
# paths -- confirm against the validator output). Rows with a missing
# location never match because of na=False.
filtered_df = validate_output[validate_output['location'].str.contains('nii', na=False)]


def _sub_codes_for(df, token):
    """Return the sorted unique 'subCode' strings of rows whose 'location' contains *token*.

    *token* is handed to ``Series.str.contains`` unchanged, so it is treated as
    a regular expression (the pandas default). Rows with a missing 'location'
    never match, and missing 'subCode' values are dropped before the remaining
    values are converted to ``str``.
    """
    matches = df['location'].str.contains(token, na=False)
    return sorted(df.loc[matches, 'subCode'].dropna().astype(str).unique())


# Substrings looked up in 'location'; each one yields a '<token>_missing'
# column, in this order.
location_tokens = ('dwi', 'anat', 'fmap', 'func', 'perf')
missing_dict = {
    f'{token}_missing': _sub_codes_for(filtered_df, token)
    for token in location_tokens
}

# Keep the individual lists reachable under their own names as well.
dwi_missing = missing_dict['dwi_missing']
anat_missing = missing_dict['anat_missing']
fmap_missing = missing_dict['fmap_missing']
func_missing = missing_dict['func_missing']
perf_missing = missing_dict['perf_missing']

# Build a rectangular table: every column shorter than the longest one is
# padded with empty strings (not None) so the written TSV has blank cells.
max_length = max(len(lst) for lst in missing_dict.values())
missing_df = pd.DataFrame({key: lst + [""] * (max_length - len(lst)) for key, lst in missing_dict.items()})
missing_df.to_csv(os.path.join(validate_output_path, 'v2_missing_sidecar.tsv'), sep='\t', index=False)

# Last expression of the cell: displays the table in the notebook.
missing_df

Unnamed: 0,dwi_missing,anat_missing,fmap_missing,func_missing,perf_missing
0,DwellTime,DwellTime,DwellTime,AcquisitionDuration,AcquisitionVoxelSize
1,GradientSetType,EffectiveEchoSpacing,EffectiveEchoSpacing,CogAtlasID,BackgroundSuppression
2,InstitutionalDepartmentName,GradientSetType,GradientSetType,CogPOID,CASLType
3,InversionTime,InstitutionalDepartmentName,InstitutionalDepartmentName,DelayAfterTrigger,DwellTime
4,MRTransmitCoilSequence,InversionTime,InversionTime,DelayTime,EffectiveEchoSpacing
5,MTState,MRTransmitCoilSequence,MRTransmitCoilSequence,DwellTime,GradientSetType
6,MixingTime,MTState,MTState,GradientSetType,InstitutionalDepartmentName
7,NumberShots,MixingTime,MixingTime,InstitutionalDepartmentName,IntendedFor
8,ParallelAcquisitionTechnique,MultibandAccelerationFactor,MultibandAccelerationFactor,Instructions,InversionTime
9,ParallelReductionFactorInPlane,NumberShots,NumberShots,InversionTime,LabelingDistance
