In [1]:
import os
import json
import requests
from tqdm import tqdm
from collections import defaultdict, Counter
from sparc.client import SparcClient

# === Setup ===
client = SparcClient(connect=False, config_file='config.ini')

# === Step 1: Run Elasticsearch-style query to get a known ID range ===
ids = list(range(0, 1001))
id_strings = [f'"{i}"' for i in ids]
id_list_str = ", ".join(id_strings)

body = f'''
{{
  "size": 1000,
  "query": {{
    "terms": {{
      "_id": [ {id_list_str} ]
    }}
  }}
}}
'''
body_json = json.loads(body)
response = client.metadata.search_datasets(body_json)

# === Step 2: Filter and categorize by type ===
type_counter = Counter()
type_id_map = defaultdict(list)

for d in response["hits"]["hits"]:
    try:
        type_name = d["_source"]["item"]["types"][0]["name"]
    except (KeyError, IndexError, TypeError):
        type_name = "<invalid or missing type>"

    type_counter[type_name] += 1
    dataset_id = d.get("_id", "<no id>")
    type_id_map[type_name].append(dataset_id)

# === Step 3: Show summary ===
print("Dataset type counts:")
for t, count in type_counter.items():
    print(f"{t}: {count}")

print("\nProcessing only type = 'dataset'")
dataset_ids = type_id_map["dataset"]
print(f"Found {len(dataset_ids)} datasets.\n")

# === Step 4: Process only datasets and collect extensions ===
dataset_paths = []
non_dataset_count = 0
error_count = 0
extension_counter = Counter()

for dataset_id in tqdm(dataset_ids, desc="Processing datasets"):
    # === Fetch metadata ===
    metadata_url = f"https://api.pennsieve.io/discover/datasets/{dataset_id}/versions/1/metadata"
    try:
        meta_response = requests.get(metadata_url)
        meta_response.raise_for_status()
        metadata = meta_response.json()
    except Exception as e:
        tqdm.write(f"[ERROR] Metadata fetch failed for dataset {dataset_id}: {e}")
        error_count += 1
        continue

    # === Extract 'files/primary/' paths and extensions ===
    primary_paths = [
        f.get("path") for f in metadata.get("files", [])
        if f.get("path", "").startswith("files/primary/")
    ]

    if primary_paths:
        dataset_paths.append((dataset_id, primary_paths))
        for path in primary_paths:
            _, ext = os.path.splitext(path)
            ext = ext.lower() if ext else "<no extension>"
            extension_counter[ext] += 1

        tqdm.write(f"  Primary file paths for dataset {dataset_id}: {primary_paths}")
    else:
        tqdm.write(f"  No files under 'files/primary/' for dataset {dataset_id}.")
        non_dataset_count += 1

# === Final Summary ===
tqdm.write("\n=== Summary ===")
tqdm.write(f"Total datasets queried: {len(dataset_ids)}")
tqdm.write(f"Metadata errors: {error_count}")
tqdm.write(f"Datasets without primary files: {non_dataset_count}")
tqdm.write(f"Datasets with primary files: {len(dataset_paths)}")

  __import__("pkg_resources").declare_namespace(__name__)


Dataset type counts:
dataset: 240
scaffold: 42
computational model: 47
<invalid or missing type>: 7
device: 1

Processing only type = 'dataset'
Found 240 datasets.



Processing datasets:   0%|          | 1/240 [00:01<05:30,  1.38s/it]

  Primary file paths for dataset 120: ['files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0001.2020.02.13.10.29.10.109375.7722025.IMA', 'files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0002.2020.02.13.10.29.10.109375.7722041.IMA', 'files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0003.2020.02.13.10.29.10.109375.7722057.IMA', 'files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0004.2020.02.13.10.29.10.109375.7722073.IMA', 'files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0005.2020.02.13.10.29.10.109375.7722089.IMA', 'files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0006.2020.02.13.10.29.10.109375.7722105.IMA', 'files/primary/sub-cat9/sam-1_cat9/DICOM 3D CT pre surgical bladder/DAMASER.XA._.0001.0007.2020.02.13.10.29.10.109375.7722121.IMA', 'files/primary/sub-cat9/sam-1_cat9/DI

Processing datasets:   1%|          | 2/240 [00:02<04:06,  1.04s/it]

  Primary file paths for dataset 121: ['files/primary/sub-11136/sam-11136_20180913_151209/11136_20180913_151209-manifest.csv', 'files/primary/sub-11136/sam-11136_20180913_151209/11136_20180913_151209.tar.bz2', 'files/primary/sub-11150/sam-11150_20180921_152855/11150_20180921_152855-manifest.csv', 'files/primary/sub-11150/sam-11150_20180921_152855/11150_20180921_152855.tar.bz2', 'files/primary/sub-11151/sam-11151_20180921_145143/11151_20180921_145143-manifest.csv', 'files/primary/sub-11151/sam-11151_20180921_145143/11151_20180921_145143.tar.bz2', 'files/primary/sub-11176/sam-11176_20190205_154731/11176_20190205_154731-manifest.csv', 'files/primary/sub-11176/sam-11176_20190205_154731/11176_20190205_154731.tar.bz2', 'files/primary/sub-11178/sam-11178_20181012_144516/11178_20181012_144516-manifest.csv', 'files/primary/sub-11178/sam-11178_20181012_144516/11178_20181012_144516.tar.bz2', 'files/primary/sub-11196/sam-11196_20181025_152914/11196_20181025_152914-manifest.csv', 'files/primary/sub

Processing datasets:   1%|▏         | 3/240 [00:02<03:24,  1.16it/s]

  No files under 'files/primary/' for dataset 115.


Processing datasets:   2%|▏         | 4/240 [00:03<03:07,  1.26it/s]

  Primary file paths for dataset 124: ['files/primary/compression/sub-20180809_G5/sam-20180809_G5/ISP_20180809_G5.xlsx', 'files/primary/compression/sub-20180813_G5/sam-20180813_G5/ISP_20180813_G5.xlsx', 'files/primary/compression/sub-20180821_G5/sam-20180821_G5/ISP_20180821_G5.xlsx', 'files/primary/compression/sub-20180827_G3/sam-20180827_G3/ISP_20180827_G3.xlsx', 'files/primary/compression/sub-20180828_G2/sam-20180828_G2/ISP_20180828_G2.xlsx', 'files/primary/compression/sub-20180905_G1/sam-20180905_G1/ISP_20180905_G1.xlsx', 'files/primary/compression/sub-20180906_G3/sam-20180906_G3/ISP_20180906_G3.xlsx', 'files/primary/compression/sub-20180911_G5/sam-20180911_G5/ISP_20180911_G5.xlsx', 'files/primary/compression/sub-20180912_G5/sam-20180912_G5/ISP_20180912_G5.xlsx', 'files/primary/compression/sub-20181009_G5/sam-20181009_G5/ISP_20181009_G5.xlsx', 'files/primary/compression/sub-20191010_G1/sam-20191010_G1/ISP_20181010_G1.xlsx', 'files/primary/compression/sub-20181018_G1/sam-20181018_G1/

Processing datasets:   2%|▏         | 5/240 [00:04<03:05,  1.27it/s]

  Primary file paths for dataset 109: ['files/primary/pop-B6/sam-B6_Tibia/BL6_Section10_85PercentSite_DistalMetaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section11_90PercentSite_DistalMetaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section1_3PercentSite_ProximalEpiphysis.tif', 'files/primary/pop-B6/sam-B6_Femur/BL6_Section2_12PercentSite_DistalMetaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section2_5PercentSite_ProximalMetaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section3_10PercentSite_ProximalMetaphysis.tif', 'files/primary/pop-B6/sam-B6_Femur/BL6_Section3_45PercentSite_MidDiaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section4_20PercentSite_ProximalDiaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section5_30PercentSite_MidDiaphysis.tif', 'files/primary/pop-B6/sam-B6_Femur/BL6_Section5_80PercentSite_ProximalDiaphysis.tif', 'files/primary/pop-B6/sam-B6_Tibia/BL6_Section6_40PercentSite_MidDiaphysis.tif', 'files/primary/pop-B6/sam-B6_Ti

Processing datasets:   2%|▎         | 6/240 [00:05<03:03,  1.28it/s]

  Primary file paths for dataset 114: ['files/primary/sub-pig2/DamaserSPARC_Ian_PressCalib2-11_6_2018-6_05_06 AM.tsv', 'files/primary/sub-pig2/DamaserSPARC_Laparotomy_pressure_trial-11_19_2018-11_01_12 AM.tsv', 'files/primary/sub-pig2/DamaserSPARC_Laparotomy_pressure_trial-StoolMovement-DIFF-C-V-11_19_2018-11_40_35 AM.tsv', 'files/primary/sub-pig2/DamaserSPARC_Laparotomy_pressure_trial-abdomenClosed-11_19_2018-11_19_11 AM.tsv', 'files/primary/sub-pig2/DamaserSPARC_Laparotomy_pressure_trial-empty-out-stool-H-C-Stool-11_19_2018-12_21_37 PM.tsv', 'files/primary/sub-pig2/DamaserSPARC_Laparotomy_pressure_trial-empty-out-stool2-11_19_2018-12_08_56 PM.tsv', 'files/primary/sub-pig1/DamaserSPARC_PigStudy_11_05_18_AbdomenPush_trial-11_5_2018-10_29_30 AM.tsv', 'files/primary/sub-pig1/DamaserSPARC_PigStudy_11_05_18__100_trial7_SecondArtificialStool-11_5_2018-12_59_45 PM.tsv', 'files/primary/sub-pig1/DamaserSPARC_PigStudy_11_05_18__1055_trial2_StoolCalibration-11_5_2018-11_04_28 AM.tsv', 'files/pri

Processing datasets:   3%|▎         | 7/240 [00:05<03:04,  1.27it/s]

  No files under 'files/primary/' for dataset 12.


Processing datasets:   3%|▎         | 8/240 [00:06<03:07,  1.24it/s]

  Primary file paths for dataset 107: ['files/primary/sub-10865/sam-10865_20180510_151333/10865_20180510_151333-manifest.csv', 'files/primary/sub-10865/sam-10865_20180510_151333/10865_20180510_151333.tar.bz2', 'files/primary/sub-10872/sam-10872_20180517_144120/10872_20180517_144120-manifest.csv', 'files/primary/sub-10872/sam-10872_20180517_144120/10872_20180517_144120.tar.bz2', 'files/primary/sub-10872/sam-10872_20180517_153335/10872_20180517_153335-manifest.csv', 'files/primary/sub-10872/sam-10872_20180517_153335/10872_20180517_153335.tar.bz2', 'files/primary/sub-10873/sam-10873_20180517_151337/10873_20180517_151337-manifest.csv', 'files/primary/sub-10873/sam-10873_20180517_151337/10873_20180517_151337.tar.bz2', 'files/primary/sub-10893/sam-10893_20180601_135703/10893_20180601_135703-manifest.csv', 'files/primary/sub-10893/sam-10893_20180601_135703/10893_20180601_135703.tar.bz2', 'files/primary/sub-10908/sam-10908_20180607_141304/10908_20180607_141304-manifest.csv', 'files/primary/sub

Processing datasets:   4%|▍         | 9/240 [00:07<03:06,  1.24it/s]

  Primary file paths for dataset 158: ['files/primary/sub-20191031M1/sam-nNOS_Hu_M1/2. 20191031M1_dC_SMP_nNOS_Hu_20x.lsm', 'files/primary/sub-20191031M1/sam-PG9.5_M1/20171107M1_dC_PGP9.5_10x.lsm', 'files/primary/sub-20191031M1/sam-PG9.5_M1/20171107M1_pC_PGP9.5_1_10x.lsm', 'files/primary/sub-20191031M2/sam-NF_M2/20171107M2_dC_NF_10x.lsm', 'files/primary/sub-20191031M2/sam-NF_M2/20171107M2_pC_NF_1_10x.lsm', 'files/primary/sub-20191031M2/sam-nNOS_NF_M2/20171107M2_pC_nNos_NF_10x.lsm', 'files/primary/sub-20191031M1/sam-CGRP_M1/20191031M1_dC_MP_CGRP_5_10x.lsm', 'files/primary/sub-20191031M1/sam-CGRP_M1/20191031M1_dC_MP_CGRP_6_10x.lsm', 'files/primary/sub-20191031M1/sam-VIP_M1/20191031M1_dC_SMP_VIP_3_10x.lsm', 'files/primary/sub-20191031M1/sam-nNOS_Hu_M1/20191031M1_dC_SMP_nNOS_Hu_20x.lsm', 'files/primary/sub-20191031M1/sam-nNOS_Hu_M1/20191031M1_nNOS_Hu_dC_MP_1_20x.lsm', 'files/primary/sub-20191031M1/sam-nNOS_Hu_M1/20191031M1_nNOS_Hu_dC_MP_2_20x.lsm', 'files/primary/sub-20191031M1/sam-nNOS_Hu_

Processing datasets:   4%|▍         | 10/240 [00:08<02:58,  1.29it/s]

  Primary file paths for dataset 159: ['files/primary/sub-4/sam-2017_05_08_0001a/2017_05_08_0001a.cxd', 'files/primary/sub-4/sam-2017_05_08_0001b/2017_05_08_0001b.cxd', 'files/primary/sub-4/sam-2017_05_08_0004/2017_05_08_0004.cxd', 'files/primary/sub-5/sam-2017_05_12_0003/2017_05_12_0003.cxd', 'files/primary/sub-5/sam-2017_05_12_0004/2017_05_12_0004.cxd', 'files/primary/sub-6/sam-2017_05_15_0001b/2017_05_15_0001b.cxd', 'files/primary/sub-6/sam-2017_05_15_0002/2017_05_15_0002.cxd', 'files/primary/sub-6/sam-2017_05_15_0003/2017_05_15_0003.cxd', 'files/primary/sub-6/sam-2017_05_15_0004/2017_05_15_0004.cxd', 'files/primary/sub-6/sam-2017_05_15_0005/2017_05_15_0005.cxd', 'files/primary/sub-6/sam-2017_05_15_0008/2017_05_15_0008.cxd', 'files/primary/sub-10/sam-2017_05_19_0003/2017_05_19_0003.cxd', 'files/primary/sub-7/sam-2017_05_19_0006/2017_05_19_0006.cxd', 'files/primary/sub-11/sam-2017_05_22_0001/2017_05_22_0001.cxd', 'files/primary/sub-11/sam-2017_05_22_0003/2017_05_22_0003.cxd', 'files/

Processing datasets:   5%|▍         | 11/240 [00:09<03:39,  1.05it/s]

  Primary file paths for dataset 163: ['files/primary/sub-pnpig190129/pnpig012919-190129-174519/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-173521/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-174225/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-174631/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-174337/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-151044/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-151511/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-151718/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-151310/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-174053/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-150513/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-150903/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-150345/IZp2.csv', 'files/primary/sub-pnpig190129/pnpig012919-190129-145340/IZp2.csv', 'files/pr

Processing datasets:   5%|▌         | 12/240 [00:10<03:18,  1.15it/s]

  Primary file paths for dataset 164: ['files/primary/sub-IF-25/IF25_2019_05_06_data.smrx', 'files/primary/sub-IF-27/IF27_2019_05_22_data.smrx', 'files/primary/sub-IF-29/IF29_2019_05_29_data.smrx', 'files/primary/sub-IF-30/IF30_2019_07_12_data.smrx', 'files/primary/sub-IF-31/IF31_2019_09_23_data.smrx', 'files/primary/sub-IF-32/IF32_2019_09_30_data.smrx', 'files/primary/sub-IF-34/IF34_2019_10_7_data.smrx', 'files/primary/sub-IM-29/IM29_2019_07_31_Data.smrx', 'files/primary/sub-IM-30/IM30_2019_08_02_Data.smrx', 'files/primary/sub-IM-31/IM31_2019_08_05_Data.smrx', 'files/primary/sub-IM-32/IM32_2019_08_09_SpikeData.smrx', 'files/primary/sub-IM-33/IM33_2019_8_12_data.smrx', 'files/primary/sub-IM-34/IM34_2019_08_14_data.smrx', 'files/primary/sub-IM-35/IM35_2019_08_19_data-second-part.smrx', 'files/primary/sub-IM-35/IM35_2019_08_19_data.smrx', 'files/primary/sub-IM-36/IM36_2019_8_23_data.smrx', 'files/primary/sub-IM-37/IM37_2019_10_09_data.smrx', 'files/primary/sub-IM-38/IM38_2019_16_10_data.

Processing datasets:   5%|▌         | 13/240 [00:10<03:02,  1.24it/s]

  Primary file paths for dataset 160: ['files/primary/sub-20190807P_1ACT/sam-20190807P_1_pC_ActD/20190807P_pC-with-ActD_S1_R1_001.fastq.gz', 'files/primary/sub-20190807P_1ACT/sam-20190807P_1_pC_ActD/20190807P_pC-without-ActD_S2_R1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L001_I1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L001_R1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L001_R2_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L002_I1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L002_R1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L002_R2_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L003_I1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-20190807P_1_pC/20190807P_pC_S1_L003_R1_001.fastq.gz', 'files/primary/sub-20190807P_1/sam-2019080

Processing datasets:   6%|▌         | 14/240 [00:11<02:51,  1.32it/s]

  Primary file paths for dataset 169: ['files/primary/sub-C2_104/perf-C2-104-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C2_102/perf-C2-102-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1_101/perf-C1-101-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1_103/perf-C1-103-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C2_103/perf-C2-103-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1_104/perf-C1-104-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1_102/perf-C1-102-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C2_101/perf-C2-101-prestimulation-1/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C2_104/perf-C2-104-stimulation-1/MRI_timeinfo_stimulation_1.mat', 'files/primary/sub-C2_102/perf-C2-102-stimulation-1/MRI_timeinfo_stimulation_1.mat', 'files/primary/sub-C1_101/perf-C1-101-stimulation-1/MRI_timeinf

Processing datasets:   6%|▋         | 15/240 [00:12<02:46,  1.35it/s]

  No files under 'files/primary/' for dataset 16.


Processing datasets:   7%|▋         | 16/240 [00:12<02:41,  1.39it/s]

  Primary file paths for dataset 168: ['files/primary/sub-animal-7/sam-animal-7/Combined_GFP_RFP.tif', 'files/primary/sub-animal-4/sam-animal-4/Combined_GFP_RFP.tif', 'files/primary/sub-animal-5/sam-animal-5/GFP.tif', 'files/primary/sub-animal-3/sam-animal-3/GFP.tif', 'files/primary/sub-animal-7/sam-animal-7/GFP.tif', 'files/primary/sub-animal-1/sam-animal-1/GFP.tif', 'files/primary/sub-animal-2/sam-animal-2/GFP.tif', 'files/primary/sub-animal-4/sam-animal-4/GFP.tif', 'files/primary/sub-animal-8/sam-animal-8/GFP.tif', 'files/primary/sub-animal-3/sam-animal-3/RFP.tif', 'files/primary/sub-animal-7/sam-animal-7/RFP.tif', 'files/primary/sub-animal-1/sam-animal-1/RFP.tif', 'files/primary/sub-animal-2/sam-animal-2/RFP.tif', 'files/primary/sub-animal-4/sam-animal-4/RFP.tif', 'files/primary/sub-animal-8/sam-animal-8/RFP.tif', 'files/primary/manifest.xlsx']


Processing datasets:   7%|▋         | 17/240 [00:13<02:29,  1.49it/s]

  Primary file paths for dataset 152: ['files/primary/pool-1/Body composition data.xlsx', 'files/primary/pool-1/Body weight data.xlsx', 'files/primary/pool-1/Fatty acid oxidation data.xlsx', 'files/primary/pool-1/Glucose tolerance test data.xlsx', 'files/primary/pool-1/Insulin%2C leptin%2C glucagon assay data.xlsx', 'files/primary/pool-1/Metabolic data.xlsx', 'files/primary/manifest.xlsx']


Processing datasets:   8%|▊         | 18/240 [00:14<03:12,  1.16it/s]

  Primary file paths for dataset 229: ['files/primary/sub-pnpig190122/190122_P742_Right_Vagus_Cervical_Vagus.tif', 'files/primary/sub-pnpig190122/190122_P742_Right_Vagus_Nodose_Ganglion.tif', 'files/primary/sub-pnpig190123/190123_P734_Left_Vagus_Cranial_Vagus.tif', 'files/primary/sub-pnpig190123/190123_P734_Left_Vagus_Nodose_Ganglion.tif', 'files/primary/sub-pnpig190123/190123_P734_Right_Vagus_Cranial_Vagus.tif', 'files/primary/sub-pnpig190123/190123_P734_Right_Vagus_Nodose_Ganglion.tif', 'files/primary/sub-pnpig190129/190129_P753_Right_Vagus_Cervical_Vagus.tif', 'files/primary/sub-pnpig190129/190129_P753_Right_Vagus_Cervical_Vagus_2.tif', 'files/primary/sub-pnpig190220/190220_P786_Right_Vagus_Caudal_LivaNova_Contact.tif', 'files/primary/sub-pnpig190220/190220_P786_Right_Vagus_Center_of_LivaNova_Cuff.tif', 'files/primary/sub-pnpig190220/190220_P786_Right_Vagus_Cranial_LivaNova_Contact.tif', 'files/primary/sub-pnpig190109/pnpig010919-190109-150038/IZp2.csv', 'files/primary/sub-pnpig1901

Processing datasets:   8%|▊         | 19/240 [00:15<03:00,  1.23it/s]

  Primary file paths for dataset 230: ['files/primary/sub-dorsal-1/sam-CGRP-Mouse-Dorsal-1/3D_scaffold_-_CGRP-Mice-Dorsal-1.xml', 'files/primary/sub-dorsal-2/sam-CGRP-Mouse-Dorsal-2/3D_scaffold_-_CGRP-Mice-Dorsal-2.xml', 'files/primary/sub-dorsal-3/sam-CGRP-Mouse-Dorsal-3/3D_scaffold_-_CGRP-Mice-Dorsal-3.xml', 'files/primary/sub-dorsal-4/sam-CGRP-Mouse-Dorsal-4/3D_scaffold_-_CGRP-Mice-Dorsal-4.xml', 'files/primary/sub-ventral-1/sam-CGRP-Mouse-Ventral-1/3D_scaffold_-_CGRP-Mice-Ventral-1.xml', 'files/primary/sub-ventral-2/sam-CGRP-Mouse-Ventral-2/3D_scaffold_-_CGRP-Mice-Ventral-2.xml', 'files/primary/sub-ventral-3/sam-CGRP-Mouse-Ventral-3/3D_scaffold_-_CGRP-Mice-Ventral-3.xml', 'files/primary/sub-dorsal-1/sam-CGRP-Mouse-Dorsal-1/Confocal_microscopy_montage_of_CGRP-Mouse-Dorsal-1.png', 'files/primary/sub-dorsal-2/sam-CGRP-Mouse-Dorsal-2/Confocal_microscopy_montage_of_CGRP-Mouse-Dorsal-2.png', 'files/primary/sub-dorsal-3/sam-CGRP-Mouse-Dorsal-3/Confocal_microscopy_montage_of_CGRP-Mouse-Dor

Processing datasets:   8%|▊         | 20/240 [00:16<03:06,  1.18it/s]

  No files under 'files/primary/' for dataset 22.


Processing datasets:   9%|▉         | 21/240 [00:16<02:46,  1.32it/s]

  Primary file paths for dataset 220: ['files/primary/sub-001/sam-001/20_B6_WT_S17_L008_R1_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S17_L008_R2_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S18_L008_R1_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S18_L008_R2_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S19_L008_R1_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S19_L008_R2_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S20_L008_R1_001.fastq.gz.gz', 'files/primary/sub-001/sam-001/20_B6_WT_S20_L008_R2_001.fastq.gz.gz', 'files/primary/sub-002/sam-002/B6_15_S5_L007_R1_001.fastq.gz.gz', 'files/primary/sub-002/sam-002/B6_15_S5_L007_R2_001.fastq.gz.gz', 'files/primary/sub-002/sam-002/B6_15_S6_L007_R1_001.fastq.gz.gz', 'files/primary/sub-002/sam-002/B6_15_S6_L007_R2_001.fastq.gz.gz', 'files/primary/sub-002/sam-002/B6_15_S7_L007_R1_001.fastq.gz.gz', 'files/primary/sub-002/sam-002/B6_15_S7_L007_R2_001.fastq.gz.gz', 'file

Processing datasets:   9%|▉         | 22/240 [00:18<03:03,  1.19it/s]

  Primary file paths for dataset 236: ['files/primary/sub-1/Exp25_Trial_0001.nev', 'files/primary/sub-1/Exp25_Trial_0001_(1).ns2', 'files/primary/sub-1/Exp25_Trial_0001_(2).ns5', 'files/primary/sub-1/Exp25_Trial_0002.nev', 'files/primary/sub-1/Exp25_Trial_0002_(1).ns2', 'files/primary/sub-1/Exp25_Trial_0002_(2).ns5', 'files/primary/sub-1/Exp25_Trial_0003.nev', 'files/primary/sub-1/Exp25_Trial_0003_(1).ns2', 'files/primary/sub-1/Exp25_Trial_0003_(2).ns5', 'files/primary/sub-1/Exp25_Trial_0004.nev', 'files/primary/sub-1/Exp25_Trial_0004_(1).ns2', 'files/primary/sub-1/Exp25_Trial_0004_(2).ns5', 'files/primary/sub-1/Exp25_Trial_0005.nev', 'files/primary/sub-1/Exp25_Trial_0005_(1).ns2', 'files/primary/sub-1/Exp25_Trial_0005_(2).ns5', 'files/primary/sub-1/Exp25_Trial_0006.nev', 'files/primary/sub-1/Exp25_Trial_0006_(1).ns2', 'files/primary/sub-1/Exp25_Trial_0006_(2).ns5', 'files/primary/sub-1/Exp25_Trial_0007.nev', 'files/primary/sub-1/Exp25_Trial_0007_(1).ns2', 'files/primary/sub-1/Exp25_Tr

Processing datasets:  10%|▉         | 23/240 [00:19<03:35,  1.01it/s]

  Primary file paths for dataset 237: ['files/primary/sub-P117980/sam-P117980-Histology/111411.svs', 'files/primary/sub-P117980/sam-P117980-Histology/111412.svs', 'files/primary/sub-P117980/sam-P117980-Histology/111413.svs', 'files/primary/sub-P117980/sam-P117980-Histology/111414.svs', 'files/primary/sub-P117980/sam-P117980-Histology/111415.svs', 'files/primary/sub-P162287/sam-P162287-Histology/112336.svs', 'files/primary/sub-P162287/sam-P162287-Histology/112337.svs', 'files/primary/sub-P1509/perf-P1509-Cardiac-Electrophysiology/A1A2_pacing_420_180_decr10ms.inf', 'files/primary/sub-P1509/perf-P1509-Cardiac-Electrophysiology/A1A2_pacing_420_180_decr10ms.txt', 'files/primary/sub-P1508/perf-P1508-Cardiac-Electrophysiology/A1A2_pacing_450_180_decr10ms.inf', 'files/primary/sub-P1508/perf-P1508-Cardiac-Electrophysiology/A1A2_pacing_450_180_decr10ms.txt', 'files/primary/sub-P1512/perf-P1512-Cardiac-Electrophysiology/A1A2_pacing_470_410_decr20ms_400_200_decr10ms.inf', 'files/primary/sub-P1512/

Processing datasets:  10%|█         | 24/240 [00:20<03:18,  1.09it/s]

  Primary file paths for dataset 210: ['files/primary/sub-C1-104/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1-101/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1-103/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1-102/MRI_timeinfo_prestimulation_1.mat', 'files/primary/sub-C1-104/MRI_timeinfo_stimulation_1.mat', 'files/primary/sub-C1-101/MRI_timeinfo_stimulation_1.mat', 'files/primary/sub-C1-103/MRI_timeinfo_stimulation_1.mat', 'files/primary/sub-C1-102/MRI_timeinfo_stimulation_1.mat', 'files/primary/manifest.xlsx', 'files/primary/sub-C1-104/prestimulation_1.nii.gz', 'files/primary/sub-C1-101/prestimulation_1.nii.gz', 'files/primary/sub-C1-103/prestimulation_1.nii.gz', 'files/primary/sub-C1-102/prestimulation_1.nii.gz', 'files/primary/sub-C1-101/stimulation_1.nii.gz', 'files/primary/sub-C1-104/stimulation_1.nii.gz', 'files/primary/sub-C1-103/stimulation_1.nii.gz', 'files/primary/sub-C1-102/stimulation_1.nii.gz', 'files/primary/sub-C1-104/stimulation_sequ

Processing datasets:  10%|█         | 25/240 [00:21<03:25,  1.05it/s]

  Primary file paths for dataset 211: ['files/primary/sub-20171220-NTS-5/._20171220_VNSstiminfo.rtf', 'files/primary/sub-20171120-NTS-1/20171120_VNSstiminfo.rtf', 'files/primary/sub-20171206-NTS-2/20171206_VNSstiminfo.rtf', 'files/primary/sub-20171212-NTS-3/20171212_VNSstiminfo.rtf', 'files/primary/sub-20171215-NTS-4/20171215_VNSstiminfo.rtf', 'files/primary/sub-20171220-NTS-5/20171220_VNSstiminfo.rtf', 'files/primary/sub-20171206-NTS-2/VNS_forestomach_10_ch1.mat', 'files/primary/sub-20171120-NTS-1/VNS_forestomach_10_ch1.mat', 'files/primary/sub-20171206-NTS-2/VNS_forestomach_10_ch10.mat', 'files/primary/sub-20171120-NTS-1/VNS_forestomach_10_ch10.mat', 'files/primary/sub-20171206-NTS-2/VNS_forestomach_10_ch11.mat', 'files/primary/sub-20171120-NTS-1/VNS_forestomach_10_ch11.mat', 'files/primary/sub-20171206-NTS-2/VNS_forestomach_10_ch12.mat', 'files/primary/sub-20171120-NTS-1/VNS_forestomach_10_ch12.mat', 'files/primary/sub-20171206-NTS-2/VNS_forestomach_10_ch13.mat', 'files/primary/sub-

Processing datasets:  11%|█         | 26/240 [00:21<03:11,  1.12it/s]

  Primary file paths for dataset 216: ['files/primary/sub-201711292P/sam-PGP9-5-20171129P/20171129P-pC-1.2_ISP__PGP9.5_(ab108986)_10x.tif', 'files/primary/sub-20180705P/sam-HuCD-20180705P/20180705_Pig_MP_AC_centrifugal_Calbindin.tif', 'files/primary/sub-20180801P/sam-HuCD-20180801P/20180801_Pig_4_MP_Colon_Transverse_Hu_CD.tif', 'files/primary/sub-20180808P/sam-ChAT-20180808P/20180808_Pig_7_MP_Transverse_Colon_Chat.tif', 'files/primary/sub-20180824P/sam-ChAT-20180824P/20180824_Pig_11_OSP__-_Asc_Colon_Centrip_Chat.tif', 'files/primary/sub-20190121P/sam-nNOS-20190121P/20190121_Pig_14_ISP_DC_mouse_NOS_Airyscan.tif', 'files/primary/sub-20190209P/sam-nNOS-20190209P/20190209_Pig_14_Distal_Colon_ISP_-_GFAP.tif', 'files/primary/sub-20190509P/sam-SubP-20190509P/20190509_Pig_11_Centrifugal_ISP__rat_Substance_P.tif', 'files/primary/sub-20190813P/sam-ChAT-20190813P/20190813P_ChAT_(AB144p)_pC_MP_20x_3.tif', 'files/primary/sub-20190813P/sam-hpChAT-20190813P/20190813P_hpChAT_(Bellier_H3)_pC_MP_20x_9.t

Processing datasets:  11%|█▏        | 27/240 [00:22<02:54,  1.22it/s]

  Primary file paths for dataset 224: ['files/primary/sub-ChAT-Male-Subject-1/20_1021.acq', 'files/primary/sub-ChAT-Male-Subject-2/20_1022.acq', 'files/primary/sub-ChAT-Female-Subject-1/20_1023.acq', 'files/primary/sub-ChAT-Male-Subject-3/20_1027a.acq', 'files/primary/sub-ChAT-Female-Subject-2/20_1027b.acq', 'files/primary/sub-ChAT-Female-Subject-3/20_1028.acq', 'files/primary/sub-nNOS-Male-Subject-1/20_1202a.acq', 'files/primary/sub-nNOS-Male-Subject-2/20_1202b.acq', 'files/primary/sub-nNOS-Male-Subject-3/20_1203a.acq', 'files/primary/sub-nNOS-Female-Subject-1/20_1203b.acq', 'files/primary/sub-nNOS-Female-Subject-2/20_1207.acq', 'files/primary/sub-nNOS-Female-Subject-3/20_1208.acq', 'files/primary/manifest.xlsx']


Processing datasets:  12%|█▏        | 28/240 [00:23<02:52,  1.23it/s]

  Primary file paths for dataset 233: ['files/primary/manifest.xlsx', 'files/primary/sub-16/sam-1/sub-16_sam-1_ChAT_R16-1_1-200_20xp1.tif', 'files/primary/sub-16/sam-1/sub-16_sam-1_TH_R16-1_1-250_20xp1.tif', 'files/primary/sub-16/sam-3/sub-16_sam-3_ChAT_R16-3_1-200_20xp1.tif', 'files/primary/sub-16/sam-3/sub-16_sam-3_TH_R16-3_1-250_20xp2.tif', 'files/primary/sub-17/sam-1/sub-17_sam-1_ChAT_R17-1_1-200_20xp1.tif', 'files/primary/sub-17/sam-1/sub-17_sam-1_TH_R17-1_1-250_20xp1.tif', 'files/primary/sub-17/sam-3/sub-17_sam-3_ChAT_R17-3_1-200_20xp1.tif', 'files/primary/sub-17/sam-3/sub-17_sam-3_TH_R17-3_1-250_20xp1.tif', 'files/primary/sub-18/sam-1/sub-18_sam-1_ChAT_R18-1_1-200_20xp1.tif', 'files/primary/sub-18/sam-1/sub-18_sam-1_TH_R18-1_1-250_20xp1.tif', 'files/primary/sub-19/sam-1/sub-19_sam-1_ChAT_NPcontrol_R19-1p1_20x.tif', 'files/primary/sub-19/sam-1/sub-19_sam-1_TH_NPcontrol_R19-1p1_20x.tif', 'files/primary/sub-20/sam-1/sub-20_sam-1_ChAT_R20-1_1-200_20xp1.tif', 'files/primary/sub-20/sa

Processing datasets:  12%|█▏        | 29/240 [00:23<02:36,  1.35it/s]

  Primary file paths for dataset 264: ['files/primary/sub-20171107-MTF43ChAT-Optical-mapping/sam-3/20171107_-_MTF43_ChAT-IRES-Cre-ChR2-EYFP', 'files/primary/sub-20171107-WT-Optical-mapping/sam-1/20171107_-_WT_optical_mapping', 'files/primary/sub-20171108-MTF44ChAT-Optical-mapping/sam-4/20171108_-_MTF44_ChAT-IRES-Cre-ChR2-EYFP_-_Optical_mapping', 'files/primary/sub-20171108-MTF45-Optical-mapping/sam-5/20171108_-_MTF45_ChAT-IRES-Cre-ChR2-EYFP_-_Optical', 'files/primary/sub-20171108-WT-Optical-mapping/sam-2/20171108_-_WT_-_Optical_mapping', 'files/primary/sub-20171109-MTF46-Optical-mapping/sam-6/20171109_-_MTF46_ChAT-ChR2.s2rx', 'files/primary/sub-20171109-MTF46-Optical-mapping/sam-6/20171109_-_MTF46_ChAT-ChR2.smrx', 'files/primary/sub-20171109-MTF46-Optical-mapping/sam-6/20171109_-_MTF46_ChAT-IRES-_A39', 'files/primary/manifest.xlsx']


Processing datasets:  12%|█▎        | 30/240 [00:24<02:29,  1.40it/s]

  Primary file paths for dataset 265: ['files/primary/sub-SL1040F/sam-SL1040F-CG11B-1/SL1040F_CG11B-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG2A-1/SL1335M_CG2A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG2B-1/SL1335M_CG2B-2.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG3A-1/SL1335M_CG3A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG7A-1/SL1335M_CG7A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG7B-1/SL1335M_CG7B-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG9A-1/SL1335M_CG9A-1.czi', 'files/primary/sub-SL1354M/sam-SL1354M-CG2B-1/SL1354M_CG2B-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-CG2B-1/SL1354M_CG2B-1.czi', 'files/primary/sub-SL1354M/sam-SL1354M-CG3A-1/SL1354M_CG3A-1.czi', 'files/primary/sub-SL1354M/sam-SL1354M-CG3B-1/SL1354M_CG3B-1.czi', 'files/primary/sub-SL1354M/sam-SL1354M-CG3B-1/SL1354M_CG3B-2.czi', 'files/primary/sub-SL1354M/sam-SL1354M-CG5A-1/SL1354M_CG5A-1.czi', 'files/primary/sub-SL1354M/sam-SL1354M-CG8A-1/SL1354M_CG8A-1.czi', 'files/primary/sub-SL

Processing datasets:  13%|█▎        | 31/240 [00:25<02:32,  1.37it/s]

  Primary file paths for dataset 266: ['files/primary/sub-SL1040F/sam-SL1040F-SCGL3A-1/SL1040F_SCGL3A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL-0/SL1335M_SCGL-0.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL10A-1/SL1335M_SCGL10A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL10B-1/SL1335M_SCGL10B-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL10B-1/SL1335M_SCGL10B-2.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL10B-1/SL1335M_SCGL10B-3.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL1C-1/SL1335M_SCGL1C-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL2B-1/SL1335M_SCGL2B-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL2B-1/SL1335M_SCGL2B-2.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL6A-1/SL1335M_SCGL6A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGL6A-1/SL1335M_SCGL6A-2.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGR4A-1/SL1335M_SCGR4A-1.czi', 'files/primary/sub-SL1335M/sam-SL1335M-SCGR4A-1/SL1335M_SCGR4A-2.czi', 'files/primary/sub-SL1354M/sam-SL1

Processing datasets:  13%|█▎        | 32/240 [00:26<02:36,  1.33it/s]

  Primary file paths for dataset 294: ['files/primary/manifest.xlsx', 'files/primary/sub-258/sam-hum258-corp-C/sam-hum258-corp-C_ses-hoechst405pgp9.5(488)vip555ghrelin647', 'files/primary/sub-258/sam-hum258-corp-C/sam-hum258-corp-C_ses-hoechst405pgp9.5(488)vip555ghrelin647-stitch', 'files/primary/sub-258/sam-hum258-corp-C/sam-hum258-corp-C_ses-hoechst405pgp9.5(488)vip55_RoiSet_100_Ghrelin_IR_cells', 'files/primary/sub-258/sam-hum258-corp-C/sam-hum258-corp-C_ses-hoechst405pgp9.5(488)vip55_RoiSet_All_Ghrelin_IR_cells', 'files/primary/sub-258/sam-hum258-fun-B/sam-hum258-fun-A_ses-hoechst405pgp9.5(488)vip555ghrelin647', 'files/primary/sub-258/sam-hum258-fun-B/sam-hum258-fun-A_ses-hoechst405pgp9.5(488)vip555ghrelin647-stitch', 'files/primary/sub-259/sam-hum259-corp-C/sam-hum259-corp-C_ses-hoechst405pgp9.5(488)vip555ghrelin647', 'files/primary/sub-259/sam-hum259-corp-C/sam-hum259-corp-C_ses-hoechst405pgp9.5(488)vip555ghrelin647-stitch', 'files/primary/sub-259/sam-hum259-corp-C/sam-hum259-cor

Processing datasets:  14%|█▍        | 33/240 [00:26<02:30,  1.37it/s]

  Primary file paths for dataset 295: ['files/primary/sub-10025/sam-10025/10025_P_13_1_2.xml', 'files/primary/sub-10047/sam-10047/10047_P_8_3_13.xml', 'files/primary/sub-10133/sam-10133/10133_P_12_3_6.xml', 'files/primary/sub-10163/sam-10163/10163_P_20_1_9.xml', 'files/primary/sub-10176/sam-10176/10176_P_10_1_11.xml', 'files/primary/sub-10207/sam-10207/10207_P_17_1_9.xml', 'files/primary/sub-10246/sam-10246/10246_P_13_1_6.xml', 'files/primary/sub-10284/sam-10284/10284_P_1_1_3.xml', 'files/primary/sub-10327/sam-10327/10327_P_15_1_3.xml', 'files/primary/sub-10401/sam-10401/10401_P_9_3_1.xml', 'files/primary/sub-10444/sam-10444/10444_P_9_3_13.xml', 'files/primary/sub-10476/sam-10476/10476_P_12_1_6.xml', 'files/primary/sub-10620/sam-10620/10620_P_6_1_13.xml', 'files/primary/sub-10656/sam-10656/10656_P_10_1_8.xml', 'files/primary/sub-10657/sam-10657/10657_P_8_1_6.xml', 'files/primary/sub-10768/sam-10768/10768_P_22_3_4.xml', 'files/primary/sub-10812/sam-10812/10812_AC_9_1_5.xml', 'files/prim

Processing datasets:  14%|█▍        | 34/240 [00:27<02:36,  1.32it/s]

  Primary file paths for dataset 288: ['files/primary/sub-170627/sam-1/perf-1-spontaneous-activity/170627_13', 'files/primary/sub-170627/sam-1/perf-1-spontaneous-activity/170627_21', 'files/primary/sub-170627/sam-1/perf-1-spontaneous-activity/170627_22', 'files/primary/sub-170627/sam-1/perf-1-spontaneous-activity/170627_25', 'files/primary/sub-170627/sam-1/perf-1-spontaneous-activity/170627_35', 'files/primary/sub-170629/sam-2/perf-2-spontaneous-activity/170629_1', 'files/primary/sub-170629/sam-2/perf-2-ICC/170629_11', 'files/primary/sub-170629/sam-2/perf-2-spontaneous-activity/170629_12', 'files/primary/sub-170629/sam-2/perf-2-ICC/170629_13', 'files/primary/sub-170629/sam-2/perf-2-spontaneous-activity/170629_14', 'files/primary/sub-170629/sam-2/perf-2-ICC/170629_15', 'files/primary/sub-170629/sam-2/perf-2-spontaneous-activity/170629_16', 'files/primary/sub-170629/sam-2/perf-2-ICC/170629_17', 'files/primary/sub-170629/sam-2/perf-2-spontaneous-activity/170629_18', 'files/primary/sub-170

Processing datasets:  15%|█▍        | 35/240 [00:28<02:39,  1.28it/s]

  Primary file paths for dataset 290: ['files/primary/sub-20180620H/sam-NF-20180620H/20180620H_S_ISP_NF_20X_28.tif', 'files/primary/sub-20180620H/sam-PGP9-5-20180620H/20180620H_S_ISP_PGP9.5_10X_28.tif', 'files/primary/sub-20180620H/sam-CGRP-20180620H/20180620H_S_MP_CGRP_20X_19.tif', 'files/primary/sub-20180620H/sam-ChAT-20180620H/20180620H_S_MP_ChAT_Yeboah_20X_23.tif', 'files/primary/sub-20180620H/sam-SP-20180620H/20180620H_S_MP_SP_20X_19.tif', 'files/primary/sub-20180711H/sam-TH-20180711H/20180711H_T_MP_TH_10x_26.tif', 'files/primary/sub-20180711H/sam-nNOS-20180711H/20180711H_T_MP_nNOS_sc648-10x_4.tif', 'files/primary/sub-20180711H/sam-cKit-20180711H/20180711H_T_cKit_10x_13.tif', 'files/primary/sub-20180711H/sam-ChAT-20180711H/20180711_H-T_MP_ChAT_AP144p_20x_43.tif', 'files/primary/sub-20180912H/sam-VIP-20180912H/20180912H_S_ISP_VIP_20x_17.tif', 'files/primary/sub-20180912H/sam-NOS-20180912H/20180912H_S_MP_Nos_sc5302_20x_1.tif', 'files/primary/sub-20181015H/sam-S100-20181015H/20181015

Processing datasets:  15%|█▌        | 36/240 [00:29<02:33,  1.33it/s]

  Primary file paths for dataset 273: ['files/primary/sub-11240/Experiment-Primary-Feeding-Data-11240.xlsx', 'files/primary/sub-11241/Experiment-Primary-Feeding-Data-11241.xlsx', 'files/primary/sub-11242/Experiment-Primary-Feeding-Data-11242.xlsx', 'files/primary/sub-11243/Experiment-Primary-Feeding-Data-11243.xlsx', 'files/primary/sub-11244/Experiment-Primary-Feeding-Data-11244.xlsx', 'files/primary/sub-11245/Experiment-Primary-Feeding-Data-11245.xlsx', 'files/primary/sub-11246/Experiment-Primary-Feeding-Data-11246.xlsx', 'files/primary/sub-11247/Experiment-Primary-Feeding-Data-11247.xlsx', 'files/primary/sub-11248/Experiment-Primary-Feeding-Data-11248.xlsx', 'files/primary/sub-11249/Experiment-Primary-Feeding-Data-11249.xlsx', 'files/primary/sub-11556/Experiment-Primary-Feeding-Data-11556.xlsx', 'files/primary/sub-11557/Experiment-Primary-Feeding-Data-11557.xlsx', 'files/primary/sub-11558/Experiment-Primary-Feeding-Data-11558.xlsx', 'files/primary/sub-11559/Experiment-Primary-Feeding

Processing datasets:  15%|█▌        | 37/240 [00:30<03:03,  1.11it/s]

  Primary file paths for dataset 274: ['files/primary/sub-5/sam-exp-005-02-02-18-Neuron-1/perf-exp-005-02-02-18-Neuron-1-Ephys/020218_0002.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-2/perf-exp-005-02-02-18-Neuron-2-Ephys/020218_0004.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-2/perf-exp-005-02-02-18-Neuron-2-Ephys/020218_0006.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-2/perf-exp-005-02-02-18-Neuron-2-Ephys/020218_0007.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-2/perf-exp-005-02-02-18-Neuron-2-Ephys/020218_0008.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-3/perf-exp-005-02-02-18-Neuron-3-Ephys/020218_0009.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-4/perf-exp-005-02-02-18-Neuron-4-Ephys/020218_0012.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-4/perf-exp-005-02-02-18-Neuron-4-Ephys/020218_0013.abf', 'files/primary/sub-5/sam-exp-005-02-02-18-Neuron-4/perf-exp-005-02-02-18-Neuron-4-Ephys/020218_0014.abf', 'files/

Processing datasets:  16%|█▌        | 38/240 [00:30<02:40,  1.26it/s]

  No files under 'files/primary/' for dataset 27.


Processing datasets:  16%|█▋        | 39/240 [00:32<02:54,  1.15it/s]

  Primary file paths for dataset 270: ['files/primary/sub-C1-102/perf-C1-102-volume/MRI_time_info.mat', 'files/primary/sub-C1-103/perf-C1-103-volume/MRI_time_info.mat', 'files/primary/sub-C2-103/perf-C2-103-volume/MRI_time_info.mat', 'files/primary/sub-C2-104/perf-C2-104-volume/MRI_time_info.mat', 'files/primary/sub-C1-101/perf-C1-101-volume/MRI_time_info.mat', 'files/primary/sub-C1-105/perf-C1-105-volume/MRI_time_info.mat', 'files/primary/sub-C1-104/perf-C1-104-volume/MRI_time_info.mat', 'files/primary/sub-C2-102/perf-C2-102-volume/MRI_time_info.mat', 'files/primary/sub-C2-101/perf-C2-101-volume/MRI_time_info.mat', 'files/primary/sub-C1-102/perf-C1-102-motility/MRI_time_info_T1w_1.mat', 'files/primary/sub-C1-103/perf-C1-103-motility/MRI_time_info_T1w_1.mat', 'files/primary/sub-C2-103/perf-C2-103-motility/MRI_time_info_T1w_1.mat', 'files/primary/sub-C2-104/perf-C2-104-motility/MRI_time_info_T1w_1.mat', 'files/primary/sub-C1-101/perf-C1-101-motility/MRI_time_info_T1w_1.mat', 'files/prim

Processing datasets:  17%|█▋        | 40/240 [00:32<02:50,  1.17it/s]

  Primary file paths for dataset 296: ['files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_10.17.47_FusionStitcher_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_10x.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_11.05.11_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_01.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_11.17.25_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_02.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_11.28.21_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_03.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_11.46.11_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_04.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_13.02.03_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_05.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_13.18.39_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_06.ims', 'files/primary/sub-TRPV1-CreRosa-2C/sam-1/2019-09-26_13.34.39_TRPV1-CreRosa_2C_VG_inj._with_AAV9_Flex_GFP_20x_07.ims

Processing datasets:  17%|█▋        | 41/240 [00:33<02:31,  1.31it/s]

  Primary file paths for dataset 282: ['files/primary/manifest.xlsx', 'files/primary/sub-20190813P/sam-20190813P-dC/p-dC_MG_in_naive_porcine_20190813P.fastq.gz', 'files/primary/sub-20190820P/sam-20190820P-dC/p-dC_MG_in_naive_porcine_20190820P.fastq.gz', 'files/primary/sub-20191105P/sam-20191105P-dC/p-dC_MG_in_naive_porcine_20191105P.fastq.gz', 'files/primary/sub-20181004P/sam-20181004P-dC/p-dC_MG_in_porcine_with_VNS_20181004P.fastq.gz', 'files/primary/sub-20181115P/sam-20181115P-dC/p-dC_MG_in_porcine_with_VNS_20181115P.fastq.gz', 'files/primary/sub-20190404P/sam-20190404P-dC/p-dC_MG_in_porcine_with_VNS_20190404P.fastq.gz', 'files/primary/sub-20190813P/sam-20190813P-pC/p-pC_MG_in_naive_porcine_20190813P.fastq.gz', 'files/primary/sub-20190820P/sam-20190820P-pC/p-pC_MG_in_naive_porcine_20190820P.fastq.gz', 'files/primary/sub-20191105P/sam-20191105P-pC/p-pC_MG_in_naive_porcine_20191105P.fastq.gz', 'files/primary/sub-20181004P/sam-20181004P-pC/p-pC_MG_in_porcine_with_VNS_20181004P.fastq.gz'

Processing datasets:  18%|█▊        | 42/240 [00:34<02:26,  1.35it/s]

  Primary file paths for dataset 291: ['files/primary/sub-Tension-M1/sam-Mp70-Tension-M1/21_0510_-_M_10-30s__20Hz_or_60s__2_Hz_VNS.acq', 'files/primary/sub-Calcium-nNOS-GCaMP6f-F1/sam-Fp80-nNOS-F1/21_0607-10x-Fp50-nNOSGCaMP6f-_1.117-0.05_-VNS_20Hz_10s_at_30s_into_video_006.avi', 'files/primary/sub-Calcium-ICC-GCaMP-F1/sam-Fp50-ICC-F1/21_0608-10x-Fp50-CAGGS-GCaMP3-_1.117-0.05_-VNS_20Hz_10s_at_30s_into_video_012.avi', 'files/primary/sub-Calcium-nNOS-GCaMP6f-F2/sam-Fp80-nNOS-F2/21_0608-20x-Fp50-nNOSGCaMP6f-_1.117-0.05_-VNS_20Hz_10s_at_30s_into_video_012.avi', 'files/primary/sub-Calcium-ICC-GCaMP-M1/sam-Mp50-ICC-M1/21_0609-20x-Mp50-CAGGS-GCaMP3-_1.117-0.05_-VNS_20Hz_10s_at_30s_into_video_016.avi', 'files/primary/sub-Calcium-ChAT-GCaMP6f-M1/sam-Mp50-ChAT-M1/21_0610-20x-Mp50-ChATGCaMP6_1.117-0.05_-VNS_2Hz_10s_at_30s_into_video_007.avi', 'files/primary/sub-Calcium-ICC-GCaMP-M2/sam-Mp50-ICC-M2/21_0611-20x-Mp50-CAGGS-GCaMP3-_1.117-0.05_-VNS_2Hz_10s_at_30s_into_video_004.avi', 'files/primary/sub

Processing datasets:  18%|█▊        | 43/240 [00:34<02:23,  1.37it/s]

  Primary file paths for dataset 268: ['files/primary/sub-R19-436/sam-R19-436-Sample-1/R19-436-Sample-1.xlsx', 'files/primary/sub-R19-438/sam-R19-438-Sample-1/R19-438-Sample-1.xlsx', 'files/primary/sub-R19-471/sam-R19-471-Sample-1/R19-471-Sample-1.xlsx', 'files/primary/sub-R19-472/sam-R19-472-Sample-1/R19-472-Sample-1.xlsx', 'files/primary/sub-R19-491/sam-R19-491-Sample-1/R19-491_JPF017_1.jpx', 'files/primary/sub-R19-491/sam-R19-491-Sample-1/R19-491_JPF017_1.xml.xml', 'files/primary/sub-R19-542/sam-R19-542-Sample-1/R19-542_JPF022_1.jpx', 'files/primary/sub-R19-542/sam-R19-542-Sample-1/R19-542_JPF022_1.xml', 'files/primary/sub-R19-543/sam-R19-543-Sample-1/R19-543_JPF0241_1.jpx', 'files/primary/sub-R19-543/sam-R19-543-Sample-1/R19-543_JPF0241_1.xml', 'files/primary/sub-R19-544/sam-R19-544-Sample-1/R19-544_JPF024_1.jpx', 'files/primary/sub-R19-544/sam-R19-544-Sample-1/R19-544_JPF024_1.xml', 'files/primary/sub-R20-593/sam-R20-593-Sample-1/R20-593_JPF037_1.jpx', 'files/primary/sub-R20-593/s

Processing datasets:  18%|█▊        | 44/240 [00:36<03:11,  1.03it/s]

  Primary file paths for dataset 381: ['files/primary/sub-M4JUN1201/sam-16/S1-R1-01.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-02.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-03.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-04.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-05.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-06.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-07.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R1-08.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-01.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-02.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-03.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-04.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-05.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-06.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-07.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R2-08.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R3-01.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R3-02.nd2', 'files/primary/sub-M4JUN1201/sam-16/S1-R3-0

Processing datasets:  19%|█▉        | 45/240 [00:39<05:03,  1.55s/it]

  Primary file paths for dataset 383: ['files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180122-PB-001/Excel/20180122-PB-001-1-c1-ch1-Syn.xlsx', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180122-PB-001/Excel/20180122-PB-001-1-c1-ch2-FG.xlsx', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180122-PB-001/Imaris/20180122-PB-001-1.ims', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180124-PB-021/Excel/20180124-PB-021-1-c1-ch1-Syn.xlsx', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180124-PB-021/Excel/20180124-PB-021-1-c1-ch2-FG.xlsx', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180124-PB-021/Imaris/20180124-PB-021-1.ims', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-015-SPB06-RMPG-TH/sam-20180124-PB-022/Excel/20180124-PB-022-1-c1-ch1-Syn.xlsx', 'files/primary/sub-keastlab-R16-015/sam-keastlab-R16-0

Processing datasets:  19%|█▉        | 46/240 [00:40<04:44,  1.47s/it]

  Primary file paths for dataset 366: ['files/primary/sub-rat67/sam-corpGC67/67101.jpg', 'files/primary/sub-rat67/sam-corpGC67/67101_Experiment-1478.czi', 'files/primary/sub-rat67/sam-corpGC67/67102.jpg', 'files/primary/sub-rat67/sam-corpGC67/67102_Experiment-1477.czi', 'files/primary/sub-rat67/sam-corpGC67/67103.jpg', 'files/primary/sub-rat67/sam-corpGC67/67103_Experiment-1479.czi', 'files/primary/sub-rat67/sam-corpGC67/67104.jpg', 'files/primary/sub-rat67/sam-corpGC67/67104_Experiment-1476.czi', 'files/primary/sub-rat67/sam-corpGC67/67105.jpg', 'files/primary/sub-rat67/sam-corpGC67/67106.jpg', 'files/primary/sub-rat67/sam-corpGC67/67106_Experiment-1480.czi', 'files/primary/sub-rat67/sam-corpGC67/67107.jpg', 'files/primary/sub-rat67/sam-corpGC67/67107_Experiment-1481.czi', 'files/primary/sub-rat67/sam-corpGC67/67108.jpg', 'files/primary/sub-rat67/sam-corpGC67/67108_Experiment-1475.czi', 'files/primary/sub-rat67/sam-corpGC67/67109.jpg', 'files/primary/sub-rat67/sam-corpGC67/67109_Exper

Processing datasets:  20%|█▉        | 47/240 [00:41<04:03,  1.26s/it]

  Primary file paths for dataset 368: ['files/primary/sub-1/sam-20170602-3/20170602.3.tif', 'files/primary/sub-2/sam-20170605-6/20170605.6.tif', 'files/primary/sub-3/sam-20171016-7/20171016.7.tif', 'files/primary/sub-5/sam-20171024-18/20171024.18.tif', 'files/primary/sub-4/sam-20171024-6/20171024.6.tif', 'files/primary/sub-7/sam-20171205-10/20171205.10.tif', 'files/primary/sub-6/sam-20171205-8/20171205.8.tif', 'files/primary/sub-8/sam-20180122-1/20180122.1.tif', 'files/primary/sub-9/sam-20180129-2/20180129.2.tif', 'files/primary/sub-27/sam-20180129-2-1/20180129.2.tif', 'files/primary/sub-20/sam-20180129-3-1/20180129.3.tif', 'files/primary/sub-10/sam-20180129-5/20180129.5.tif', 'files/primary/sub-53/sam-20180129-3/20180129_Motion_Corrected_Stream_3_Full_Movie.tif', 'files/primary/sub-11/sam-20180130-6/20180130.6.tif', 'files/primary/sub-21/sam-20180130-7/20180130.7.tif', 'files/primary/sub-12/sam-20180220-3/20180220.3.tif', 'files/primary/sub-71/sam-20180220-12/20180220_Motion_Corrected

Processing datasets:  20%|██        | 48/240 [00:42<03:50,  1.20s/it]

  Primary file paths for dataset 370: ['files/primary/manifest.xlsx', 'files/primary/sub-JM100/sam-JM100-1/RNA-seq/1-JM100-S1-R1-001-fastqc.zip', 'files/primary/sub-JM100/sam-JM100-1/RNA-seq/1-JM100-S1-R1-001-fastqc.html', 'files/primary/sub-JM100/sam-JM100-stomach/Macroscopy/20200609-173034.jpg', 'files/primary/sub-JM100/sam-JM100-stomach/Macroscopy/20200609-173423-HDR.jpg', 'files/primary/sub-JM100/sam-JM100-stomach/Macroscopy/jm100_20200609-173015.jpg', 'files/primary/sub-JM100/sam-JM100-stomach/microscopy/jm100v-16-1.czi', 'files/primary/sub-JM100/sam-JM100-stomach/microscopy/jm100v-29-1.czi', 'files/primary/sub-JM100/sam-JM100-stomach/microscopy/jm100v-27-2.czi', 'files/primary/sub-JM101/sam-JM101-1/RNA-seq/2-JM101-S1-R1-001-fastqc.html', 'files/primary/sub-JM100/sam-JM100-stomach/microscopy/jm100v-4-2.czi', 'files/primary/sub-JM101/sam-JM101-1/RNA-seq/2-JM101-S1-R1-001-fastqc.zip', 'files/primary/sub-JM101/sam-JM101-stomach/Macroscopy/20200609-135437.jpg', 'files/primary/sub-JM10

Processing datasets:  20%|██        | 49/240 [00:43<03:26,  1.08s/it]

  Primary file paths for dataset 371: ['files/primary/sub-rat15/sam-rat15-leftnod/AHT9_PERITLUM_LN_Experiment-411.czi', 'files/primary/sub-rat15/sam-rat15-rightnod/AHT9_PERITLUM_RN_Experiment-413.czi', 'files/primary/sub-rat01/sam-rat01-leftnod/JM079_M_CORMUS_LN_Experiment-1680_quant.czi', 'files/primary/sub-rat01/sam-rat01-rightnod/JM079_M_CORMUS_RN_Experiment-1720.czi', 'files/primary/sub-rat01/sam-rat01-stomach/JM079_M_GB_CORMUC_STOMACH.jpg', 'files/primary/sub-rat01/sam-rat01-stomach/JM079_M_RB_CORMUC_STOMACH.jpg', 'files/primary/sub-rat02/sam-rat02-leftnod/JM080_M_CORMUS_LN_Experiment-260_quant.czi', 'files/primary/sub-rat02/sam-rat02-rightnod/JM080_M_CORMUS_RN_Experiment-194.czi', 'files/primary/sub-rat02/sam-rat02-stomach/JM080_M_GB_CORMUC_STOMACH.jpg', 'files/primary/sub-rat02/sam-rat02-stomach/JM080_RB_CORMUS_STOMACH.jpg', 'files/primary/sub-rat03/sam-rat03-leftnod/JM081_M_CORMUS_VAG_LN_Experiment-142.czi', 'files/primary/sub-rat03/sam-rat03-rightnod/JM081_M_CORMUS_VAG_RN_Expe

Processing datasets:  21%|██        | 50/240 [00:43<03:10,  1.00s/it]

  Primary file paths for dataset 362: ['files/primary/sub-10/sam-RA-left-1/10_RA_left_atrium_TH.tif', 'files/primary/sub-10/sam-RA-right-1/10_RA_right_atrium_TH.tif', 'files/primary/sub-12/sam-CIH-left-1/12_CIH_left_atrium_TH.tif', 'files/primary/sub-12/sam-CIH-right-1/12_CIH_right_atrium_TH.tif', 'files/primary/sub-13/sam-RA-left-2/13_RA_left_atrium_TH.tif', 'files/primary/sub-13/sam-RA-right-2/13_RA_right_atrium_TH.tif', 'files/primary/sub-14/sam-RA-left-3/14_RA_left_atrium_TH.tif', 'files/primary/sub-14/sam-RA-right-3/14_RA_right_atrium_TH.tif', 'files/primary/sub-15/sam-CIH-left-2/15_CIH_left_atrium_TH.tif', 'files/primary/sub-15/sam-CIH-right-2/15_CIH_right_atrium_TH.tif', 'files/primary/sub-16/sam-CIH-left-3/16_CIH_left_atrium_TH.tif', 'files/primary/sub-16/sam-CIH-right-3/16_CIH_right_atrium_TH.tif', 'files/primary/sub-19/sam-RA-left-4/19_RA_left_atrium_TH.tif', 'files/primary/sub-19/sam-RA-right-4/19_RA_right_atrium_TH.tif', 'files/primary/sub-20/sam-RA-left-5/20_RA_left_atrium

Processing datasets:  21%|██▏       | 51/240 [00:44<02:41,  1.17it/s]

  No files under 'files/primary/' for dataset 37.


Processing datasets:  22%|██▏       | 52/240 [00:45<02:23,  1.31it/s]

  Primary file paths for dataset 351: ['files/primary/sub-P-1/GI_Lab_Volunteer_Study_001_MadgTest_v42.csv', 'files/primary/sub-P-2/GI_Lab_Volunteer_Study_002_Probe_35_MadgTest_v42.csv', 'files/primary/manifest.xlsx']


Processing datasets:  22%|██▏       | 53/240 [00:45<02:11,  1.42it/s]

  Primary file paths for dataset 377: ['files/primary/pool-1/Plus_Maze_test-EC-PFTox.xlsx', 'files/primary/pool-2/Plus_Maze_test-EC-hM3DGq_AcuteDCZ.xlsx', 'files/primary/pool-3/Plus_Maze_test-EC-hM3DGq_AloAcuteDCZ.xlsx', 'files/primary/manifest.xlsx']


Processing datasets:  22%|██▎       | 54/240 [00:47<02:53,  1.07it/s]

  Primary file paths for dataset 423: ['files/primary/sub-4598-P/sam-HSCR4-transition-zone1/100_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_MP1.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/101_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_MP2.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/102_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_MP3.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/103_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_MP4.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/104_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_SP1.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/105_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_SP2.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/106_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_SP3.czi', 'files/primary/sub-4598-P/sam-HSCR4-transition-zone1/107_-_20200729_4598-p_Seg1_huC_594_nNOS_488_ChAT_647_20X_SP4.czi', '

Processing datasets:  23%|██▎       | 55/240 [00:48<03:08,  1.02s/it]

  Primary file paths for dataset 425: ['files/primary/in-vivo-imaging/OA-PMX/sub-250123-2b-LL-P/0001-0166-0430.roi', 'files/primary/in-vivo-imaging/OA-PMX/sub-180123-1b-R-P/0001-0226-0595.roi', 'files/primary/in-vivo-imaging/OA-SHAM/sub-240123-2a-L-S/0001-0252-0513.roi', 'files/primary/in-vivo-imaging/OA-PMX/sub-240123-2a-R-P/0001-0256-0530.roi', 'files/primary/in-vivo-imaging/OA-SHAM/sub-180123-1b-L-S/0001-0258-0468.roi', 'files/primary/in-vivo-imaging/OA-PMX/sub-250123-2b-R-P/0001-0274-0571.roi', 'files/primary/in-vivo-imaging/OA-SHAM/sub-180123-1b-RR-S/0001-0277-0454.roi', 'files/primary/in-vivo-imaging/OA-SHAM/sub-250123-2b-L-S/0001-0288-0490.roi', 'files/primary/in-vivo-imaging/OA-PMX/sub-180123-1b-LR-P/0002-0204-0521.roi', 'files/primary/in-vivo-imaging/OA-SHAM/sub-250123-2b-RR-S/0002-0309-0301.roi', 'files/primary/in-vivo-imaging/OA-PMX/sub-170123-1a-LR-P/0002-0368-0451.roi', 'files/primary/in-vivo-imaging/OA-SHAM/sub-240123-2a-LL-S/0002-0577-0279.roi', 'files/primary/in-vivo-im

Processing datasets:  23%|██▎       | 56/240 [00:48<02:50,  1.08it/s]

  Primary file paths for dataset 415: ['files/primary/sub-ARDS1/sam-ARDS1-TUJ/441b.tiff', 'files/primary/sub-ARDS2/sam-ARDS2-TUJ/447.tiff', 'files/primary/sub-ARDS2/sam-ARDS2-TUJ/447b.tiff', 'files/primary/sub-D175/sam-D175-PECAM-TH/D175_Pecam_TH.tif', 'files/primary/sub-D175/sam-D175-PECAM-TH/D175_Pecam_TH_2.tif', 'files/primary/sub-D175/sam-D175-Syn-SMA/D175_SMA_Syn.tif', 'files/primary/sub-D274/sam-D274-PECAM-TH/D274_COVID_Pecam_TH.tif', 'files/primary/sub-D274/sam-D274-PECAM-TH/D274_COVID_Pecam_TH_2.tif', 'files/primary/sub-D274/sam-D274-SMA-Syn/D274_COVID_SMA_Syn.tif', 'files/primary/sub-D274/sam-D274-SMA-Syn/D274_COVID_SMA_Syn_2.tif', 'files/primary/sub-D307/sam-D307-TUJ-TH/Project_D307TUJTHSeries004Snapshot_all1.tif', 'files/primary/sub-D307/sam-D307-TUJ-TH/Project_D307TUJTHSeries004Snapshot_all2.tif', 'files/primary/sub-D307/sam-D307-TUJ-TH/Project_D307TUJTHSeries004_z00.tif', 'files/primary/sub-D307/sam-D307-TUJ-TH/Project_D307TUJTHSeries004_z01.tif', 'files/primary/sub-D307/s

Processing datasets:  24%|██▍       | 57/240 [00:50<03:05,  1.01s/it]

  Primary file paths for dataset 416: ['files/primary/sub-R10/perf-R10-xray/R10_Dorsal.bmp', 'files/primary/sub-R10/perf-R10-xray/R10_Lateral.bmp', 'files/primary/sub-R1/perf-R1-xray/R1_Dorsal.bmp', 'files/primary/sub-R1/perf-R1-xray/R1_Lateral.bmp', 'files/primary/sub-R2/perf-R2-xray/R2_Dorsal.bmp', 'files/primary/sub-R2/perf-R2-xray/R2_Lateral.bmp', 'files/primary/sub-R3/perf-R3-xray/R3_Dorsal.bmp', 'files/primary/sub-R3/perf-R3-xray/R3_Lateral.bmp', 'files/primary/sub-R4/perf-R4-xray/R4_Dorsal.bmp', 'files/primary/sub-R4/perf-R4-xray/R4_Lateral.bmp', 'files/primary/sub-R5/perf-R5-xray/R5_Dorsal.bmp', 'files/primary/sub-R5/perf-R5-xray/R5_Lateral.bmp', 'files/primary/sub-R6/perf-R6-xray/R6_Dorsal.bmp', 'files/primary/sub-R6/perf-R6-xray/R6_Lateral.bmp', 'files/primary/sub-R7/perf-R7-xray/R7_Dorsal.bmp', 'files/primary/sub-R7/perf-R7-xray/R7_Lateral.bmp', 'files/primary/sub-R8/perf-R8-xray/R8_Dorsal.bmp', 'files/primary/sub-R8/perf-R8-xray/R8_Lateral.bmp', 'files/primary/sub-R9/perf-R

Processing datasets:  24%|██▍       | 58/240 [00:51<03:22,  1.11s/it]

  Primary file paths for dataset 418: ['files/primary/sub-NE-Injection3/perf-400ng-NE-Injection3/200.db3', 'files/primary/sub-NE-Injection3/perf-4000ng-NE-Injection3/2000.db3', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf.db3', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_1.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_10.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_11.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_12.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_13.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_14.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_15.tsv', 'files/primary/sub-NE-Injection2/perf-20000ng-NE-Injection2/20000ng_inf_SEGMENT_16.tsv', 'files/primary/sub-NE-Injection2/p

Processing datasets:  25%|██▍       | 59/240 [00:52<03:39,  1.21s/it]

  Primary file paths for dataset 419: ['files/primary/sub-160A/sam-120123-008/008.jp2', 'files/primary/sub-160A/sam-120123-008/008.tif', 'files/primary/sub-160A/sam-120123-008/008.vsi', 'files/primary/sub-152D/sam-120123-009/009.jp2', 'files/primary/sub-152D/sam-120123-009/009.tif', 'files/primary/sub-152D/sam-120123-009/009.vsi', 'files/primary/sub-189E/sam-010424/010424_189EIT_p7_20x__AMP.tif', 'files/primary/sub-439L/sam-010623Process821/010623_439L_IT_p22_20x_RGB_AMP.tif', 'files/primary/sub-447J/sam-010623Process823/010623_447J_IT_p7_20x_AMP.tif', 'files/primary/sub-432Q/sam-011223Process831/011223_432Q_IT_p22_20x_RGB_AMP.tif', 'files/primary/sub-440L/sam-011223Process833/011223_440L_IT_p22_20x_RGB_AMP.tif', 'files/primary/sub-466M/sam-011223Process832/011223_466M_IT_p7_20x_RGB_AMP.tif', 'files/primary/sub-468M/sam-011223Process835/011223_468M_IT_p7_20x_RGB_AMP.tif', 'files/primary/sub-431Q/sam-012023Process845/012023_431Q_p22_IT_20x_RGB_AMP.tif', 'files/primary/sub-452Q/sam-01202

Processing datasets:  25%|██▌       | 60/240 [00:54<03:49,  1.27s/it]

  Primary file paths for dataset 420: ['files/primary/QC-files/sam-LC-QC-pool-DIA-1/1895.d/52eaded1-4a1b-46e1-88ab-0c18232ae2d7_1.mcf', 'files/primary/QC-files/sam-LC-QC-pool-DIA-1/1895.d/52eaded1-4a1b-46e1-88ab-0c18232ae2d7_1.mcf_idx', 'files/primary/QC-files/sam-LC-QC-pool-DIA-1/1895.d/BackgroundLinePos.ami', 'files/primary/QC-files/sam-LC-QC-pool-DIA-1/1895.d/BackgroundProfPos.ami', 'files/primary/QC-files/sam-LC-QC-pool-DIA-1/1895.d/Heatmaps.ami', 'files/primary/sub-UTD-DN0123/sam-1906-1-ggl-2/1906.d/1906.m/InstrumentSetup.isset', 'files/primary/QC-files/sam-LC-QC-pool-DIA/1894.d/1894.m/InstrumentSetup.isset', 'files/primary/sub-UTD-DN0251/sam-1939-8-nr-2/1939.d/1939.m/InstrumentSetup.isset', 'files/primary/sub-UTD-DN0046/sam-1925-3-nr-2/1925.d/1925.m/InstrumentSetup.isset', 'files/primary/sub-UTD-DN0245/sam-1896-7-ggl-2/1896.d/1896.m/InstrumentSetup.isset', 'files/primary/sub-UTD-DN0123/sam-1898-1-ggl-1/1898.d/1898.m/InstrumentSetup.isset', 'files/primary/sub-UTD-DN0181/sam-1942-2

Processing datasets:  25%|██▌       | 61/240 [00:55<03:17,  1.10s/it]

  Primary file paths for dataset 429: ['files/primary/pool-1/Body_Weights.xlsx', 'files/primary/pool-1/Experiment_Primary_Feeding_Data.xlsx', 'files/primary/sub-12075/Experiment_Primary_Feeding_Data_12075.xlsx', 'files/primary/sub-12076/Experiment_Primary_Feeding_Data_12076.xlsx', 'files/primary/sub-12077/Experiment_Primary_Feeding_Data_12077.xlsx', 'files/primary/sub-12078/Experiment_Primary_Feeding_Data_12078.xlsx', 'files/primary/sub-12079/Experiment_Primary_Feeding_Data_12079.xlsx', 'files/primary/sub-12080/Experiment_Primary_Feeding_Data_12080.xlsx', 'files/primary/sub-12081/Experiment_Primary_Feeding_Data_12081.xlsx', 'files/primary/sub-12082/Experiment_Primary_Feeding_Data_12082.xlsx', 'files/primary/sub-12083/Experiment_Primary_Feeding_Data_12083.xlsx', 'files/primary/sub-12084/Experiment_Primary_Feeding_Data_12084.xlsx', 'files/primary/sub-12085/Experiment_Primary_Feeding_Data_12085.xlsx', 'files/primary/sub-12086/Experiment_Primary_Feeding_Data_12086.xlsx', 'files/primary/sub

Processing datasets:  26%|██▌       | 62/240 [00:55<02:59,  1.01s/it]

  Primary file paths for dataset 427: ['files/primary/pool-FN1/FN1_Genes_ReadCount.txt', 'files/primary/pool-FN1/FN1_S21_L007_R1.fastq.gz', 'files/primary/pool-FN1/FN1_S21_L007_R1.fastq.gz.md5', 'files/primary/pool-FN2/FN2_Genes_ReadCount.txt', 'files/primary/pool-FN2/FN2_S22_L007_R1.fastq.gz', 'files/primary/pool-FN2/FN2_S22_L007_R1.fastq.gz.md5', 'files/primary/pool-FN3/FN3_Genes_ReadCount.txt', 'files/primary/pool-FN3/FN3_S23_L007_R1.fastq.gz', 'files/primary/pool-FN3/FN3_S23_L007_R1.fastq.gz.md5', 'files/primary/pool-FT1/FT1_Genes_ReadCount.txt', 'files/primary/pool-FT1/FT1_S18_L007_R1.fastq.gz', 'files/primary/pool-FT1/FT1_S18_L007_R1.fastq.gz.md5', 'files/primary/pool-FT2/FT2_Genes_ReadCount.txt', 'files/primary/pool-FT2/FT2_S19_L007_R1.fastq.gz', 'files/primary/pool-FT2/FT2_S19_L007_R1.fastq.gz.md5', 'files/primary/pool-FT3/FT3_Genes_ReadCount.txt', 'files/primary/pool-FT3/FT3_S20_L007_R1.fastq.gz', 'files/primary/pool-FT3/FT3_S20_L007_R1.fastq.gz.md5', 'files/primary/pool-MT/MT

Processing datasets:  26%|██▋       | 63/240 [00:56<02:39,  1.11it/s]

  No files under 'files/primary/' for dataset 43.


Processing datasets:  27%|██▋       | 64/240 [00:57<02:55,  1.00it/s]

  Primary file paths for dataset 421: ['files/primary/sub-unknown-1/sam-axon-1/1.tif', 'files/primary/sub-unknown-1/sam-axon-1/2.tif', 'files/primary/sub-unknown-1/sam-axon-1/3.tif', 'files/primary/sub-unknown-1/sam-axon-1/4.tif', 'files/primary/sub-unknown-1/sam-axon-1/5.tif', 'files/primary/sub-unknown-1/sam-axon-1/6.tif', 'files/primary/sub-unknown-1/sam-axon-1/7.tif', 'files/primary/sub-unknown-1/sam-axon-1/8.tif', 'files/primary/sub-175/sam-175-right-atrium/sam-175-right-atrium-axon-1-segment-c/NEW-01_h0b0z0c0x0-3840y0-2160.tif', 'files/primary/sub-175/sam-175-right-atrium/sam-175-right-atrium-axon-1-segment-c/NEW-01_h0b0z10c0x0-3840y0-2160.tif', 'files/primary/sub-175/sam-175-right-atrium/sam-175-right-atrium-axon-1-segment-c/NEW-01_h0b0z11c0x0-3840y0-2160.tif', 'files/primary/sub-175/sam-175-right-atrium/sam-175-right-atrium-axon-1-segment-c/NEW-01_h0b0z12c0x0-3840y0-2160.tif', 'files/primary/sub-175/sam-175-right-atrium/sam-175-right-atrium-axon-1-segment-c/NEW-01_h0b0z13c0x0-3

Processing datasets:  27%|██▋       | 65/240 [01:00<04:13,  1.45s/it]

  Primary file paths for dataset 49: ['files/primary/sub-D6616/ses-2017_12_04/juyi6616.100', 'files/primary/sub-D6616/ses-2017_10_23/juyi6616.100', 'files/primary/sub-D6616/ses-2017_09_25/juyi6616.100', 'files/primary/sub-D6616/ses-2017_12_04/juyi6616.101', 'files/primary/sub-D6616/ses-2017_10_23/juyi6616.101', 'files/primary/sub-D6616/ses-2017_09_25/juyi6616.101', 'files/primary/sub-D6616/ses-2017_12_04/juyi6616.102', 'files/primary/sub-D6616/ses-2017_10_23/juyi6616.102', 'files/primary/sub-D6616/ses-2017_09_25/juyi6616.102', 'files/primary/sub-D6616/ses-2017_12_04/juyi6616.103', 'files/primary/sub-D6616/ses-2017_10_23/juyi6616.103', 'files/primary/sub-D6616/ses-2017_09_25/juyi6616.103', 'files/primary/sub-D6616/ses-2017_12_04/juyi6616.104', 'files/primary/sub-D6616/ses-2017_10_23/juyi6616.104', 'files/primary/sub-D6616/ses-2017_09_25/juyi6616.104', 'files/primary/sub-D6616/ses-2017_12_04/juyi6616.105', 'files/primary/sub-D6616/ses-2017_10_23/juyi6616.105', 'files/primary/sub-D6616/se

Processing datasets:  28%|██▊       | 66/240 [01:00<03:24,  1.17s/it]

  Primary file paths for dataset 143: ['files/primary/pool-plate1-TNF-ELISA-serum/LPS_CHR2_TNF_Reardon v2.csv', 'files/primary/pool-plate1-TNF-ELISA-serum/manifest.xlsx']


Processing datasets:  28%|██▊       | 67/240 [01:02<03:30,  1.22s/it]

  Primary file paths for dataset 145: ['files/primary/sub-cat3/sam-2_cat3/DICOMs for 3D CT Cat 3%2C day 14/DAMASER.XA._.0003.0001.2018.01.30.08.23.14.109375.2331689.IMA', 'files/primary/sub-cat3/sam-3_cat3/DICOMs 3D CT video Cat 3%2C day 30%2C 25ml contrast/DAMASER.XA._.0003.0001.2018.02.12.08.02.29.953125.2528297.IMA', 'files/primary/sub-cat3/sam-2_cat3/DICOMs for 3D CT Cat 3%2C day 14/DAMASER.XA._.0003.0002.2018.01.30.08.23.14.109375.2331705.IMA', 'files/primary/sub-cat3/sam-3_cat3/DICOMs 3D CT video Cat 3%2C day 30%2C 25ml contrast/DAMASER.XA._.0003.0002.2018.02.12.08.02.29.953125.2528313.IMA', 'files/primary/sub-cat3/sam-2_cat3/DICOMs for 3D CT Cat 3%2C day 14/DAMASER.XA._.0003.0003.2018.01.30.08.23.14.109375.2331721.IMA', 'files/primary/sub-cat3/sam-3_cat3/DICOMs 3D CT video Cat 3%2C day 30%2C 25ml contrast/DAMASER.XA._.0003.0003.2018.02.12.08.02.29.953125.2528329.IMA', 'files/primary/sub-cat3/sam-2_cat3/DICOMs for 3D CT Cat 3%2C day 14/DAMASER.XA._.0003.0004.2018.01.30.08.23.14.1

Processing datasets:  28%|██▊       | 68/240 [01:03<03:34,  1.25s/it]

  Primary file paths for dataset 125: ['files/primary/Tertiary_Plexus_density_from_immunohistochemistry/pool-Tertiary_Plexus_density_from_immunohistochemistry/LMMNs_Tertiary plexus fibre counts_SBv4.xlsx', 'files/primary/Tertiary_Plexus_density_from_immunohistochemistry/pool-Tertiary_Plexus_density_from_immunohistochemistry/README.txt', 'files/primary/LMMNs_Tenia NOS VIP DiI/README.txt', 'files/primary/LMMNs_Tenia NOS VIP DiI MICROGRAPHS/README.txt', 'files/primary/LMMNs_Inter-taenial ChAT NOS DiI MICROGRAPHS/README.txt', 'files/primary/LMMNs_Inter-taenial ChAT NOS DiI/README.txt', 'files/primary/LMMNs_Tenia ChAT NOS DiI/README.txt', 'files/primary/LMMNs_Tenia ChAT NOS DiI MICROGRAPHS/README.txt', 'files/primary/LMMNs_Inter-taenial ChAT NOS DiI/sub-H2100/sam-H2100/SPARC_H2100A_LM_NOS%26ChAT_DiI.xlsx', 'files/primary/LMMNs_Inter-taenial ChAT NOS DiI MICROGRAPHS/sub-H2100/sam-H2100/SPARC_H2100A_LM_MICROGRAPHS/SPARC_H2100A_LM_X-0.303Y-4.213_ChAT', 'files/primary/LMMNs_Inter-taenial ChAT N

Processing datasets:  29%|██▉       | 69/240 [01:04<02:59,  1.05s/it]

  Primary file paths for dataset 126: ['files/primary/Optogenetic_activation/sub-nNOS_Male_Subject_1/sam-190829/19 0829.acq', 'files/primary/Optogenetic_activation/sub-nNOS_Male_Subject_2/sam-190917/19 0917.acq', 'files/primary/Optogenetic_activation/sub-nNOS_Male_Subject_3/sam-191016/19 1016.acq', 'files/primary/Optogenetic_activation/sub-nNOS_Female_Subject_1/sam-191018/19 1018.acq', 'files/primary/Optogenetic_activation/sub-nNOS_Female_Subject_2/sam-200303/20 0303.acq', 'files/primary/Optogenetic_activation/sub-ChAT_Male_Subject_1/sam-200309/20 0309.acq', 'files/primary/Optogenetic_activation/sub-nNOS_Female_Subject_3/sam-200324/20 0324.acq', 'files/primary/Optogenetic_activation/sub-ChAT_Male_Subject_2/sam-200515/20 0515.acq', 'files/primary/Optogenetic_activation/sub-ChAT_Male_Subject_3/sam-200518/20 0518.acq', 'files/primary/Optogenetic_activation/sub-ChAT_Female_Subject_1/sam-200519/20 0519.acq', 'files/primary/Optogenetic_activation/sub-ChAT_Female_Subject_2/sam-200520/20 0520.

Processing datasets:  29%|██▉       | 69/240 [01:05<02:41,  1.06it/s]


KeyboardInterrupt: 

In [None]:
# === Extension Summary ===
tqdm.write("\nAvailable file extensions in 'files/primary/':")
for ext, count in sorted(extension_counter.items(), key=lambda x: -x[1]):
    if count >= 100:  # Only show extensions with significant counts
        tqdm.write(f"{ext}: {count}")

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter, defaultdict
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

# Use seaborn for cleaner minimal plots
sns.set_theme(style="white")

# Ensure output directory exists
output_dir = './stats_figures'
os.makedirs(output_dir, exist_ok=True)

# === Example input structure ===
# dataset_paths = [('ds1', ['file1.mat', 'image1.tif']), ('ds2', ['notes.pdf', 'data.csv'])]

# Map extensions to categories, merging Tabular and Structured into Docs
modality_lookup = {
    # Imaging formats
    ".tif": "Imaging", ".tiff": "Imaging", ".czi": "Imaging", ".nd2": "Imaging", ".lsm": "Imaging",
    ".jpx": "Imaging", ".svs": "Imaging", ".ims": "Imaging", ".png": "Imaging", ".jpg": "Imaging",
    ".jpeg": "Imaging", ".bmp": "Imaging", ".vsi": "Imaging", ".jp2": "Imaging", ".roi": "Imaging",
    ".dm3": "Imaging", ".pxp": "Imaging", ".ipf": "Imaging", ".lif": "Imaging", ".ima": "Imaging",
    ".mrxs": "Imaging", ".obj": "Imaging", ".avi": "Imaging", ".exf": "Imaging", ".cxd": "Imaging",

    # Time Series formats
    ".mat": "Time Series", ".smr": "Time Series", ".csv": "Time Series",
    ".adicht": "Time Series", ".hdf5": "Time Series", ".h5": "Time Series", ".ets": "Time Series",
    ".abf": "Time Series", ".rhd": "Time Series", ".nev": "Time Series", ".ns5": "Time Series",
    ".ns2": "Time Series", ".ns1": "Time Series", ".smrx": "Time Series", ".wav": "Time Series",
    ".acq": "Time Series", ".tbk": "Time Series", ".tdx": "Time Series", ".tev": "Time Series",
    ".tin": "Time Series", ".tnt": "Time Series", ".tsq": "Time Series", ".eeg": "Time Series",
    ".vmrk": "Time Series", ".vhdr": "Time Series", ".sev": "Time Series", ".sam": "Time Series",
    ".pss": "Time Series", ".psmethod": "Time Series",

    # Documentation formats
    ".pdf": "Docs", ".docx": "Docs", ".doc": "Docs", ".txt": "Docs",
    ".xlsx": "Docs", ".xls": "Docs", ".tsv": "Docs", ".json": "Docs", 
    ".xml": "Docs", ".db": "Docs", ".xfg": "Docs",
    
    # Other formats
    ".inf": "Other", ".zip": "Other", "": "Other", "(no ext)": "Other",
    ".s2r": "Other", ".ini": "Other", ".cmgui": "Other",
    ".mp4": "Other", ".gz": "Other", ".xlsm": "Other",
    ".db3": "Other", ".ccf": "Other", ".ex": "Other",
    ".conf": "Other", ".rdf": "Other", ".vtk": "Other", ".proj": "Other", ".pnp": "Other",
    ".hoc": "Other", ".fig	": "Other", ".dat": "Other"
}

# === Summary structures ===
summary = {}
modality_extension_counts = defaultdict(Counter)
dataset_category = defaultdict(set)
datasets_with_timeseries = set()

# === Process the dataset file paths ===
for dataset_id, paths in dataset_paths:
    has_ts = False
    has_img = False
    has_docs = False
    for p in paths:
        ext = os.path.splitext(p)[1].lower() or '(no ext)'
        category = modality_lookup.get(ext, 'Other')
        summary[ext] = summary.get(ext, 0) + 1
        modality_extension_counts[category][ext] += 1
        if category == 'Time Series':
            has_ts = True
        elif category == 'Imaging':
            has_img = True
        elif category == 'Documentation':
            has_docs = True
        dataset_category[category].add(dataset_id)
    if has_ts:
        datasets_with_timeseries.add(dataset_id)
    elif has_img:
        dataset_category['Imaging'].add(dataset_id)
    elif has_docs:
        dataset_category['Documentation'].add(dataset_id)
    else:
        dataset_category['Other'].add(dataset_id)


# === Create summaries ===
summary_df = pd.DataFrame.from_dict(summary, orient='index', columns=['File Count'])
summary_df.index.name = 'Extension'
summary_df['Category'] = summary_df.index.map(lambda x: modality_lookup.get(x, 'Other'))
summary_df = summary_df.sort_values('File Count', ascending=False)

category_summary = (
    summary_df[summary_df['Category'] != 'Unknown']
    .groupby('Category')['File Count']
    .sum()
    .sort_values(ascending=False)
)

dataset_series = pd.Series({cat: len(ids) for cat, ids in dataset_category.items() if cat != 'Unknown'}).sort_values(ascending=False)
ts_series = pd.Series(modality_extension_counts['Time Series']).sort_values(ascending=False)
img_series = pd.Series(modality_extension_counts['Imaging']).sort_values(ascending=False)

# === Plot configuration ===
TICK_FS = 10
LABEL_FS = 12

# For a half 16:9 slide (6 x 8 inches)
fig, axes = plt.subplots(4, 1, figsize=(6, 8))

# Y-axis label texts for each subplot
y_labels = ['Files', 'Datasets', 'Time Series Files', 'Imaging Files']
i = 0
for ax, data, ylab in zip(axes, [category_summary, dataset_series, ts_series, img_series], y_labels):
    if i<=1:
        #filter out docs
        data = data[data.index != 'Docs']
    ax.bar(data.index, data.values, color=sns.color_palette('plasma', len(data)))
    ax.tick_params(axis='x', labelsize=TICK_FS)
    ax.tick_params(axis='y', labelsize=TICK_FS)
    ax.set_ylabel(ylab, fontsize=LABEL_FS)
    # Scientific notation
    if i==0:
        ax.ticklabel_format(style='scientific', axis='y', scilimits=(0,0), useMathText=True)
    # Match offset text size
    offset = ax.yaxis.get_offset_text()
    offset.set_fontsize(TICK_FS)
    i += 1

# Rotate x-ticks for last two
for ax in axes[2:]:
    ax.tick_params(axis='x', rotation=45)

sns.despine()
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'combined_4x1_summary_minimal_sorted.svg'), format='svg')
plt.show()


# === Word Cloud: File Extension Frequencies ===
from wordcloud import WordCloud

# Filter extension_counter to only include counts ≥ 100
wc_freq = {
    ext.lstrip('.') or 'no_ext': count
    for ext, count in extension_counter.items()
    if count >= 1
}

# Generate word cloud
graph = WordCloud(width=1600, height=800, background_color='white').generate_from_frequencies(wc_freq)

# Plot word cloud
plt.figure(figsize=(5, 2.5))
plt.imshow(graph)
plt.axis('off')
plt.tight_layout()
# Save the word cloud
plt.savefig(os.path.join(output_dir, 'file_extensions_wordcloud.svg'), format='svg')
plt.show()

# print ids per category
print("\nDataset IDs per category:")
print("=== Imaging Datasets ===")
print(f"({len(dataset_category['Imaging'])}): {', '.join(map(str, dataset_category['Imaging']))}")
print("\n=== Time Series Datasets ===")
print(f"({len(dataset_category['Time Series'])}): {', '.join(map(str, dataset_category['Time Series']))}")
print("\n=== Documentation Datasets ===")
print(f"({len(dataset_category['Documentation'])}): {', '.join(map(str, dataset_category['Documentation']))}")
print("\n=== Other Datasets ===")
print(f"({len(dataset_category['Other'])}): {', '.join(map(str, dataset_category['Other']))}")

# print totla number of file formats
print(f"\nTotal number of unique file formats: {len(modality_lookup)}")

### Prepare Figures fo GitHub Readme

In [None]:
TICK_FS, LABEL_FS = 10, 12
fig, axes = plt.subplots(2, 2, figsize=(12, 4))
axes = axes.flatten()                       # ★ flatten fixes the 2×2 bug ★

plots_and_labels = [
    (category_summary[category_summary.index != "Docs"], "Files"),
    (dataset_series[dataset_series.index != "Docs"], "Datasets"),
    (ts_series, "Time Series Files"),
    (img_series, "Imaging Files"),
]

for ax, (data, ylab) in zip(axes, plots_and_labels):
    ax.bar(data.index, data.values, color=sns.color_palette("plasma", len(data)))
    ax.set_ylabel(ylab, fontsize=LABEL_FS)
    ax.tick_params(axis="x", labelsize=TICK_FS)
    ax.tick_params(axis="y", labelsize=TICK_FS)
    if ylab == "Files":
        ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0), useMathText=True)
        ax.yaxis.get_offset_text().set_fontsize(TICK_FS)

for ax in axes[2:]:
    ax.tick_params(axis="x", rotation=45)

sns.despine()
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "combined_2x2_summary.png"), format="png")
plt.show()

# ── Word cloud of extensions ──────────────────────────────────────────────────
wc_freq = {
    ext.lstrip(".") or "no_ext": count
    for ext, count in extension_counter.items()
    if count >= 1
}

plt.figure(figsize=(5, 2.5))
WordCloud(width=1600, height=800, background_color="white")\
    .generate_from_frequencies(wc_freq)\
    .to_image()
plt.imshow(WordCloud(width=1600, height=800, background_color="white").generate_from_frequencies(wc_freq))
plt.axis("off")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "file_extensions_wordcloud.svg"), format="svg")
plt.show()

# ── Console summaries ────────────────────────────────────────────────────────
print("\nDataset IDs per category")
for cat in ["Imaging", "Time Series", "Docs", "Other"]:
    ids = sorted(dataset_category[cat])
    print(f"=== {cat} ({len(ids)}) ===")
    print(", ".join(map(str, ids)) or "—")

print(f"\nTotal number of unique file formats: {len(modality_lookup)}")
