# Load MERFISH data 

In [1]:
import json
import os
import pathlib
import subprocess
import time

import pandas as pd
import requests
from tqdm.notebook import tqdm

# Move to the parent directory so that the project-local `Utils` package can be
# imported. The guard makes this cell safe to re-run: an unconditional
# os.chdir("..") climbs one more level on every execution.
# NOTE(review): assumes `Utils` is a directory in the parent of the notebook
# folder and is absent from the notebook folder itself - confirm.
if not pathlib.Path("Utils").is_dir():
    os.chdir("..")
from Utils.Settings import root_data, version

## Using the file manifest

Let's open the manifest.json file associated with the current release.

In [2]:

# Fetch the manifest.json of the configured release from the public S3 bucket.
url = 'https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/releases/%s/manifest.json' % version
response = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx (e.g. an unknown release tag) instead of trying to
# parse an error page as JSON.
response.raise_for_status()
manifest = json.loads(response.text)
print("version: ", manifest['version'])

version:  20230830


At the top level, the manifest consists of the release *version* tag, the S3 *resource_uri*, and the dictionaries *directory_listing* and *file_listing*. A simple option to download data is to use the AWS CLI to download specific directories or files. All the example notebooks in this repository assume that data has been downloaded locally in the same file organization as specified by the "relative_path" field in the manifest.

In [3]:
# Show the top-level structure of the manifest. A bare `manifest.keys()` that is
# not the last expression of the cell is evaluated and discarded, so print it.
print("keys:", list(manifest.keys()))
print("version:",manifest['version'])
print("resource_uri:",manifest['resource_uri'])

version: 20230830
resource_uri: s3://allen-brain-cell-atlas/


Let's look at the information associated with the spatial transcriptomics dataset **MERFISH-C57BL6J-638850**. This dataset has two related directories: *expression_matrices* containing a set of h5ad files and *metadata* containing a set of csv files. Use the *view_link* url to browse the directories on a web-browser.

In [4]:
# Directory entry for the MERFISH expression matrices, followed by its
# browsable link on a separate line.
merfish_directories = manifest['directory_listing']['MERFISH-C57BL6J-638850']['directories']
expression_matrices = merfish_directories['expression_matrices']
print(expression_matrices, expression_matrices['view_link'], sep='\n')

{'version': '20230830', 'relative_path': 'expression_matrices/MERFISH-C57BL6J-638850/20230830', 'url': 'https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/expression_matrices/MERFISH-C57BL6J-638850/20230830/', 'view_link': 'https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/index.html#expression_matrices/MERFISH-C57BL6J-638850/20230830/', 'total_size': 15255179148}
https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/index.html#expression_matrices/MERFISH-C57BL6J-638850/20230830/


In [5]:
# Directory entry for the MERFISH metadata, followed by its browsable link on a
# separate line.
merfish_directories = manifest['directory_listing']['MERFISH-C57BL6J-638850']['directories']
metadata = merfish_directories['metadata']
print(metadata, metadata['view_link'], sep='\n')

{'version': '20230830', 'relative_path': 'metadata/MERFISH-C57BL6J-638850/20230830', 'url': 'https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/metadata/MERFISH-C57BL6J-638850/20230830/', 'view_link': 'https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/index.html#metadata/MERFISH-C57BL6J-638850/20230830/', 'total_size': 1942603772}
https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/index.html#metadata/MERFISH-C57BL6J-638850/20230830/


Directory sizes are also reported as part of the manifest.json. WARNING: the expression matrices directories can get very large (> 100 GB).

In [6]:
# Number of bytes in one gibibyte, kept as a float for the division below.
GB = float(1024 ** 3)

# Report the size of every directory of every release listed in the manifest.
for release_entry in manifest['directory_listing'].values():
    for directory_entry in release_entry['directories'].values():
        size_in_gb = directory_entry['total_size'] / GB
        print(directory_entry['relative_path'], ":", '%0.2f GB' % size_in_gb)
        

expression_matrices/MERFISH-C57BL6J-638850/20230830 : 14.21 GB
metadata/MERFISH-C57BL6J-638850/20230830 : 1.81 GB
expression_matrices/MERFISH-C57BL6J-638850-sections/20230630 : 14.31 GB
expression_matrices/WMB-10Xv2/20230630 : 104.16 GB
expression_matrices/WMB-10Xv3/20230630 : 176.41 GB
expression_matrices/WMB-10XMulti/20230830 : 0.21 GB
metadata/WMB-10X/20230830 : 2.39 GB
metadata/WMB-taxonomy/20230830 : 0.01 GB
metadata/WMB-neighborhoods/20230830 : 3.00 GB
image_volumes/Allen-CCF-2020/20230630 : 0.37 GB
metadata/Allen-CCF-2020/20230630 : 0.00 GB
image_volumes/MERFISH-C57BL6J-638850-CCF/20230630 : 0.11 GB
metadata/MERFISH-C57BL6J-638850-CCF/20230830 : 2.01 GB
expression_matrices/Zhuang-ABCA-1/20230830 : 3.09 GB
metadata/Zhuang-ABCA-1/20230830 : 1.33 GB
metadata/Zhuang-ABCA-1-CCF/20230830 : 0.21 GB
expression_matrices/Zhuang-ABCA-2/20230830 : 1.30 GB
metadata/Zhuang-ABCA-2/20230830 : 0.57 GB
metadata/Zhuang-ABCA-2-CCF/20230830 : 0.08 GB
expression_matrices/Zhuang-ABCA-3/20230830 : 1.69

## Downloading files for the tutorial notebooks

Suppose you would like to download data to the local directory *abc_download_root* inside the `root_data` directory configured in `Utils.Settings`.

In [7]:
# Local download root: the 'abc_download_root' folder under `root_data` (imported
# from Utils.Settings). The manifest's 'relative_path' values are joined onto it.
download_base = f'{root_data}/abc_download_root'

### Downloading all metadata directories

Since the metadata directories are relatively small, we will download all of them. We loop through the manifest and download each metadata directory using the **[AWS CLI](https://aws.amazon.com/cli/)** sync command. This should take < 5 minutes.

In [21]:
# Mirror the 'metadata' directory of every release in the manifest with
# `aws s3 sync`. Releases that have no 'metadata' directory are skipped.
for r_dict in manifest['directory_listing'].values() :

    d_dict = r_dict['directories'].get('metadata')
    if d_dict is None :
        continue

    local_path = pathlib.Path( os.path.join( download_base, d_dict['relative_path'] ) )
    remote_path = manifest['resource_uri'] + d_dict['relative_path']

    # Build the argument list directly rather than splitting a formatted string,
    # so that a download root containing spaces is passed as one argument.
    argv = ["/alzheimer/Roberto/bin/aws", "s3", "sync", "--no-sign-request",
            remote_path, str(local_path)]
    print(" ".join(argv))

    # perf_counter measures wall-clock time; process_time only counts CPU time
    # of this Python process and therefore misses the time spent in `aws`.
    start = time.perf_counter()
    result = subprocess.run(argv, stdout=subprocess.PIPE)
    print("time taken: ", time.perf_counter() - start)

    # Surface failed transfers instead of silently moving on.
    if result.returncode != 0 :
        print("WARNING: aws s3 sync exited with status", result.returncode)
  

/alzheimer/Roberto/bin/aws s3 sync --no-sign-request s3://allen-brain-cell-atlas/metadata/MERFISH-C57BL6J-638850/20230830 /alzheimer/Roberto/Allen_Institute/abc_download_root/metadata/MERFISH-C57BL6J-638850/20230830


KeyboardInterrupt: 

### Downloading one 10x expression matrix
The prerequisite to run the 10x part 1 notebook is to have downloaded the log2 version of the "WMB-10Xv2-TH" matrix (4 GB). Download takes ~1 min depending on your network speed.

We define a simple helper function to create the required AWS command. The function prints the command, so you can copy it into a terminal shell, and also runs it inside this notebook through `subprocess.run`; comment out that line if you only want the command printed.

In [None]:
def download_file( file_dict, aws_bin="/alzheimer/Roberto/bin/aws" ) :
    """Download one manifest file with ``aws s3 cp`` and report the elapsed time.

    Prints the file's relative path and size, prints the AWS CLI command, runs
    it, and prints the wall-clock duration. A non-zero exit status of the CLI
    is reported with a warning rather than raised, so a loop over many files
    keeps going.

    Parameters
    ----------
    file_dict : dict
        Manifest file entry; its 'relative_path' and 'size' keys are read.
    aws_bin : str, optional
        Path of the AWS CLI executable to invoke.

    Notes
    -----
    Reads the notebook-level globals ``manifest`` (for 'resource_uri') and
    ``download_base`` (local download root).
    """
    print(file_dict['relative_path'],file_dict['size'])
    local_path = os.path.join( download_base, file_dict['relative_path'] )
    local_path = pathlib.Path( local_path )
    remote_path = manifest['resource_uri'] + file_dict['relative_path']

    # Build the argument list directly rather than splitting a formatted string,
    # so that a download root containing spaces is passed as one argument.
    argv = [aws_bin, "s3", "cp", "--no-sign-request", remote_path, str(local_path)]
    print(" ".join(argv))

    # perf_counter measures wall-clock time; process_time only counts CPU time
    # of this Python process and therefore misses the time spent in `aws`.
    start = time.perf_counter()
    result = subprocess.run(argv, stdout=subprocess.PIPE)
    print("time taken: ", time.perf_counter() - start)

    if result.returncode != 0 :
        print("WARNING: aws s3 cp exited with status", result.returncode)

In [None]:
# Look up the log2 h5ad file of one WMB-10Xv2 matrix and download it.
# NOTE(review): the text above this cell names "WMB-10Xv2-TH", but the matrix
# requested here is 'WMB-10Xv2-Isocortex-3' - confirm which one is intended.
dataset_label = 'WMB-10Xv2'
matrix_label = 'WMB-10Xv2-Isocortex-3'
expression_matrices = manifest['file_listing'][dataset_label]['expression_matrices']
file_dict = expression_matrices[matrix_label]['log2']['files']['h5ad']
print('size:', file_dict['size'])
download_file(file_dict)

### Downloading all matrices

In [31]:
# Reuse the release tag (`version`) and data root (`root_data`) imported from
# Utils.Settings instead of re-hardcoding a release string and an absolute,
# machine-specific path that would silently shadow those settings.
download_base = f'{root_data}/abc_download_root'

# Set to True to read a manifest.json previously saved under download_base
# instead of fetching it from S3.
use_local_cache = False
manifest_path = 'releases/%s/manifest.json' % version

if not use_local_cache :
    url = 'https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/' + manifest_path
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    manifest = json.loads(response.text)
else :
    file = os.path.join(download_base,manifest_path)
    with open(file,'rb') as f:
        manifest = json.load(f)

# File-level listing of the WMB-10X metadata tables.
metadata = manifest['file_listing']['WMB-10X']['metadata']

In [32]:
# Folder for derived "views" of the WMB-10X metadata; it is only created on
# disk when cache_views is switched on.
wmb_metadata_relative_path = manifest['directory_listing']['WMB-10X']['directories']['metadata']['relative_path']
view_directory = os.path.join(download_base, wmb_metadata_relative_path, 'views')

cache_views = False
if cache_views:
    os.makedirs(view_directory, exist_ok=True)

In [33]:
# Read the downloaded cell metadata table and index it by 'cell_label'.
rpath = metadata['cell_metadata']['files']['csv']['relative_path']
file = os.path.join(download_base, rpath)
cell = pd.read_csv(file).set_index('cell_label')

In [34]:
# Count cells per (dataset, feature matrix) pair, using the non-null
# 'library_label' entries of each group as the count.
matrices = (
    cell
    .groupby(['dataset_label','feature_matrix_label'])[['library_label']]
    .count()
    .rename(columns={'library_label': 'cell_count'})
)
matrices

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_count
dataset_label,feature_matrix_label,Unnamed: 2_level_1
WMB-10XMulti,WMB-10XMulti,1687
WMB-10Xv2,WMB-10Xv2-CTXsp,43985
WMB-10Xv2,WMB-10Xv2-HPF,207281
WMB-10Xv2,WMB-10Xv2-HY,99879
WMB-10Xv2,WMB-10Xv2-Isocortex-1,248776
WMB-10Xv2,WMB-10Xv2-Isocortex-2,249360
WMB-10Xv2,WMB-10Xv2-Isocortex-3,249356
WMB-10Xv2,WMB-10Xv2-Isocortex-4,248784
WMB-10Xv2,WMB-10Xv2-MB,29781
WMB-10Xv2,WMB-10Xv2-OLF,192182


In [35]:
# Display the (dataset_label, feature_matrix_label) pairs that index `matrices`.
matrices.index 

MultiIndex([('WMB-10XMulti',          'WMB-10XMulti'),
            (   'WMB-10Xv2',       'WMB-10Xv2-CTXsp'),
            (   'WMB-10Xv2',         'WMB-10Xv2-HPF'),
            (   'WMB-10Xv2',          'WMB-10Xv2-HY'),
            (   'WMB-10Xv2', 'WMB-10Xv2-Isocortex-1'),
            (   'WMB-10Xv2', 'WMB-10Xv2-Isocortex-2'),
            (   'WMB-10Xv2', 'WMB-10Xv2-Isocortex-3'),
            (   'WMB-10Xv2', 'WMB-10Xv2-Isocortex-4'),
            (   'WMB-10Xv2',          'WMB-10Xv2-MB'),
            (   'WMB-10Xv2',         'WMB-10Xv2-OLF'),
            (   'WMB-10Xv2',          'WMB-10Xv2-TH'),
            (   'WMB-10Xv3',          'WMB-10Xv3-CB'),
            (   'WMB-10Xv3',       'WMB-10Xv3-CTXsp'),
            (   'WMB-10Xv3',         'WMB-10Xv3-HPF'),
            (   'WMB-10Xv3',          'WMB-10Xv3-HY'),
            (   'WMB-10Xv3', 'WMB-10Xv3-Isocortex-1'),
            (   'WMB-10Xv3', 'WMB-10Xv3-Isocortex-2'),
            (   'WMB-10Xv3',          'WMB-10Xv3-MB'),
          

In [37]:
# Download the log2 h5ad file of every (dataset, feature matrix) pair found in
# the cell metadata.
for dataset_label, matrix_label in matrices.index:
    expression_matrices = manifest['file_listing'][dataset_label]['expression_matrices']
    matrix_entry = expression_matrices[matrix_label]
    file_dict = matrix_entry['log2']['files']['h5ad']
    print(dataset_label, matrix_label, 'size:', file_dict['size'])
    download_file(file_dict)

WMB-10XMulti WMB-10XMulti size: 89318511
WMB-10Xv2 WMB-10Xv2-CTXsp size: 1740441622
WMB-10Xv2 WMB-10Xv2-HPF size: 6096269724
WMB-10Xv2 WMB-10Xv2-HY size: 2908443982
WMB-10Xv2 WMB-10Xv2-Isocortex-1 size: 8601133978
WMB-10Xv2 WMB-10Xv2-Isocortex-2 size: 9444387082
WMB-10Xv2 WMB-10Xv2-Isocortex-3 size: 8457819034
WMB-10Xv2 WMB-10Xv2-Isocortex-4 size: 8692589466
WMB-10Xv2 WMB-10Xv2-MB size: 817433734
WMB-10Xv2 WMB-10Xv2-OLF size: 5128120156
WMB-10Xv2 WMB-10Xv2-TH size: 4038679930
WMB-10Xv3 WMB-10Xv3-CB size: 5610691342
WMB-10Xv3 WMB-10Xv3-CTXsp size: 3277343842
WMB-10Xv3 WMB-10Xv3-HPF size: 7409633208
WMB-10Xv3 WMB-10Xv3-HY size: 7248338584
WMB-10Xv3 WMB-10Xv3-Isocortex-1 size: 11768194128
WMB-10Xv3 WMB-10Xv3-Isocortex-2 size: 8356210362
WMB-10Xv3 WMB-10Xv3-MB size: 13726487690
WMB-10Xv3 WMB-10Xv3-MY size: 7206054638
WMB-10Xv3 WMB-10Xv3-OLF size: 3114998442
WMB-10Xv3 WMB-10Xv3-P size: 5200570200
WMB-10Xv3 WMB-10Xv3-PAL size: 4067049816
WMB-10Xv3 WMB-10Xv3-STR size: 11915297204
WMB-10Xv3 WM

### Downloading the MERFISH expression matrix

The prerequisite to run the MERFISH part 1 notebook is to have downloaded the log2 version of the "C57BL6J-638850" matrix (7 GB). Download takes ~3 mins depending on your network speed.

In [None]:
# Download the log2 h5ad file of the 'C57BL6J-638850' matrix for each listed
# MERFISH dataset.
datasets = ['MERFISH-C57BL6J-638850']
for dataset_label in datasets:
    expression_matrices = manifest['file_listing'][dataset_label]['expression_matrices']
    matrix_entry = expression_matrices['C57BL6J-638850']
    file_dict = matrix_entry['log2']['files']['h5ad']
    print('size:', file_dict['size'])
    download_file(file_dict)

The prerequisite to run the Zhuang MERFISH notebook is to have downloaded the log2 version of the expression matrices of all 4 brain specimens.

In [None]:
# Download the log2 h5ad file of each Zhuang dataset; the matrix key inside a
# dataset is the dataset label itself.
datasets = ['Zhuang-ABCA-1','Zhuang-ABCA-2','Zhuang-ABCA-3','Zhuang-ABCA-4']
for dataset_label in datasets:
    expression_matrices = manifest['file_listing'][dataset_label]['expression_matrices']
    matrix_entry = expression_matrices[dataset_label]
    file_dict = matrix_entry['log2']['files']['h5ad']
    print('size:', file_dict['size'])
    download_file(file_dict)

### Downloading all image volumes

The prerequisite to run the CCF and MERFISH to CCF registration notebooks is to have downloaded the two sets of image volumes.

In [None]:
# Mirror the 'image_volumes' directory of every release in the manifest with
# `aws s3 sync`. Releases that have no 'image_volumes' directory are skipped.
for r_dict in manifest['directory_listing'].values() :

    d_dict = r_dict['directories'].get('image_volumes')
    if d_dict is None :
        continue

    local_path = pathlib.Path( os.path.join( download_base, d_dict['relative_path'] ) )
    remote_path = manifest['resource_uri'] + d_dict['relative_path']

    # Build the argument list directly rather than splitting a formatted string,
    # so that a download root containing spaces is passed as one argument.
    argv = ["/alzheimer/Roberto/bin/aws", "s3", "sync", "--no-sign-request",
            remote_path, str(local_path)]
    print(" ".join(argv))

    # perf_counter measures wall-clock time; process_time only counts CPU time
    # of this Python process and therefore misses the time spent in `aws`.
    start = time.perf_counter()
    result = subprocess.run(argv, stdout=subprocess.PIPE)
    print("time taken: ", time.perf_counter() - start)

    # Surface failed transfers instead of silently moving on.
    if result.returncode != 0 :
        print("WARNING: aws s3 sync exited with status", result.returncode)
  