# Load MERFISH whole brain spatial transcriptomics



In [3]:
import os
import pandas as pd
import numpy as np
import anndata
import time
import json
import requests
from Utils.Settings import root_data, version

In [5]:
# Root directory of the local ABC atlas download.
download_base = f'{root_data}/abc_download_root'

# When False the release manifest is fetched from the public S3 bucket;
# when True it is read from the copy stored under download_base.
use_local_cache = False
manifest_path = 'releases/%s/manifest.json' % version

if not use_local_cache :
    url = 'https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/' + manifest_path
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail with a clear HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    manifest = json.loads(response.text)
else :
    file = os.path.join(download_base,manifest_path)
    with open(file,'rb') as f:
        manifest = json.load(f)

# Metadata file listing for the MERFISH-C57BL6J-638850 dataset.
metadata = manifest['file_listing']['MERFISH-C57BL6J-638850']['metadata']

In [6]:
# Build the path of the 'views' folder inside the dataset's metadata directory
# and, when caching is enabled, make sure it exists.
metadata_relative_path = (
    manifest['directory_listing']['MERFISH-C57BL6J-638850']
            ['directories']['metadata']['relative_path']
)
view_directory = os.path.join(download_base, metadata_relative_path, 'views')

cache_views = True
if cache_views:
    os.makedirs(view_directory, exist_ok=True)

In [7]:
# Read the cell metadata CSV (cell_label kept as a string) and index it by cell_label.
rpath = metadata['cell_metadata']['files']['csv']['relative_path']
file = os.path.join(download_base, rpath)
cell = pd.read_csv(file, dtype={'cell_label': str}).set_index('cell_label')
print(len(cell))

3938808


In [8]:
# Expression-matrix file listing for the MERFISH-C57BL6J-638850 dataset.
dataset_listing = manifest['file_listing']['MERFISH-C57BL6J-638850']
expression_matrices = dataset_listing['expression_matrices']

In [9]:
# Resolve the local path of the log2 h5ad expression matrix and print it.
log2_files = expression_matrices['C57BL6J-638850']['log2']['files']
rpath = log2_files['h5ad']['relative_path']
file = os.path.join(download_base, rpath)
print(file)

/alzheimer/Roberto/Allen_Institute/abc_download_root/expression_matrices/MERFISH-C57BL6J-638850/20230830/C57BL6J-638850-log2.h5ad


In [10]:
# Open the h5ad file with backed='r' (read-only, file-backed; presumably the
# expression matrix stays on disk until sliced -- confirm against anndata docs).
adata = anndata.read_h5ad(file,backed='r')
# Keep a handle on the per-gene (var) table for the filtering done later.
gene = adata.var

In [11]:
# Display the per-gene (var) table of the opened expression matrix.
adata.var

Unnamed: 0_level_0,gene_symbol,transcript_identifier
gene_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSMUSG00000026778,Prkcq,ENSMUST00000028118
ENSMUSG00000026837,Col5a1,ENSMUST00000028280
ENSMUSG00000001985,Grik3,ENSMUST00000030676
ENSMUSG00000039323,Igfbp2,ENSMUST00000047328
ENSMUSG00000048387,Osr1,ENSMUST00000057021
...,...,...
Blank-33,Blank-33,Blank-33
Blank-34,Blank-34,Blank-34
Blank-45,Blank-45,Blank-45
Blank-23,Blank-23,Blank-23


In [12]:
# Select the genes whose symbol contains "Htr".
# The boolean mask produced by str.contains is applied directly, instead of
# converting it to an array of names and re-testing every symbol in a Python loop.
pred = gene.gene_symbol.str.contains("Htr")
gnames = gene.gene_symbol[pred].values
gene_filtered = gene[pred]
gene_filtered

Unnamed: 0_level_0,gene_symbol,transcript_identifier
gene_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSMUSG00000024798,Htr7,ENSMUST00000164639
ENSMUSG00000034997,Htr2a,ENSMUST00000036653
ENSMUSG00000070687,Htr1d,ENSMUST00000088677
ENSMUSG00000049511,Htr1b,ENSMUST00000183482
ENSMUSG00000032269,Htr3a,ENSMUST00000217289


In [10]:
# Extract the selected gene columns from the backed matrix into a DataFrame
# and report the process CPU time (time.process_time) spent doing so.
start = time.process_time()
gdata = adata[:, gene_filtered.index].to_df()
elapsed = time.process_time() - start
print("time taken: ", elapsed)

time taken:  33.08966260700001


In [11]:
# Relabel the columns from gene identifiers to gene symbols.
gdata.columns = gene_filtered.gene_symbol
# Keep only the rows where the first gene column is not NaN.
first_gene = gdata.columns[0]
pred = gdata[first_gene].notna()
gdata = gdata.loc[pred].copy(deep=True)
print(len(gdata))

4334174


In [12]:
# When caching is enabled, write the filtered expression table to the views directory.
if cache_views:
    file = os.path.join(view_directory, 'example_genes_all_cells_expression.csv')
    gdata.to_csv(path_or_buf=file)

Close h5ad file and clean up

In [13]:
# Close the backing h5ad file handle, then drop the AnnData reference.
adata.file.close()
del adata

In [14]:
# Show the path most recently assigned to `file` (the cached CSV when cache_views is True).
file

'/alzheimer/Roberto/Allen_Institute/abc_download_root/metadata/MERFISH-C57BL6J-638850/20230830/views/example_genes_all_cells_expression.csv'