In [1]:
import gzip
import shutil
import os
import scanpy as sc
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

In [2]:
expression_dir = 'data/cerebellum/SCP795/expression/'
experiment_dict = {
    'mouse_atlas': '5e5e1336771a5b0f0416fb21',
    'mouse_developmental': '60d11736771a5b0b5557da9a'
}

## unzip compressed files

In [3]:
def unzip(experiment, file_name):
    with gzip.open(
        os.path.join(
            expression_dir, experiment_dict[experiment], 
            file_name
        ), 'rb') as f_in:
        with open(
            os.path.join(
                expression_dir, experiment_dict[experiment], 
                file_name.replace('.gz', '')
            ), 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [4]:
unzip('mouse_atlas', 'gene_sorted-raw_matrix.mtx.gz')

In [5]:
unzip('mouse_developmental', 'dev_expression_matrix.mtx.gz')
unzip('mouse_developmental', 'dev_barcodes.tsv.gz')
unzip('mouse_developmental', 'dev_genes.tsv.gz')

## Developmental

In [6]:
with open(os.path.join(
            expression_dir, experiment_dict['mouse_developmental'], 
            'dev_barcodes.tsv'
        ), 'r') as f:
    dev_cell_barcodes = [x.replace('\n', '') for x in f.readlines()]

with open(os.path.join(
            expression_dir, experiment_dict['mouse_developmental'], 
            'dev_genes.tsv'
        ), 'r') as f:
    dev_genes = [x.replace('\n', '') for x in f.readlines()]

dev_ann = sc.read_mtx(
    os.path.join(
        expression_dir, experiment_dict['mouse_developmental'], 
        'dev_expression_matrix.mtx')
)

In [7]:
dev_ann = sc.read_mtx(
    os.path.join(
        expression_dir, experiment_dict['mouse_developmental'], 
        'dev_expression_matrix.mtx')
)

In [8]:
dev_ann.var_names = dev_cell_barcodes
dev_ann.obs_names = dev_genes

In [9]:
np.unique(['_'.join(x.split('_')[0:2]) for x in dev_cell_barcodes])

array(['E18_1', 'P0_1', 'P12_1', 'P12_2', 'P16_1', 'P16_2', 'P4_2',
       'P8_2'], dtype='<U5')

## Atlas

In [3]:
atlas_meta = pd.read_csv('data/cerebellum/SCP795/metadata/cb_adult_dev_all_hum_metadata.tsv', sep='\t', low_memory=False)

In [4]:
atlas_meta.columns

Index(['NAME', 'nGene', 'nUMI', 'sex', 'region', 'organ_region',
       'organ_region__ontology_label', 'cluster', 'subcluster',
       'cell_type__custom', 'cell_type', 'cell_type__ontology_label',
       'donor_id', 'biosample_id', 'biosample_type', 'preservation_method',
       'disease', 'disease__ontology_label', 'species',
       'species__ontology_label', 'library_preparation_protocol',
       'library_preparation_protocol__ontology_label', 'organ',
       'organ__ontology_label', 'Age', 'Pseudotime'],
      dtype='object')

In [4]:
atlas_meta['cluster_sex'] = atlas_meta.cluster + '-' + atlas_meta.sex

In [5]:
cluster_dict = {}
for i, r in tqdm(atlas_meta.iterrows()):
    cluster_dict[r['NAME']] = r['cluster']
del cluster_dict['TYPE']
with open('data/cerebellum/SCP795/processed/cell_type_dict.json', 'w') as f:
    json.dump(cluster_dict, f)

cluster_dict_by_sex = {}
for i, r in tqdm(atlas_meta.iterrows()):
    cluster_dict_by_sex[r['NAME']] = r['cluster_sex']
del cluster_dict_by_sex['TYPE']
with open('data/cerebellum/SCP795/processed/cell_type_dict_by_sex.json', 'w') as f:
    json.dump(cluster_dict_by_sex, f)

766812it [00:25, 29627.35it/s]
766812it [00:25, 29551.00it/s]


In [6]:
atlas_ann = sc.read_mtx(
    os.path.join(
        expression_dir, experiment_dict['mouse_atlas'], 
        'gene_sorted-raw_matrix.mtx')
)

In [7]:
with open(os.path.join(
            expression_dir, experiment_dict['mouse_atlas'], 
            'barcodes_cerebellum.tsv'
        ), 'r') as f:
    atlas_cell_barcodes = [x.replace('\n', '') for x in f.readlines()]

with open(os.path.join(
            expression_dir, experiment_dict['mouse_atlas'], 
            'genes_cerebellum.tsv'
        ), 'r') as f:
    atlas_genes = [x.replace('\n', '') for x in f.readlines()]

In [None]:
atlas_cell_types = [cluster_dict[x] for x in atlas_cell_barcodes]
atlas_unique_cell_types = np.unique(atlas_cell_types)
atlas_unique_cell_types

atlas_ann.var_names = atlas_cell_barcodes
atlas_ann.obs_names = atlas_genes
atlas_ann.var['cell_types'] = atlas_cell_types
atlas_ann.var['region'] = [x.split('_')[0] for x in atlas_cell_barcodes]
atlas_ann.var['sample'] = [x.split('_')[1] for x in atlas_cell_barcodes]

for ct in tqdm(atlas_unique_cell_types):
    subset = atlas_ann[:, atlas_ann.var['cell_types'] == ct]
    subset.write_h5ad(f'data/cerebellum/SCP795/processed/{ct}.h5ad')

In [8]:
atlas_cell_types_by_sex = [cluster_dict_by_sex[x] for x in atlas_cell_barcodes]
atlas_unique_cell_types_by_sex = np.unique(atlas_cell_types_by_sex)
atlas_unique_cell_types_by_sex

atlas_ann.var_names = atlas_cell_barcodes
atlas_ann.obs_names = atlas_genes
atlas_ann.var['cell_types_by_sex'] = atlas_cell_types_by_sex
# atlas_ann.var['cell_types'] = atlas_cell_types
atlas_ann.var['region'] = [x.split('_')[0] for x in atlas_cell_barcodes]
atlas_ann.var['sample'] = [x.split('_')[1] for x in atlas_cell_barcodes]

for ct in tqdm(atlas_unique_cell_types_by_sex):
    subset = atlas_ann[:, atlas_ann.var['cell_types_by_sex'] == ct]
    subset.write_h5ad(f'data/cerebellum/SCP795/processed/{ct}.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[k