In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import functools as ft
import os
import time

import anndata as ad
import numpy as np
import pandas as pd
import s3fs
import scanpy as sc

import celltrip

# Set the AWS profile environment variable before the S3 filesystem is created
os.environ['AWS_PROFILE'] = 'waisman-admin'
# Filesystem handle used by the upload cell (`s3.put`); `skip_instance_cache=True`
# asks fsspec to build a new instance instead of reusing a cached one
s3 = s3fs.S3FileSystem(skip_instance_cache=True)


# Reading Files

In [None]:
# Read the paired scGLUE (Chen-2019) files; `backed=True` is forwarded to the reader
processing = celltrip.utility.processing
fnames = [
    '../data/scglue/Chen-2019-RNA.h5ad',
    '../data/scglue/Chen-2019-ATAC.h5ad',
]
adatas = processing.read_adatas(*fnames, backed=True)
processing.test_adatas(*adatas)

# Build the loader and draw one sample from it
dataloader = processing.PreprocessFromAnnData(*adatas)
modalities, adata_obs, adata_vars = dataloader.sample()


In [None]:
# Benchmark: average wall-clock time of one `dataloader.sample()` call.
# Uses whichever `dataloader` the previously executed cell created.
n_trials = 10  # Single source of truth for both the loop count and the divisor
start = time.perf_counter()
for _ in range(n_trials):
    dataloader.sample()
elapsed = time.perf_counter() - start
print(f'Sampling takes ~{elapsed/n_trials:.2f} seconds')


# Reading Large Files

In [None]:
# Read data: fourteen plate paths are generated, only the first four are kept
# NOTE: Each file still takes >1Gb for whatever reason
processing = celltrip.utility.processing
all_plates = [f'../data/tahoe/plate{i}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad' for i in range(1, 15)]
fnames = all_plates[:4]
partition_cols = ['sample', 'plate']

# Open every plate, merge them, and wrap the merged result in a one-element list
plate_adatas = processing.read_adatas(*fnames, backed=True)
merged = processing.merge_adatas(*plate_adatas, backed=True)
adatas = [merged]
processing.test_adatas(*adatas)

# Sample data, partitioning on the columns chosen above
dataloader = processing.PreprocessFromAnnData(*adatas, partition_cols=partition_cols)
modalities, adata_obs, adata_vars = dataloader.sample()


In [None]:
# Benchmark: average wall-clock time of one `dataloader.sample()` call.
# Uses whichever `dataloader` the previously executed cell created.
n_trials = 10  # Single source of truth for both the loop count and the divisor
start = time.perf_counter()
for _ in range(n_trials):
    dataloader.sample()
elapsed = time.perf_counter() - start
print(f'Sampling takes ~{elapsed/n_trials:.2f} seconds')


# Formatting CSV Files

In [None]:
# Prerequisites: input CSVs and the h5ad files they are converted into
fnames = ['../data/MERFISH/s3_mapped_cell_table.csv', '../data/MERFISH/s3_cell_by_gene.csv']
outfiles = ['../data/MERFISH/spatial.h5ad', '../data/MERFISH/expression.h5ad']
spatial_csv, expression_csv = fnames
spatial_out, expression_out = outfiles

# Spatial: the cell table is re-indexed by `sample_name`, then split into
# annotation columns (obs) and adjusted x/y coordinates (X)
cell_table = pd.read_csv(spatial_csv, index_col=0, header=0).set_index('sample_name')
df_obs = cell_table[['area', 'experiment', 'layer']]
coords = cell_table[['xc_adjusted', 'yc_adjusted']]
ad.AnnData(coords, obs=df_obs).write_h5ad(spatial_out)

# Gene expression: column names are taken from the second line of the file
# (`header=1`); the same obs table is attached as for the spatial file
expression = pd.read_csv(expression_csv, index_col=0, header=1)
expression.index.name = 'sample_name'
ad.AnnData(expression, obs=df_obs).write_h5ad(expression_out)


In [None]:
# Read the converted MERFISH files (`backed=False`) and check them with the
# `layer` column as the partition column
processing = celltrip.utility.processing
fnames = ['../data/MERFISH/expression.h5ad', '../data/MERFISH/spatial.h5ad']
partition_cols = 'layer'
adatas = processing.read_adatas(*fnames, backed=False)
processing.test_adatas(*adatas, partition_cols=partition_cols)

# Dataloader (optional extras left disabled: mask=.8, seed=42)
loader_kwargs = dict(partition_cols=partition_cols, pca_dim=128)
dataloader = processing.PreprocessFromAnnData(*adatas, **loader_kwargs)
modalities, adata_obs, adata_vars = dataloader.sample()


# Other Files

In [None]:
# Temporal Brain
# Converts the RNA (GSE204683) and ATAC (GSE204682) count matrices, stored as
# `.RDS` files with separate barcode tables, into two `.h5ad` files.
import rds2py
fnames = ['../data/temporalBrain/GSE204683_count_matrix.RDS', '../data/temporalBrain/GSE204682_count_matrix.RDS']
barcodes = ['../data/temporalBrain/GSE204683_barcodes.tsv', '../data/temporalBrain/GSE204682_barcodes.tsv']
outfiles = ['../data/temporalBrain/expression.h5ad', '../data/temporalBrain/peaks.h5ad']

# Load RNA
rdata = rds2py.parse_rds(fnames[0])
# Mirror the capitalized `Dimnames` attribute under a lowercase key
# (presumably the key the dispatcher looks up — TODO confirm)
rdata['attributes']['dimnames'] = rdata['attributes']['Dimnames']
wrapper = rds2py.generics._dispatcher(rdata)  # NOTE(review): underscore-prefixed rds2py helper; may change between versions
# M1: transposed matrix; F1 / C1: first / second dimnames entries (used below as
# var names and as obs row labels respectively)
M1, F1, C1 = wrapper.matrix.T, *[np.array(sl) for sl in wrapper.dimnames]
B1 = pd.read_csv(barcodes[0], delimiter='\t')

# Load ATAC (same steps as for RNA)
rdata = rds2py.parse_rds(fnames[1])
rdata['attributes']['dimnames'] = rdata['attributes']['Dimnames']
wrapper = rds2py.generics._dispatcher(rdata)
M2, F2, C2 = wrapper.matrix.T, *[np.array(sl) for sl in wrapper.dimnames]
B2 = pd.read_csv(barcodes[1], delimiter='\t')

# Get sample ids
# Prefix before the first '_' of every C1 label, and how many labels carry each prefix
uniq_col, count_col = np.unique([e.split('_')[0] for e in C1], return_counts=True)
assert np.unique(count_col, return_counts=True)[1].max() == 1  # No duplicate counts, otherwise manual annotation is needed
# Get donor ids
# Unique `Donor ID` values of the RNA barcode table and their row counts
uniq_donor, count_donor = np.unique(B1['Donor ID'], return_counts=True)
# Both groupings must have the same group sizes for the count-based matching below
assert (np.sort(count_col) == np.sort(count_donor)).all()
# Convert (aided by `preprocessing.R`)
# Pair each donor with the prefix that has the same group size: donor -> prefix
name_to_id = {d: c for c, d in zip(uniq_col[np.argsort(count_col)], uniq_donor[np.argsort(count_donor)])}
# Set indices
# Cell ID is built as '<prefix>_<Barcode>' for both barcode tables
B1['Cell ID'] = B1.apply(lambda r: f'{name_to_id[r["Donor ID"]]}_{r["Barcode"]}', axis=1)
B2['Cell ID'] = B2.apply(lambda r: f'{name_to_id[r["Donor ID"]]}_{r["Barcode"]}', axis=1)
# Check that re-indexing the RNA barcode table by `C1` leaves it unchanged
assert (B1.set_index('Cell ID') == B1.set_index('Cell ID').loc[C1]).all().all()  # For some reason, `barcodes2` doesn't line up with `C2`, so we assume both meta are the correct order
B2['Cell ID'] = B1['Cell ID']  # Set Cell IDs to be the same

# RNA AnnData
# obs rows are selected in `C1` order; var names come from `F1`
adata = ad.AnnData(M1, obs=B1.set_index('Cell ID').loc[C1])
adata.var_names = F1
adata.write_h5ad(outfiles[0])

# ATAC AnnData
# obs keeps the barcode-table order (it is not re-indexed by `C2`, see note above)
adata = ad.AnnData(M2, obs=B2.set_index('Cell ID'))
adata.var_names = F2
adata.write_h5ad(outfiles[1])


In [None]:
# Flysta3D
periods = ['E14-16h_a', 'E16-18h_a', 'L1_a', 'L2_a', 'L3_b']  # [4:]
fnames = [f'../data/Flysta3D/{p}_count_normal_stereoseq.h5ad' for p in periods]
adatas_rna = celltrip.utility.processing.read_adatas(*fnames, backed=True)

# Annotate development: every row of a file gets that file's period label
for p, rna in zip(periods, adatas_rna):
    rna.obs['development'] = p

# Resave expression adatas
# NOTE: Flysta doesn't have `encoding-type` under attributes for whatever reason, which necessitates this
for p, rna in zip(periods, adatas_rna):
    resaved = ad.AnnData(rna.X, obs=rna.obs, var=rna.var)
    resaved.write_h5ad(f'../data/Flysta3D/{p}_expression.h5ad')

# Create spatial adatas from the `spatial` entry of `obsm`, sharing the annotated obs
adatas_spatial = []
for rna in adatas_rna:
    adatas_spatial.append(ad.AnnData(rna.obsm['spatial'], obs=rna.obs))
for p, spatial in zip(periods, adatas_spatial):
    spatial.write_h5ad(f'../data/Flysta3D/{p}_spatial.h5ad')

# Test
# periods = ['E14-16h_a', 'E16-18h_a', 'L1_a', 'L2_a', 'L3_b']
# fnames_1 = [f'../data/Flysta3D/{p}_expression.h5ad' for p in periods]
# fnames_2 = [f'../data/Flysta3D/{p}_spatial.h5ad' for p in periods]
# adatas_rna = celltrip.utility.processing.read_adatas(*fnames_1, backed=True)
# adatas_spatial = celltrip.utility.processing.read_adatas(*fnames_2, backed=True)
# adatas_rna = celltrip.utility.processing.merge_adatas(*adatas_rna, backed=True)
# adatas_spatial = celltrip.utility.processing.merge_adatas(*adatas_spatial, backed=True)
# adatas = [adatas_rna, adatas_spatial]
# dataloader = celltrip.utility.processing.PreprocessFromAnnData(*adatas, partition_cols='slice_ID')
# modalities, adata_obs, adata_vars = dataloader.sample()


In [None]:
# scMultiSim
# Converts the RNA and ATAC CSV matrices into h5ad files that share one obs table.
fnames = ['../data/scMultiSim/scMultiSim_RNA_counts_1250_genes.csv', '../data/scMultiSim/scMultiSim_ATAC_seq_1250_genes_new.csv']
fname_meta = '../data/scMultiSim/cell_meta_1250_genes.csv'
outfiles = ['../data/scMultiSim/expression.h5ad', '../data/scMultiSim/peaks.h5ad']
# The metadata table is transposed after loading, before being attached as `obs`
meta = pd.read_csv(fname_meta, index_col=0).T
for fname, out_fname in zip(fnames, outfiles):
    # Read the file for THIS modality. Reading `fnames[0]` on every iteration
    # would write the RNA counts into both output files.
    X = pd.read_csv(fname)
    adata = ad.AnnData(X)
    adata.obs = meta
    adata.write_h5ad(out_fname)

# Test
# fnames = ['../data/scMultiSim/expression.h5ad', '../data/scMultiSim/peaks.h5ad']
# adatas = celltrip.utility.processing.read_adatas(*fnames, backed=True)
# dataloader = celltrip.utility.processing.PreprocessFromAnnData(*adatas)
# modalities, adata_obs, adata_vars = dataloader.sample()

In [None]:
# MERFISH CMAP Benchmark
# Converts the simulated location and count CSVs into spatial / expression h5ad files.
# (No `adatas` list is created here: it was never used and only overwrote the
# notebook-level `adatas` loaded by earlier cells.)
fnames = ['../data/MERFISH_Bench/sim.cmap.spatial_location.csv', '../data/MERFISH_Bench/sim.cmap.spatial_count.csv']
outfiles = ['../data/MERFISH_Bench/spatial.h5ad', '../data/MERFISH_Bench/expression.h5ad']

# Spatial: annotation columns become obs, the x/y columns become X
fname = fnames[0]
df = pd.read_csv(fname)
df_obs = df[['pattern_gp_label', 'x_round', 'y_round', 'HMRF_k3_b.40']]
df = df[['x', 'y']]
adata = ad.AnnData(df, obs=df_obs)
adata.write_h5ad(outfiles[0])

# Expression: the count table is transposed after loading and reuses the same obs
fname = fnames[1]
df = pd.read_csv(fname).T
adata = ad.AnnData(df, obs=df_obs)
adata.write_h5ad(outfiles[1])

# Test
# fnames = ['../data/MERFISH_Bench/expression.h5ad', '../data/MERFISH_Bench/spatial.h5ad']
# partition_cols = None
# adatas = celltrip.utility.processing.read_adatas(*fnames, backed=False)
# celltrip.utility.processing.test_adatas(*adatas, partition_cols=partition_cols)
# dataloader = celltrip.utility.processing.PreprocessFromAnnData(
#     *adatas, partition_cols=partition_cols, pca_dim=128)  # , mask=.8, seed=42
# modalities, adata_obs, adata_vars = dataloader.sample()


In [None]:
# dyngen simulation data
fnames = ['../data/dyngen/dyngen_sim.h5ad']
outfile_prefix = '../data/dyngen/'
outfiles = []

# Load the single input file, then write each of its layers to a separate
# h5ad that reuses the original obs and var tables
(adata,) = celltrip.utility.processing.read_adatas(*fnames, backed=True)
for layer in adata.layers:
    layer_path = os.path.join(outfile_prefix, f'{layer}.h5ad')
    ad.AnnData(adata.layers[layer], obs=adata.obs, var=adata.var).write_h5ad(layer_path)
    outfiles.append(layer_path)
print(outfiles)

In [None]:
# CancerVel simulation data
fname = '../data/CancerVel/K562_cancer_data.h5ad'
outfile = '../data/CancerVel/expression.h5ad'

# Load (no `backed` argument: must be read into memory for transpose) and transpose
(adata,) = celltrip.utility.processing.read_adatas(fname)
transposed = adata.T

# NA values are not supported for partitions, so each categorical column gets
# an explicit 'None' category that replaces its missing entries
na_columns = ('sgAssign', 'sgAssign2', 'sgAssign3', 'sgAssignNew')
for col in na_columns:
    with_none = transposed.obs[col].cat.add_categories('None')
    transposed.obs[col] = with_none.fillna('None')
transposed.write_h5ad(outfile)

In [None]:
# MERFISH30k simulation data
fname = '../data/MERFISH30k/comb.h5ad'
outfile = '../data/MERFISH30k/spatial.h5ad'

# Build the spatial file from the `x` / `y` columns stored in `obs`;
# the full obs table is kept as the annotations of the new object
adata, = celltrip.utility.processing.read_adatas(fname, backed=True)
spatial_adata = ad.AnnData(X=adata.obs[['x', 'y']], obs=adata.obs)
spatial_adata.write_h5ad(outfile)

# Upload to S3

In [None]:
# Aggregate files to upload
# Uncomment the `queue_upload(...)` line(s) of every dataset that should be sent.
S3_BUCKET = 's3://nkalafut-celltrip'
fnames = []; folders = []


def queue_upload(folder, paths):
    """Queue local `paths` for upload into `folder` of the bucket.

    Extends the cell-level `fnames` and `folders` lists in lockstep, so every
    file stays paired with its own dataset folder regardless of how many
    datasets are enabled at once.
    """
    paths = list(paths)
    fnames.extend(paths)
    folders.extend([folder] * len(paths))


def s3_destination(fname, folder):
    """Return the S3 URI for `fname` inside `folder`.

    The URI is joined with '/' explicitly; `os.path.join` would insert the
    local path separator, which is a backslash on Windows.
    """
    return f'{S3_BUCKET}/{folder}/{os.path.basename(fname)}'


# scGLUE
# queue_upload('scGLUE', ['../data/scglue/Chen-2019-RNA.h5ad', '../data/scglue/Chen-2019-ATAC.h5ad'])
# partition_cols = None

# MERFISH
# queue_upload('MERFISH', ['../data/MERFISH/expression.h5ad', '../data/MERFISH/spatial.h5ad'])
# partition_cols = None

# Temporal Brain ('Donor ID')
# queue_upload('TemporalBrain', ['../data/temporalBrain/expression.h5ad', '../data/temporalBrain/peaks.h5ad'])
# partition_cols = None

# TAHOE-100M ('sample')
# queue_upload('Tahoe', [f'../data/tahoe/plate{i}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad' for i in range(1, 15)])
# partition_cols = None

# Flysta3D ('slice_ID')
# periods = ['E14-16h_a', 'E16-18h_a', 'L1_a', 'L2_a', 'L3_b']
# queue_upload('Flysta3D', (
#     # [f'../data/Flysta3D/{p}_count_normal_stereoseq.h5ad' for p in periods]
#     [f'../data/Flysta3D/{p}_expression.h5ad' for p in periods]
#     + [f'../data/Flysta3D/{p}_spatial.h5ad' for p in periods]))

# scMultiSim
# queue_upload('scMultiSim', ['../data/scMultiSim/expression.h5ad', '../data/scMultiSim/peaks.h5ad'])

# MERFISH CMAP Benchmark
# queue_upload('MERFISH_Bench', ['../data/MERFISH_Bench/expression.h5ad', '../data/MERFISH_Bench/spatial.h5ad'])

# Virtual Cell Challenge
# queue_upload('VirtualCell', ['../data/VirtualCell/vcc_flt_data.h5ad'])

# dyngen
# queue_upload('dyngen', [
#     '../data/dyngen/counts_protein.h5ad', '../data/dyngen/counts_spliced.h5ad',
#     '../data/dyngen/counts_unspliced.h5ad', '../data/dyngen/logcounts.h5ad',
#     '../data/dyngen/rna_velocity.h5ad'])

# CancerVel
# queue_upload('CancerVel', ['../data/CancerVel/expression.h5ad'])

# MERFISH30k
# queue_upload('MERFISH30k', ['../data/MERFISH30k/comb.h5ad', '../data/MERFISH30k/spatial.h5ad'])

# Upload
# `zip` would silently drop unmatched entries, so fail loudly if the lists diverge
assert len(fnames) == len(folders), 'every queued file needs exactly one target folder'
for fname, folder in zip(fnames, folders):
    s3.put(fname, s3_destination(fname, folder))


# Perturb or Knock Features in Processed Space

In [None]:
# Perform gene knockdown in processed data
# NOTE: relies on `dataloader`, `modalities` and `adata_obs` left in the kernel by
# the most recently executed dataloader cell.
modality_to_test = 0
features_to_test = [200, 150]  # presumably indices on the unfiltered feature axis — TODO confirm
# Rebinds `adatas` and `adata_vars`; the middle return value is discarded
adatas, _, adata_vars = dataloader.get_transformables()
# Transform the sampled cells of the chosen modality restricted to the selected
# features; presumably this yields those features' contribution in processed
# space — TODO confirm against `preprocessing.transform`
iso_modality = dataloader.preprocessing.transform(
    adatas[modality_to_test][adata_obs[modality_to_test].index].X,
    # force_filter=True,
    subset_features=features_to_test,
    subset_modality=modality_to_test)[0]
# Knockdown = processed sample minus the isolated contribution
knocked_modality = modalities[modality_to_test] - iso_modality

# Verify the knockdown works
# Map both the original and the knocked sample back through `inverse_transform`
orig_modality, = dataloader.preprocessing.inverse_transform(modalities[modality_to_test], subset_modality=modality_to_test)
inv_modality, = dataloader.preprocessing.inverse_transform(knocked_modality, subset_modality=modality_to_test)
# Per-feature absolute change summed over axis 0, divided by the stored `standardize_std`
change = np.abs(orig_modality - inv_modality).sum(axis=0) / dataloader.preprocessing.standardize_std[modality_to_test]
# Take as many top-changed positions as there are targets and look them up in
# `filter_mask` (presumably translating filtered positions to original feature
# indices — TODO confirm)
most_changed_idx = dataloader.preprocessing.filter_mask[modality_to_test][np.argsort(change).flatten()[-len(features_to_test):]]
print(f'Targets: {np.array(features_to_test)}, Most Changed: {most_changed_idx}, should be the same elements')


In [None]:
# Benchmark: average wall-clock time of one `dataloader.sample()` call.
# Uses whichever `dataloader` the previously executed cell created.
n_trials = 10  # Single source of truth for both the loop count and the divisor
start = time.perf_counter()
for _ in range(n_trials):
    dataloader.sample()
elapsed = time.perf_counter() - start
print(f'Sampling takes ~{elapsed/n_trials:.2f} seconds')
