In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
import os
import functools as ft

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

import celltrip


In [None]:
# TODO: Save preprocessing class


# Reading Files

In [None]:
# scGLUE Chen-2019 inputs: RNA file first, then ATAC file
fnames = [
    '../data/scglue/Chen-2019-RNA.h5ad',
    '../data/scglue/Chen-2019-ATAC.h5ad',
]

# Read both files with `on_disk=True`, then run celltrip's `test_adatas` on them
processing = celltrip.utility.processing
adatas = processing.read_adatas(*fnames, on_disk=True)
processing.test_adatas(*adatas)

# Build the preprocessing dataloader and draw one sample from it
dataloader = processing.PreprocessFromAnnData(*adatas)
modalities, adata_obs, adata_vars = dataloader.sample()


In [None]:
# Scratch check on modality 0: subtract the transform of a single feature
# (index 0) from the sampled, processed modality, map the result back with
# `inverse_transform`, and compare against the round-tripped original data.
# Relies on `dataloader`, `modalities` and `adata_obs` from the cell above.
# TODO: Make reverse PCA function

# Re-bind `adatas` and `adata_vars` from the dataloader; the middle return
# value is discarded.
adatas, _, adata_vars = dataloader.get_transformables()
# adata_obs = dataloader.preprocessing.subsample(adata_obs=adata_obs, partition_cols=dataloader.partition_cols)

# Transform the rows of modality 0 selected by the sampled obs index, with
# `subset_features=[0]` -- presumably this isolates feature 0's contribution
# in processed space; confirm against `transform`'s documentation.
sing_modalities, _ = dataloader.preprocessing.transform(adatas[0][adata_obs[0].index].X, adata_vars=adata_vars[0], subset_features=[0], subset_modality=0)

# Inverse-transform the sampled modality with that contribution subtracted.
inv_modalities = dataloader.preprocessing.inverse_transform(modalities[0]-sing_modalities[0], subset_modality=0)

# Round trip (transform then inverse_transform) of the same rows, as reference.
# NOTE(review): `transform`'s return value is unpacked into two names above but
# passed whole to `inverse_transform` here (and without `subset_modality` on the
# inverse call) -- confirm both usages match the current API.
orig_modalities = dataloader.preprocessing.inverse_transform(
    dataloader.preprocessing.transform(adatas[0][adata_obs[0].index].X, subset_modality=0))

# Displayed as the cell output: difference between the reference (columns
# selected by `filter_mask[0]`) and the feature-subtracted reconstruction.
orig_modalities[0][:, dataloader.preprocessing.filter_mask[0]] - inv_modalities[0]


  self._set_arrayXarray(i, j, x)


In [None]:
import time

# Time repeated `dataloader.sample()` calls and report the mean duration per call
n_trials = 10
start = time.perf_counter()
for _ in range(n_trials):
    dataloader.sample()
elapsed = time.perf_counter() - start
print(f'Sampling takes ~{elapsed/n_trials:.2f} seconds')


Sampling takes ~5.35 seconds


# Reading Large Files

In [None]:
# Tahoe-100M plate files: one path per plate number, 1 through 14
fnames = [
    f'../data/tahoe/plate{i}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad'
    for i in range(1, 15)
]
partition_cols = ['sample', 'plate']

# Only the first two plate files are read here
processing = celltrip.utility.processing
plate_adatas = processing.read_adatas(*fnames[:2], on_disk=True)

# Wrap the plates in one `AnnCollection`
# NOTE: Use concat if in memory
collection = ad.experimental.AnnCollection(plate_adatas)
# Add var dataframe to `AnnCollection` object, taken from its first member
collection.var = collection.adatas[0].var
adatas = [collection]
processing.test_adatas(*adatas)

# Build the preprocessing dataloader with the partition columns and draw one sample
dataloader = processing.PreprocessFromAnnData(*adatas, partition_cols=partition_cols)
modalities, adata_obs, adata_vars = dataloader.sample()


In [None]:
import time

# Time repeated `dataloader.sample()` calls and report the mean duration per call
n_trials = 10
start = time.perf_counter()
for _ in range(n_trials):
    dataloader.sample()
elapsed = time.perf_counter() - start
print(f'Sampling takes ~{elapsed/n_trials:.2f} seconds')


Sampling takes ~29.49 seconds


# Formatting CSV Files

In [None]:
# Input CSVs (cell table, cell-by-gene counts) and the h5ad files written from them
fnames = [
    '../data/MERFISH/s3_mapped_cell_table.csv',
    '../data/MERFISH/s3_cell_by_gene.csv',
]
outfiles = [
    '../data/MERFISH/spatial.h5ad',
    '../data/MERFISH/expression.h5ad',
]
# Not read anywhere in this cell; kept so the kernel state matches the original
partition_cols = 'experiment'
adatas = []

# Spatial modality: index the cell table by `sample_name`, split it into
# obs annotations and coordinate columns, and write the coordinates out
obs_columns = ['area', 'experiment', 'layer']
coord_columns = ['xc_adjusted', 'yc_adjusted']
fname = fnames[0]
df = pd.read_csv(fname, index_col=0, header=0)
df = df.set_index('sample_name')
df_obs = df[obs_columns]
df = df[coord_columns]
adata = ad.AnnData(df, obs=df_obs)
adata.write_h5ad(outfiles[0])

# Expression modality: the table's first column becomes the index (renamed to
# `sample_name`); the obs annotations from the spatial table are reused
fname = fnames[1]
df = pd.read_csv(fname, index_col=0, header=1)
df.index.name = 'sample_name'
adata = ad.AnnData(df, obs=df_obs)
adata.write_h5ad(outfiles[1])




In [None]:
# MERFISH h5ad inputs: expression first, then spatial
fnames = [
    '../data/MERFISH/expression.h5ad',
    '../data/MERFISH/spatial.h5ad',
]
partition_cols = 'layer'

# Read with `on_disk=False` and run celltrip's `test_adatas` with the partition column
processing = celltrip.utility.processing
adatas = processing.read_adatas(*fnames, on_disk=False)
processing.test_adatas(*adatas, partition_cols=partition_cols)

# Build the preprocessing dataloader (pca_dim=128) and draw one sample
dataloader_kwargs = dict(partition_cols=partition_cols, pca_dim=128)
dataloader = processing.PreprocessFromAnnData(*adatas, **dataloader_kwargs)
modalities, adata_obs, adata_vars = dataloader.sample()


  svd_solver='auto',


In [None]:
# Gene knockdown in processed data: transform only the target features and
# subtract the result from the sampled, processed modality
modality_to_test = 0
features_to_test = [200, 150]
adatas, _, adata_vars = dataloader.get_transformables()
preprocessing = dataloader.preprocessing
sampled_index = adata_obs[modality_to_test].index
iso_modality = preprocessing.transform(
    adatas[modality_to_test][sampled_index].X,
    force_filter=True,
    subset_features=features_to_test,
    subset_modality=modality_to_test,
)[0]
knocked_modality = modalities[modality_to_test] - iso_modality

# Verify the knockdown works: inverse-transform both the sampled and the
# knocked-down modality and compare them
(orig_modality,) = preprocessing.inverse_transform(
    modalities[modality_to_test], subset_modality=modality_to_test)
(inv_modality,) = preprocessing.inverse_transform(
    knocked_modality, subset_modality=modality_to_test)

# Absolute difference summed over axis 0, scaled by the stored `standardize_std`
abs_diff = np.abs(orig_modality - inv_modality)
change = abs_diff.sum(axis=0) / preprocessing.standardize_std[modality_to_test]

# Take the positions with the largest change (as many as there are targets)
# and look them up in `filter_mask` for comparison with the targets
n_targets = len(features_to_test)
top_changed = np.argsort(change).flatten()[-n_targets:]
most_changed_idx = preprocessing.filter_mask[modality_to_test][top_changed]
print(f'Targets: {np.array(features_to_test)}, Most Changed: {most_changed_idx}, should be the same elements')


Targets: [200 150], Most Changed: [150 200], should be the same elements


In [None]:
import time

# Time repeated `dataloader.sample()` calls and report the mean duration per call
n_trials = 10
start = time.perf_counter()
for _ in range(n_trials):
    dataloader.sample()
elapsed = time.perf_counter() - start
print(f'Sampling takes ~{elapsed/n_trials:.2f} seconds')


Sampling takes ~0.00 seconds
