In [1]:
from collections import defaultdict
import itertools
import os
import sys

import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse
import torch
import wandb

import celltrip

# Set the AWS profile name for this process; presumably picked up when the s3:// inputs are opened -- TODO confirm
os.environ['AWS_PROFILE'] = 'waisman-admin'

# Data Setup

In [None]:
# Run name (also used as the sub-folder name under runs/)
# NOTE(review): the name says 'L3_a' while the files merged below are the 'L2_a' sample -- confirm which is intended
run_name = 'Flysta3D_L3_a'

# Policy and Environment
train_split = 0.8  # Forwarded to the dataloader as `mask`
train_partitions = False  # Forwarded to the dataloader as `mask_partitions`
input_modalities = None
target_modalities = [1]
dim = 8

# Data
input_files = list()
# One inner list per modality, each holding one file per sample
merge_files = [
    [f's3://nkalafut-celltrip/Flysta3D/{sample}_{modality}.h5ad' for sample in ('L2_a',)]
    for modality in ('expression', 'spatial')
]
backed = True
partition_cols = ['development']  # slice_ID
type_key = 'annotation'  # MERFISH ('layer'), scGLUE ('cell_type'), scMultiSim ('cell.type'), Flysta3D ('annotation'), TemporalBrain ('Cell type')


In [None]:
# # Run name
# run_name = 'MERFISH'

# # Policy and Environment
# train_split = .8
# train_partitions = False
# input_modalities = None
# target_modalities = [1]
# dim = 8

# # Data
# input_files = ['s3://nkalafut-celltrip/MERFISH/expression.h5ad', 's3://nkalafut-celltrip/MERFISH/spatial.h5ad']
# merge_files = []
# backed = True
# partition_cols = None
# type_key = 'layer'  # MERFISH ('layer'), scGLUE ('cell_type'), scMultiSim ('cell.type'), Flysta3D ('annotation'), TemporalBrain ('Cell type')


In [4]:
# Seed the global PyTorch and NumPy generators
torch.random.manual_seed(42)
np.random.seed(42)

# Keyword arguments forwarded to the dataloader and the environment
# TODO: Subsampling, maybe remove initial env.reset()
dataloader_kwargs = {
    'mask': train_split,
    'mask_partitions': train_partitions,
    'num_nodes': 1_500,
}
environment_kwargs = {
    'input_modalities': input_modalities,
    'target_modalities': target_modalities,
    'dim': dim,
}  # , 'spherical': discrete

# Initialize locally
env_init, policy_init, memory_init = celltrip.train.get_initializers(
    input_files=input_files,
    merge_files=merge_files,
    backed=backed,
    partition_cols=partition_cols,
    dataloader_kwargs=dataloader_kwargs,
    environment_kwargs=environment_kwargs)
env = env_init().to('cuda')

# Keep the dataloader's mask for later use, then clear it on the dataloader
full_train_mask = env.dataloader.mask
env.dataloader.mask = None

# Directories (BASE_FOLDER is the working directory at the time this cell runs)
BASE_FOLDER = os.path.abspath('')
DATA_FOLDER = os.path.join(BASE_FOLDER, '../data')
RUN_FOLDER = os.path.join(BASE_FOLDER, 'runs', run_name)




In [5]:
# Reset the environment with no partition specified; the explicit-partition call is kept commented out for reference
# env.reset(partition=('L2_a',))
env.reset(partition=None)


In [6]:
# # Parameters
# # rypltvk5 (ts), 32jqyk54, c8zsunc9,
# run_id = '32jqyk54'
# total_statistics = False

# # Get run
# api = wandb.Api()
# run = api.run(f'oafish/cellTRIP/{run_id}')
# config = defaultdict(lambda: {})
# for k, v in run.config.items():
#     dict_name, key = k.split('/')
#     config[dict_name][key] = v
# config = dict(config)

# # Parameters
# dataset_name = config['data']['dataset']
# imputation_target = config['env']['reward_distance_target']
# dimensions = config['env']['dim']
# notebook_seed = np.random.randint(2**32)

# # Apply seed
# torch.manual_seed(notebook_seed)
# if torch.cuda.is_available(): torch.cuda.manual_seed(notebook_seed)
# np.random.seed(notebook_seed)

# # Derivatives
# RUN_FOLDER = os.path.join(BASE_FOLDER, 'runs', dataset_name)

# # Load data and save to file
# modalities, types, features = data.load_data(dataset_name, DATA_FOLDER)
# ppc = celltrip.utilities.Preprocessing(**config['data'])  # Potentially mismatched if sampled
# modalities, types = ppc.fit_transform(modalities, types, total_statistics=total_statistics)

# if not os.path.exists(RUN_FOLDER): os.makedirs(RUN_FOLDER)
# for i in range(len(modalities)):
#     modality = modalities[i]

#     # Regular matrices (ManiNetCluster, JAMIE)
#     np.savetxt(os.path.join(RUN_FOLDER, f'X{i+1}.txt'), modality, delimiter='\t')

#     # Similarity matrices (MMD-MA)
#     modality_z = (modality - modality.mean(axis=0, keepdims=True)) / modality.std(axis=0, keepdims=True)
#     similarity = np.matmul(modality_z, modality_z.T)
#     np.savetxt(os.path.join(RUN_FOLDER, f'X{i+1}_sim.tsv'), similarity, delimiter='\t')

#     # Anndata matrices (scVI)
#     # adata = sc.AnnData(modalities[0])
#     # adata.var_names = features[0] if isinstance(features[0][0], str) else [f'Feature_{fi}' for fi in features[0]]
#     # adata.obs_names = [f'Cell_{j}' for j in range(len(adata.obs_names))]
#     # adata.obs['cell_type'] = types[0][:, 0]
#     # adata.obs['time'] = types[0][:, -1]
#     # # adata.obs['batch'] = 0
#     # adata.write(os.path.join(RUN_FOLDER, f'X{i+1}.h5ad'), compression='gzip')

#     # HDF5
#     # https://github.com/scverse/anndata/issues/595#issuecomment-1824376236
#     concatenated_modalities = np.concatenate(modalities, axis=-1)
#     barcodes = [f'Cell {i}' for i in range(concatenated_modalities.shape[0])]
#     feature_types = modalities[0].shape[1] * ['Gene Expression'] + modalities[1].shape[1] * ['Peaks']
#     feature_names = np.concatenate(features)
#     feature_ids = np.array(np.arange(feature_names.shape[0]), dtype='str')
#     genome = concatenated_modalities.shape[1] * ['Something']
#     sparse_data = scipy.sparse.csr_matrix(concatenated_modalities)

#     def int_max(x):
#         return int(max(np.floor(len(str(int(max(x)))) / 4), 1) * 4)
#     def str_max(x):
#         return max([len(i) for i in x])

#     with h5py.File(os.path.join(RUN_FOLDER, f'X.h5'), 'w') as f:
#         grp = f.create_group('matrix')
#         grp.create_dataset('barcodes', data=np.array(barcodes, dtype=f'|S{str_max(barcodes)}'))
#         grp.create_dataset('data', data=np.array(sparse_data.data, dtype=f'<i{int_max(sparse_data.data)}'))
#         ftrs = grp.create_group('features')
#         # # this group will lack the following keys:
#         # # '_all_tag_keys', 'feature_type', 'genome', 'id', 'name', 'pattern', 'read', 'sequence'
#         ftrs.create_dataset('feature_type', data=np.array(feature_types, dtype=f'|S{str_max(feature_types)}'))
#         ftrs.create_dataset('genome', data=np.array(genome, dtype=f'|S{str_max(genome)}'))
#         ftrs.create_dataset('id', data=np.array(feature_ids, dtype=f'|S{str_max(feature_ids)}'))
#         ftrs.create_dataset('name', data=np.array(feature_names, dtype=f'|S{str_max([str(fn) for fn in feature_names])}'))
#         grp.create_dataset('indices', data=np.array(sparse_data.indices, dtype=f'<i{int_max(sparse_data.indices)}'))
#         grp.create_dataset('indptr', data=np.array(sparse_data.indptr, dtype=f'<i{int_max(sparse_data.indptr)}'))
#         grp.create_dataset('shape', data=np.array(sparse_data.shape[::-1], dtype=f'<i{int_max(sparse_data.shape)}'))

# # Preview h5 files
# # print('Generated File')
# # with h5py.File(os.path.join(RUN_FOLDER, f'X.h5'), 'r') as f: celltrip.utilities.h5_tree(f)
# # print('\nBaseline File')
# # with h5py.File('/home/thema/Downloads/DM_rep4.h5', 'r') as f: celltrip.utilities.h5_tree(f)

# Integration Methods

In [7]:
# if imputation_target is None:
#     # LMA
#     # https://github.com/namtk/ManiNetCluster/tree/master/inst/python
#     method_name = 'LMA'
#     print(f'Running {method_name}')
#     new_wd = os.path.join(RUN_FOLDER, method_name)
#     if not os.path.exists(new_wd): os.makedirs(new_wd)
#     os.chdir(new_wd)

#     !conda run -n maninetcluster \
#     python {os.path.join(BASE_FOLDER, 'maninetcluster_helper.py')} \
#     {os.path.join(RUN_FOLDER, 'X1.txt')} \
#     {os.path.join(RUN_FOLDER, 'X2.txt')} \
#     --align lma \
#     -p {dimensions}

#     os.chdir(BASE_FOLDER)


#     # CCA
#     # https://github.com/namtk/ManiNetCluster/tree/master/inst/python
#     method_name = 'CCA'
#     print(f'Running {method_name}')
#     new_wd = os.path.join(RUN_FOLDER, method_name)
#     if not os.path.exists(new_wd): os.makedirs(new_wd)
#     os.chdir(new_wd)

#     !conda run -n maninetcluster \
#     python {os.path.join(BASE_FOLDER, 'maninetcluster_helper.py')} \
#     {os.path.join(RUN_FOLDER, 'X1.txt')} \
#     {os.path.join(RUN_FOLDER, 'X2.txt')} \
#     --align cca \
#     -p {dimensions}

#     os.chdir(BASE_FOLDER)


#     # NLMA
#     # https://github.com/namtk/ManiNetCluster/tree/master/inst/python
#     method_name = 'NLMA'
#     print(f'Running {method_name}')
#     new_wd = os.path.join(RUN_FOLDER, method_name)
#     if not os.path.exists(new_wd): os.makedirs(new_wd)
#     os.chdir(new_wd)

#     !conda run -n maninetcluster \
#     python {os.path.join(BASE_FOLDER, 'maninetcluster_helper.py')} \
#     {os.path.join(RUN_FOLDER, 'X1.txt')} \
#     {os.path.join(RUN_FOLDER, 'X2.txt')} \
#     --align nlma \
#     -p {dimensions}

#     os.chdir(BASE_FOLDER)


#     # JAMIE
#     method_name = 'JAMIE'
#     print(f'Running {method_name}')
#     new_wd = os.path.join(RUN_FOLDER, method_name)
#     if not os.path.exists(new_wd): os.makedirs(new_wd)
#     os.chdir(new_wd)

#     !conda run -n jamie \
#     python {os.path.join(BASE_FOLDER, 'jamie_helper.py')} \
#     {os.path.join(RUN_FOLDER, 'X1.txt')} \
#     {os.path.join(RUN_FOLDER, 'X2.txt')} \
#     -p {dimensions} \
#     -s {notebook_seed} \
#     --suffix {notebook_seed}

#     os.chdir(BASE_FOLDER)


#     # MMD-MA
#     # https://bitbucket.org/noblelab/2019_mmd_wabi/src/master/manifoldAlignDistortionPen_mmd_multipleStarts.py
#     method_name = 'MMD-MA'
#     print(f'Running {method_name}')
#     new_wd = os.path.join(RUN_FOLDER, method_name)
#     if not os.path.exists(new_wd): os.makedirs(new_wd)
#     os.chdir(new_wd)

#     fname1, fname2 = f'alpha_hat_{notebook_seed}_10000.txt', f'beta_hat_{notebook_seed}_10000.txt'
#     !conda run -n mmdma \
#     python {os.path.join(BASE_FOLDER, '2019_mmd_wabi/manifoldAlignDistortionPen_mmd_multipleStarts.py')} \
#     {os.path.join(RUN_FOLDER, 'X1_sim.tsv')} \
#     {os.path.join(RUN_FOLDER, 'X2_sim.tsv')} \
#     --seed {notebook_seed} \
#     --p {dimensions}
#     !python {os.path.join(BASE_FOLDER, 'mmd_helper.py')} \
#     {fname1} \
#     {fname2} \
#     --suffix {notebook_seed}

#     os.chdir(BASE_FOLDER)

# Imputation Methods

In [8]:
def random_imputation(X, Y, seed, **kwargs):
    """Write a random baseline with the shape of ``Y`` to ``I_{seed}.txt``.

    PyTorch's global generator is seeded with ``seed`` and standard-normal
    values are drawn with the same shape as ``Y``. The result is saved with
    ``np.savetxt`` under a relative file name, i.e. in the current working
    directory.

    Args:
        X: Unused.
        Y: Object whose ``.shape`` determines the shape of the drawn sample.
        seed: Seed for ``torch.manual_seed``; also part of the file name.
        **kwargs: Accepted and ignored (e.g. ``train_mask``).
    """
    torch.manual_seed(seed)
    standard_normal = torch.distributions.Normal(0, 1)
    noise = standard_normal.sample(Y.shape)
    np.savetxt(f'I_{seed}.txt', noise)


In [9]:
def knn_imputation(X, Y, seed, *, train_mask, n_neighbors=10):
    """K-nearest-neighbours regression baseline written to ``I_{seed}.txt``.

    A ``KNeighborsRegressor`` is fit on the rows of ``X`` and ``Y`` selected
    by ``train_mask`` and then predicts for every row of ``X``. Predictions
    are saved with ``np.savetxt`` in the current working directory.

    Args:
        X: Input features, indexable by ``train_mask``.
        Y: Regression targets, indexable by ``train_mask``.
        seed: Only used in the output file name.
        train_mask: Index/mask selecting the training rows.
        n_neighbors: Forwarded to ``KNeighborsRegressor``.
    """
    from sklearn.neighbors import KNeighborsRegressor

    # Fit on the training rows only, then predict for all rows
    X_train, Y_train = X[train_mask], Y[train_mask]
    regressor = KNeighborsRegressor(n_neighbors=n_neighbors)
    regressor.fit(X_train, Y_train)
    np.savetxt(f'I_{seed}.txt', regressor.predict(X))
    

In [10]:
def mlp_imputation(X, Y, seed, *, train_mask):
    """Single-hidden-layer MLP regression baseline written to ``I_{seed}.txt``.

    NumPy's global RNG is seeded with ``seed``, an ``MLPRegressor`` with one
    hidden layer of 128 units and ``max_iter=1_000`` is fit on the rows
    selected by ``train_mask``, and predictions for every row of ``X`` are
    saved with ``np.savetxt`` in the current working directory.

    Args:
        X: Input features, indexable by ``train_mask``.
        Y: Regression targets, indexable by ``train_mask``.
        seed: Seed for ``np.random.seed``; also part of the file name.
        train_mask: Index/mask selecting the training rows.
    """
    from sklearn.neural_network import MLPRegressor

    # No random_state is passed to the model, so seeding happens through NumPy's global RNG
    np.random.seed(seed)

    # Fit on the training rows only, then predict for all rows
    regressor = MLPRegressor(hidden_layer_sizes=(128,), max_iter=1_000)
    regressor.fit(X[train_mask], Y[train_mask])
    np.savetxt(f'I_{seed}.txt', regressor.predict(X))
    

In [11]:
def jamie_imputation(X, Y, seed, *, train_mask):
    """Run the external JAMIE helper script on ``X``/``Y`` for one seed.

    ``X``, ``Y`` and ``train_mask`` are written as text files under the
    module-level ``RUN_FOLDER`` (only when the files do not exist yet), then
    ``jamie_helper.py`` from ``BASE_FOLDER`` is executed through
    ``conda run -n jamie`` with the module-level ``dim`` and the given seed.

    The helper's console output is captured rather than displayed. If the
    command exits with a non-zero status, a ``RuntimeWarning`` containing the
    tail of that output is issued so the failure does not pass unnoticed.

    Args:
        X: First modality, saved to ``X.txt``.
        Y: Second modality, saved to ``Y.txt``.
        seed: Passed to the helper as ``-s`` and ``--suffix``.
        train_mask: Saved to ``train_mask.txt`` and passed to the helper as ``-m``.
    """
    # https://github.com/Oafish1/JAMIE
    import subprocess
    import warnings

    # Generate files
    # NOTE(review): existing files are reused as-is, so inputs left over from an
    # earlier run in the same RUN_FOLDER are not refreshed -- confirm this is intended
    fname_X = os.path.join(RUN_FOLDER, 'X.txt')
    fname_Y = os.path.join(RUN_FOLDER, 'Y.txt')
    fname_train_mask = os.path.join(RUN_FOLDER, 'train_mask.txt')
    for fname, array in ((fname_X, X), (fname_Y, Y), (fname_train_mask, train_mask)):
        if not os.path.exists(fname): np.savetxt(fname, array)

    # Run (output is captured instead of discarded so that failures can be reported)
    completed = subprocess.run(
        f'conda run -n jamie '
        f'python "{os.path.join(BASE_FOLDER, "jamie_helper.py")}" '
        f'"{fname_X}" "{fname_Y}" -m "{fname_train_mask}" '
        f'-t 2 -p {dim} -s {seed} --suffix "{seed}"',
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)

    # Surface failures; a warning (not an exception) keeps the remaining runs going
    if completed.returncode != 0:
        output_tail = '\n'.join(completed.stdout.decode(errors='replace').splitlines()[-20:])
        warnings.warn(
            f'JAMIE helper exited with status {completed.returncode} for seed {seed}:\n{output_tail}',
            RuntimeWarning)


In [12]:
    # BABEL
    # INCOMPATIBLE
    # https://github.com/wukevin/babel
    # method_name = 'BABEL'
    # print(f'Running {method_name}')
    # new_wd = os.path.join(RUN_FOLDER, method_name)
    # if not os.path.exists(new_wd): os.makedirs(new_wd)
    # os.chdir(new_wd)

    # !conda run -n babel \
    # python {os.path.join(BASE_FOLDER, 'babel/bin/train_model.py')} \
    # --data {os.path.join(RUN_FOLDER, 'X.h5')} \
    # --outdir {new_wd}
    # !conda run -n babel \
    # python {os.path.join(BASE_FOLDER, 'babel/bin/predict_model.py')} \
    # --checkpoint {os.path.join(new_wd, net_asdf)} \
    # --data {os.path.join(RUN_FOLDER, 'X.h5')} \
    # --outdir {new_wd}

    # os.chdir(BASE_FOLDER)

In [13]:
    # scVI
    # Not done
    # method_name = 'scVI'
    # print(f'Running {method_name}')
    # new_wd = os.path.join(RUN_FOLDER, method_name)
    # if not os.path.exists(new_wd): os.makedirs(new_wd)
    # os.chdir(new_wd)

    # X_fname = os.path.join(RUN_FOLDER, 'X1.txt')
    # Y_fname = os.path.join(RUN_FOLDER, 'X2.txt')

    # import numpy as np
    # import scvi

    # scvi.settings.seed = 42

    # X = np.loadtxt(X_fname)
    # Y = np.loadtxt(Y_fname)

    # scvi.model.SCVI.setup_anndata

    # os.chdir(BASE_FOLDER)

In [14]:
# Seeds and methods to benchmark; every method is run once per seed
seeds = [42, 128, 256, 512, 1024]
imputation_methods = {
    'Random': random_imputation,
    'KNN': knn_imputation,
    'MLP': mlp_imputation,
    # 'JAMIE': jamie_imputation,
}

# Load data
# NOTE: Currently incompatible with multi-multi
X, Y = env.get_input_modalities()[0].cpu(), env.get_target_modalities()[0].cpu()
# Look up the stored mask for the keys currently held by the environment
train_mask = pd.DataFrame(full_train_mask, index=env.dataloader.adatas[0].obs.index).loc[env.keys].to_numpy().flatten()

for method, seed in itertools.product(imputation_methods, seeds):
    # CLI
    print(f'Running {method}')

    # Create folder and change wd (methods write their output relative to the wd)
    new_wd = os.path.join(RUN_FOLDER, method)
    os.makedirs(new_wd, exist_ok=True)
    os.chdir(new_wd)

    # Run function, reverting wd even if the method raises so that later
    # cells are not left running inside the method folder
    try:
        imputation_methods[method](X, Y, seed, train_mask=train_mask)
    finally:
        os.chdir(BASE_FOLDER)

Running Random
Running Random
Running Random
Running Random
Running Random
Running KNN
Running KNN
Running KNN
Running KNN
Running KNN
Running MLP
Running MLP
Running MLP
Running MLP
Running MLP


# Perturbation Methods

In [None]:
# # Variance
# # https://shap.readthedocs.io/en/latest/
# method_name = 'Variance'
# print(f'Running {method_name}')
# new_wd = os.path.join(RUN_FOLDER, method_name)
# if not os.path.exists(new_wd): os.makedirs(new_wd)
# os.chdir(new_wd)

#  # Load data
# X1, X2 = np.loadtxt(os.path.join(RUN_FOLDER, 'X1.txt')), np.loadtxt(os.path.join(RUN_FOLDER, 'X2.txt'))
# dataset = [X1, X2]
# dataset = ppc.inverse_transform(dataset)

# # Get variance
# importance = [np.var(X, axis=0) for X in dataset]
# importance = [imp / imp.sum() for imp in importance]

# # Write to file
# for i, imp in enumerate(importance):
#     if imp is not None: np.savetxt(f'F{i+1}.txt', imp)

# os.chdir(BASE_FOLDER)

Running Variance


NameError: name 'ppc' is not defined

# Trajectory Methods