In [1]:
import h5py
import numpy as np
import scanpy as sc
import scipy.sparse
import torch

import inspect
import os
import sys

sys.path.insert(0, os.path.abspath('../examples'))
import data
import celltrip

# Run on the first CUDA device when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'

# Paths are resolved against the working directory at the time this cell runs
BASE_FOLDER = os.path.abspath('')
DATA_FOLDER = os.path.join(BASE_FOLDER, '../data')

- TODO
  - Run models multiple times with different seeds to get plotted variance

## Data Setup

In [2]:
# Parameters
# Dataset identifier: passed to data.load_data and used as the run sub-folder name
dataset_name = 'MMD-MA'
# 0-based index (0 or 1) of the modality to impute; the imputation cells index the
# [X1, X2] list with it and pass imputation_target+1 to JAMIE's -t flag.
# None cannot be used by the imputation cells, so set it before running them.
imputation_target = None
# Forwarded to the integration helpers as -p / --p
dimensions = 3
# Forwarded to the MMD-MA script as --seed
seed = 42

# Derivatives
# Per-dataset output folder; every method below writes into a sub-folder of it
RUN_FOLDER = os.path.join(BASE_FOLDER, 'runs', dataset_name)

# Load data and save to file
modalities, types, features = data.load_data(dataset_name, DATA_FOLDER)

# exist_ok replaces the exists()-then-makedirs() check and keeps the cell re-runnable
os.makedirs(RUN_FOLDER, exist_ok=True)
for i, modality in enumerate(modalities):
    # Regular matrices (ManiNetCluster, JAMIE)
    np.savetxt(os.path.join(RUN_FOLDER, f'X{i+1}.txt'), modality, delimiter='\t')

    # Similarity matrices (MMD-MA)
    # Z-score every feature (column) before taking the linear kernel. The parentheses
    # matter: without them only the mean is divided by the std and the data is never scaled.
    feature_mean = modality.mean(axis=0, keepdims=True)
    feature_std = modality.std(axis=0, keepdims=True)
    # Constant features have zero std; dividing by 1 leaves them at 0 instead of NaN
    feature_std = np.where(feature_std == 0, 1, feature_std)
    modality_z = (modality - feature_mean) / feature_std
    similarity = np.matmul(modality_z, modality_z.T)
    np.savetxt(os.path.join(RUN_FOLDER, f'X{i+1}_sim.tsv'), similarity, delimiter='\t')

    # Anndata matrices (scVI)
    # adata = sc.AnnData(modalities[0])
    # adata.var_names = features[0] if isinstance(features[0][0], str) else [f'Feature_{fi}' for fi in features[0]]
    # adata.obs_names = [f'Cell_{j}' for j in range(len(adata.obs_names))]
    # adata.obs['cell_type'] = types[0][:, 0]
    # adata.obs['time'] = types[0][:, -1]
    # # adata.obs['batch'] = 0
    # adata.write(os.path.join(RUN_FOLDER, f'X{i+1}.h5ad'), compression='gzip')

# HDF5
# https://github.com/scverse/anndata/issues/595#issuecomment-1824376236
# Single file holding all modalities concatenated along the feature axis (read by the
# BABEL cell as X.h5). It does not depend on the loop variable, so it is written once
# here rather than once per modality.
concatenated_modalities = np.concatenate(modalities, axis=-1)
barcodes = [f'Cell {j}' for j in range(concatenated_modalities.shape[0])]
# Assumes exactly two modalities, the first labelled as expression and the second as peaks
feature_types = modalities[0].shape[1] * ['Gene Expression'] + modalities[1].shape[1] * ['Peaks']
feature_names = np.concatenate(features)
feature_ids = np.array(np.arange(feature_names.shape[0]), dtype='str')
genome = concatenated_modalities.shape[1] * ['Something']
sparse_data = scipy.sparse.csr_matrix(concatenated_modalities)

def int_max(x):
    """Return the byte width (4 or 8) of the integer dtype used to store `x`.

    The width is derived from the number of characters in the largest value of `x`.
    It is capped at 8 because larger multiples of 4 (12, 16, ...) are not valid
    NumPy integer widths.
    """
    digits = len(str(int(max(x))))
    return int(min(max(digits // 4, 1) * 4, 8))

def str_max(x):
    """Return the length of the longest string in `x` (used to size `|S` dtypes)."""
    return max(len(s) for s in x)

with h5py.File(os.path.join(RUN_FOLDER, 'X.h5'), 'w') as f:
    grp = f.create_group('matrix')
    grp.create_dataset('barcodes', data=np.array(barcodes, dtype=f'|S{str_max(barcodes)}'))
    # NOTE(review): values are cast to an integer dtype, so non-integer entries are truncated
    grp.create_dataset('data', data=np.array(sparse_data.data, dtype=f'<i{int_max(sparse_data.data)}'))
    ftrs = grp.create_group('features')
    # # this group will lack the following keys:
    # # '_all_tag_keys', 'feature_type', 'genome', 'id', 'name', 'pattern', 'read', 'sequence'
    ftrs.create_dataset('feature_type', data=np.array(feature_types, dtype=f'|S{str_max(feature_types)}'))
    ftrs.create_dataset('genome', data=np.array(genome, dtype=f'|S{str_max(genome)}'))
    ftrs.create_dataset('id', data=np.array(feature_ids, dtype=f'|S{str_max(feature_ids)}'))
    ftrs.create_dataset('name', data=np.array(feature_names, dtype=f'|S{str_max([str(fn) for fn in feature_names])}'))
    grp.create_dataset('indices', data=np.array(sparse_data.indices, dtype=f'<i{int_max(sparse_data.indices)}'))
    grp.create_dataset('indptr', data=np.array(sparse_data.indptr, dtype=f'<i{int_max(sparse_data.indptr)}'))
    # Shape is stored reversed relative to the (cells, features) CSR matrix built above
    grp.create_dataset('shape', data=np.array(sparse_data.shape[::-1], dtype=f'<i{int_max(sparse_data.shape)}'))

# Preview h5 files
# print('Generated File')
# with h5py.File(os.path.join(RUN_FOLDER, f'X.h5'), 'r') as f: celltrip.utilities.h5_tree(f)
# print('\nBaseline File')
# with h5py.File('/home/thema/Downloads/DM_rep4.h5', 'r') as f: celltrip.utilities.h5_tree(f)

## Integration Methods

In [5]:
# LMA
# https://github.com/namtk/ManiNetCluster/tree/master/inst/python
new_wd = os.path.join(RUN_FOLDER, 'LMA')
if not os.path.exists(new_wd): os.makedirs(new_wd)
os.chdir(new_wd)

!conda run -n maninetcluster \
 python {os.path.join(BASE_FOLDER, 'maninetcluster_helper.py')} \
 {os.path.join(RUN_FOLDER, 'X1.txt')} \
 {os.path.join(RUN_FOLDER, 'X2.txt')} \
 --align lma \
 -p {dimensions}

os.chdir(BASE_FOLDER)

In [6]:
# CCA
# https://github.com/namtk/ManiNetCluster/tree/master/inst/python
new_wd = os.path.join(RUN_FOLDER, 'CCA')
if not os.path.exists(new_wd): os.makedirs(new_wd)
os.chdir(new_wd)

!conda run -n maninetcluster \
 python {os.path.join(BASE_FOLDER, 'maninetcluster_helper.py')} \
 {os.path.join(RUN_FOLDER, 'X1.txt')} \
 {os.path.join(RUN_FOLDER, 'X2.txt')} \
 --align cca \
 -p {dimensions}

os.chdir(BASE_FOLDER)

In [7]:
# NLMA
# https://github.com/namtk/ManiNetCluster/tree/master/inst/python
new_wd = os.path.join(RUN_FOLDER, 'NLMA')
if not os.path.exists(new_wd): os.makedirs(new_wd)
os.chdir(new_wd)

!conda run -n maninetcluster \
 python {os.path.join(BASE_FOLDER, 'maninetcluster_helper.py')} \
 {os.path.join(RUN_FOLDER, 'X1.txt')} \
 {os.path.join(RUN_FOLDER, 'X2.txt')} \
 --align nlma \
 -p {dimensions}

os.chdir(BASE_FOLDER)

In [None]:
# JAMIE
new_wd = os.path.join(RUN_FOLDER, 'JAMIE')
if not os.path.exists(new_wd): os.makedirs(new_wd)
os.chdir(new_wd)

!conda run -n jamie \
 python {os.path.join(BASE_FOLDER, 'jamie_helper.py')} \
 {os.path.join(RUN_FOLDER, 'X1.txt')} \
 {os.path.join(RUN_FOLDER, 'X2.txt')} \
 -p {dimensions}

os.chdir(BASE_FOLDER)

In [None]:
# MMD-MA
# https://bitbucket.org/noblelab/2019_mmd_wabi/src/master/manifoldAlignDistortionPen_mmd_multipleStarts.py
new_wd = os.path.join(RUN_FOLDER, 'MMD-MA')
if not os.path.exists(new_wd): os.makedirs(new_wd)
os.chdir(new_wd)

!conda run -n mmdma \
 python {os.path.join(BASE_FOLDER, '2019_mmd_wabi/manifoldAlignDistortionPen_mmd_multipleStarts.py')} \
 {os.path.join(RUN_FOLDER, 'X1_sim.tsv')} \
 {os.path.join(RUN_FOLDER, 'X2_sim.tsv')} \
 --seed {seed} \
 --p {dimensions}
!python {os.path.join(BASE_FOLDER, 'mmd_helper.py')}

os.chdir(BASE_FOLDER)

## Imputation Methods

- TODO: Add training and validation sets (the KNN and MLP baselines below currently predict on the same cells they were fit on)

In [3]:
# KNN
new_wd = os.path.join(RUN_FOLDER, 'KNN')
# exist_ok replaces the exists()-then-makedirs() check and keeps the cell re-runnable
os.makedirs(new_wd, exist_ok=True)

import sklearn.neighbors

# imputation_target is the 0-based index of the modality to predict. Fail early, before
# changing directory, with a clear message instead of an error from the indexing below.
if imputation_target not in (0, 1):
    raise ValueError(f'imputation_target must be 0 or 1, got {imputation_target!r}')

os.chdir(new_wd)
try:
    # Load data
    X1, X2 = np.loadtxt('../X1.txt'), np.loadtxt('../X2.txt')
    dataset = [X1, X2]
    # Predict the target modality from the *other* one. Indexing with -imputation_target
    # picked the target itself for both 0 and 1 (dataset[-0] is dataset[0] and
    # dataset[-1] is dataset[1]), so the model was fit to map a modality onto itself.
    X, Y = dataset[1 - imputation_target], dataset[imputation_target]

    # Fit model
    knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=10)
    knn.fit(X, Y)
    projection = [None for _ in range(2)]
    projection[imputation_target] = knn.predict(X)

    # Write to file
    for i, proj in enumerate(projection):
        if proj is not None: np.savetxt(f'I{i+1}.txt', proj)
finally:
    # Always return to the notebook folder, even when loading or fitting fails
    os.chdir(BASE_FOLDER)

In [4]:
# MLP
new_wd = os.path.join(RUN_FOLDER, 'MLP')
# exist_ok replaces the exists()-then-makedirs() check and keeps the cell re-runnable
os.makedirs(new_wd, exist_ok=True)

import sklearn.neighbors
import sklearn.neural_network

# imputation_target is the 0-based index of the modality to predict. Fail early, before
# changing directory, with a clear message instead of an error from the indexing below.
if imputation_target not in (0, 1):
    raise ValueError(f'imputation_target must be 0 or 1, got {imputation_target!r}')

os.chdir(new_wd)
try:
    # Load data
    X1, X2 = np.loadtxt('../X1.txt'), np.loadtxt('../X2.txt')
    dataset = [X1, X2]
    # Predict the target modality from the *other* one. Indexing with -imputation_target
    # picked the target itself for both 0 and 1 (dataset[-0] is dataset[0] and
    # dataset[-1] is dataset[1]), so the model was fit to map a modality onto itself.
    X, Y = dataset[1 - imputation_target], dataset[imputation_target]

    # Fit model
    # random_state ties weight initialisation and shuffling to the notebook seed so reruns match
    mlp = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=(128,), max_iter=1_000, random_state=seed)
    mlp.fit(X, Y)
    projection = [None for _ in range(2)]
    projection[imputation_target] = mlp.predict(X)

    # Write to file
    for i, proj in enumerate(projection):
        if proj is not None: np.savetxt(f'I{i+1}.txt', proj)
finally:
    # Always return to the notebook folder, even when loading or fitting fails
    os.chdir(BASE_FOLDER)

In [None]:
# JAMIE
# https://github.com/Oafish1/JAMIE
new_wd = os.path.join(RUN_FOLDER, 'JAMIE')
if not os.path.exists(new_wd): os.makedirs(new_wd)
os.chdir(new_wd)

!conda run -n jamie \
 python {os.path.join(BASE_FOLDER, 'jamie_helper.py')} \
 {os.path.join(RUN_FOLDER, 'X1.txt')} \
 {os.path.join(RUN_FOLDER, 'X2.txt')} \
 -t {imputation_target+1} \
 -p {dimensions}

os.chdir(BASE_FOLDER)

In [6]:
# BABEL
# https://github.com/wukevin/babel
# NOTE(review): this cell is disabled via `if False`. `net_asdf` below is not defined
# anywhere in the visible notebook and looks like a placeholder for the trained
# checkpoint name; resolve it before enabling the cell.
if False:
    new_wd = os.path.join(RUN_FOLDER, 'babel')
    if not os.path.exists(new_wd): os.makedirs(new_wd)
    os.chdir(new_wd)

    # Train on the combined HDF5 file written by the data setup cell
    !conda run -n babel \
    python {os.path.join(BASE_FOLDER, 'babel/bin/train_model.py')} \
    --data {os.path.join(RUN_FOLDER, 'X.h5')} \
    --outdir {new_wd}
    # Predict with a checkpoint expected inside the same output folder
    !conda run -n babel \
    python {os.path.join(BASE_FOLDER, 'babel/bin/predict_model.py')} \
    --checkpoint {os.path.join(new_wd, net_asdf)} \
    --data {os.path.join(RUN_FOLDER, 'X.h5')} \
    --outdir {new_wd}

    os.chdir(BASE_FOLDER)

In [7]:
# scVI
# NOTE(review): unfinished stub, disabled via `if False`. It only loads the two
# modality matrices; no model is set up, trained or written to disk yet.
if False:  # Not done
    new_wd = os.path.join(RUN_FOLDER, 'scVI')
    if not os.path.exists(new_wd): os.makedirs(new_wd)
    os.chdir(new_wd)


    X_fname = os.path.join(RUN_FOLDER, 'X1.txt')
    Y_fname = os.path.join(RUN_FOLDER, 'X2.txt')

    import numpy as np
    import scvi

    # Hard-coded here rather than taken from the notebook-level `seed` parameter
    scvi.settings.seed = 42

    X = np.loadtxt(X_fname)
    Y = np.loadtxt(Y_fname)

    # Bare attribute reference: setup_anndata is not called, so this line has no effect
    scvi.model.SCVI.setup_anndata


    os.chdir(BASE_FOLDER)

## Perturbation Methods

In [3]:
# Variance
# https://shap.readthedocs.io/en/latest/
new_wd = os.path.join(RUN_FOLDER, 'variance')
# exist_ok replaces the exists()-then-makedirs() check and keeps the cell re-runnable
os.makedirs(new_wd, exist_ok=True)
os.chdir(new_wd)
try:
    # Load data
    X1, X2 = np.loadtxt(os.path.join(RUN_FOLDER, 'X1.txt')), np.loadtxt(os.path.join(RUN_FOLDER, 'X2.txt'))
    dataset = [X1, X2]

    # Get variance
    # Feature importance is the per-feature variance, normalised to sum to 1 within each modality
    importance = [np.var(X, axis=0) for X in dataset]
    # Normalise the variances themselves; iterating over `dataset` here rescaled the raw
    # data matrices and discarded the variances computed on the previous line
    importance = [imp / imp.sum() for imp in importance]

    # Write to file
    for i, imp in enumerate(importance):
        np.savetxt(f'F{i+1}.txt', imp)
finally:
    # Always return to the notebook folder, even when loading or writing fails
    os.chdir(BASE_FOLDER)

## Trajectory Methods

- TODO: Implement trajectory methods