In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import plotly as py
import pandas as pd
from chart_studio.plotly import plot, iplot

# from plotly.offline import init_notebook_mode, iplot
from tqdm import tqdm_notebook

from scvi.dataset import PbmcDataset, GeneExpressionDataset
from scvi.models import VAE, IAVAE
from scvi.inference import UnsupervisedTrainer
from scvi.utils import demultiply, make_dir_if_necessary, predict_de_genes, save_fig
from scvi_utils import estimate_de_proba, estimate_lfc_density, estimate_lfc_mean
from R_interop import all_predictions


# --- Reproducibility -------------------------------------------------------
np.random.seed(42)

# --- Training / resampling budget ------------------------------------------
N_EPOCHS = 10      # epochs passed to every training call
N_TRAININGS = 1    # number of trainings requested from estimate_de_proba
N_PICKS = 2        # number of picks requested from every estimator

# --- Differential-expression thresholds ------------------------------------
DELTA = 0.5        # not referenced by the cells visible in this notebook
Q0 = 5e-2          # used as desired FDR / p-value cut-off further down

# --- Sample sizes ----------------------------------------------------------
# SIZES = [5, 10, 20, 30, 50, 100]
SIZE = 100
SIZES = [SIZE]
N_SIZES = len(SIZES)

# --- Output handling -------------------------------------------------------
DO_CLOUD = True
PATH_TO_SCRIPTS = "/home/ubuntu/conquer_comparison/scripts"
DIR_PATH = 'lfc_estimates/pbmc'
make_dir_if_necessary(DIR_PATH)

# Import dataset and train scVI-based models

In [None]:
# Re-point DIR_PATH at the output directory of this null experiment; the
# data/labels file paths built later in the notebook are derived from it.
DIR_PATH = "lfc_estimates/null"
make_dir_if_necessary(DIR_PATH)

## PBMC Dataset

In [None]:
# Load the PBMC dataset and subsample its genes (argument: 2000).
all_dataset = PbmcDataset()
all_dataset.subsample_genes(2000)

# Number of cells carrying each original label value.
unique_elements, counts_elements = np.unique(
    all_dataset.labels.squeeze(), return_counts=True
)

# NOTE(review): this assumes `all_dataset.cell_types` is ordered like the
# sorted label values returned by np.unique -- confirm.
df = pd.DataFrame(dict(counts=counts_elements, cell_types=all_dataset.cell_types))
# Jupyter only renders the *last* expression of a cell, so the figure has to
# be shown explicitly here; as a bare expression it was silently discarded.
px.scatter(df, y="counts", x="cell_types").show()

# Build a "null" labelling: the cells of original cluster 2 are split at
# random into two groups (0 and 1), so no gene is truly differentially
# expressed between those two groups.
mask = all_dataset.labels.squeeze() == 2
# By default all cells are labelled 2
fake_labels = 2.0 * np.ones(len(all_dataset))
# Except cluster 2 that is either 0 or 1
fake_labels[mask] = np.random.random(mask.sum()) >= 0.5

dataset = GeneExpressionDataset()
dataset.populate_from_data(
    X=all_dataset.X,
    labels=fake_labels,
    batch_indices=all_dataset.batch_indices,
)

# Ground truth for the null experiment: no gene is flagged as DE.
n_genes = dataset.nb_genes
is_significant_de = np.zeros(n_genes, dtype=bool)

In [None]:
# Sanity check: print the distinct label values present in `dataset`.
print(np.unique(dataset.labels.squeeze()))

## Save data

In [None]:
labels = dataset.labels.squeeze()
n_examples = len(dataset)

# Only cells labelled 0 or 1 are eligible for the test set; at most 1001 of
# them are kept, in random order.
is_group_0_or_1 = (labels == 0) | (labels == 1)
interesting_indices = np.where(is_group_0_or_1)[0]
TEST_INDICES = np.random.permutation(interesting_indices)[:1001]

x_test = dataset.X[TEST_INDICES, :]
y_test = dataset.labels[TEST_INDICES, :].squeeze()

# Files handed over to the competitor methods.
data_path = os.path.join(DIR_PATH, 'data.npy')
labels_path = os.path.join(DIR_PATH, 'labels.npy')

# Counts are densified and stored as integers in NumPy binary format ...
np.save(data_path, np.array(x_test.todense()).squeeze().astype(int))
# ... whereas the labels are written as plain text (np.savetxt).
np.savetxt(labels_path, y_test.squeeze())

## Train parameters

In [None]:
# Keyword arguments are grouped per model variant. Later cells use the "mf"
# entries with VAE and the "iaf" entries with IAVAE; the "*_k5" entries are
# defined here but not used by the cells visible in this notebook.

# Model-constructor arguments.
mdl_params = {
    "iaf": {"n_hidden": 128, "n_layers": 1, "do_h": True, "n_latent": 10, "t": 4},
    "mf": {"n_hidden": 128, "n_layers": 1, "n_latent": 10},
    "iaf_k5": {"n_hidden": 128, "n_layers": 1, "do_h": True, "n_latent": 10, "t": 4},
    "mf_k5": {"n_hidden": 128, "n_layers": 1, "n_latent": 10},
}

# Trainer arguments; every variant is given the same held-out TEST_INDICES.
train_params = {
    "iaf": {"ratio_loss": True, "test_indices": TEST_INDICES},
    "mf": {"ratio_loss": True, "test_indices": TEST_INDICES},
    "iaf_k5": {
        "ratio_loss": True,
        "test_indices": TEST_INDICES,
        "k_importance_weighted": 5,
    },
    "mf_k5": {
        "ratio_loss": True,
        "test_indices": TEST_INDICES,
        "k_importance_weighted": 5,
    },
}

# Training-call arguments: identical for all variants, one fresh dict each.
train_fn_params = {
    variant: {"n_epochs": N_EPOCHS, "lr": 1e-3}
    for variant in ("iaf", "mf", "iaf_k5", "mf_k5")
}

# Competitors

In [None]:
# Predictions of the competitor methods, computed from the files written at
# data_path / labels_path. Later cells index the result by method name
# ("deseq2", "edger", "mast") and then by "lfc" or "pval".
other_predictions = all_predictions(
    data_path=data_path,
    labels_path=labels_path,
    path_to_scripts=PATH_TO_SCRIPTS,
    n_genes=n_genes,
    n_picks=N_PICKS,
    sizes=SIZES,
)

# Experiments

## LFC Error

In [None]:
# estimate_lfc_mean returns a container indexed by sample size; only the
# entry for SIZE is kept for each model.

# Mean-field model (VAE).
lfcs_scVI = estimate_lfc_mean(
    VAE,
    dataset=dataset,
    sizes=[SIZE],
    n_picks=N_PICKS,
    mdl_params=mdl_params["mf"],
    train_params=train_params["mf"],
    train_fn_params=train_fn_params["mf"],
)[SIZE]

# Inverse-autoregressive-flow model (IAVAE).
lfcs_scVI_ia = estimate_lfc_mean(
    IAVAE,
    dataset=dataset,
    sizes=[SIZE],
    n_picks=N_PICKS,
    mdl_params=mdl_params["iaf"],
    train_params=train_params["iaf"],
    train_fn_params=train_fn_params["iaf"],
)[SIZE]

In [None]:
# Pull the log-fold-change estimates of each competitor out of the result.
lfcs_mast = other_predictions["mast"]["lfc"]
print(lfcs_mast.shape)

lfcs_deseq2 = other_predictions["deseq2"]["lfc"]
lfcs_edger = other_predictions["edger"]["lfc"]

# One row per pick, one column per gene.
assert lfcs_mast.shape == (N_PICKS, n_genes), lfcs_mast.shape

In [None]:
# Shapes of all LFC estimates, competitors first, then the scVI variants.
for lfc_estimates in (lfcs_deseq2, lfcs_edger, lfcs_mast, lfcs_scVI, lfcs_scVI_ia):
    print(lfc_estimates.shape)

In [None]:
def l2_err(vals):
    """Return half the NaN-ignoring mean magnitude of `vals` along the last axis.

    Each entry is squared and then square-rooted element-wise (i.e. its
    absolute value is taken), scaled by 0.5, and the result is averaged over
    the last axis with NaNs ignored.

    NOTE(review): despite the name, this is a scaled mean *absolute* value,
    not a Euclidean (root-mean-square) error -- confirm this is intended.
    """
    magnitudes = (vals ** 2) ** (0.5)
    return np.nanmean(0.5 * magnitudes, axis=-1)


# Per-pick error of every method. On this null labelling the error is computed
# directly from the estimated LFCs.
scVI_errs = l2_err(lfcs_scVI)
scVI_ia_errs = l2_err(lfcs_scVI_ia)
deseq2_errs = l2_err(lfcs_deseq2)
edger_errs = l2_err(lfcs_edger)
mast_errs = l2_err(lfcs_mast)

# (legend name, errors) pairs, in plotting order.
labelled_errs = [
    ("scVI", scVI_errs),
    ("scVI IAF", scVI_ia_errs),
    ("DeSeq2", deseq2_errs),
    ("edgeR", edger_errs),
    ("MAST", mast_errs),
]
traces = [go.Box(y=errs, name=label) for label, errs in labelled_errs]

layout = go.Layout(title="L2 Error on Null Real Data")
fig = go.Figure(traces, layout=layout)
# save_fig(fig, filename="pbmc_null_lfc_err", do_cloud=DO_CLOUD)
iplot(fig, filename="pbmc_null_lfc_err")

In [None]:
from scipy.stats import mannwhitneyu, ttest_1samp
import plotly.figure_factory as ff


def a_better_b_sign(a, b):
    """Return the p-value of a one-sided Mann-Whitney U test of `a` against `b`.

    The test is run with ``alternative="less"``, so a small p-value supports
    the values in `a` being lower than those in `b`.
    """
    test_result = mannwhitneyu(a, b, alternative="less")
    # Index 1 of the (statistic, p-value) result is the p-value.
    return test_result[1]


# Method names and their error samples, in matching order.
x = ["scVI", "scVI IAF", "DeSeq2", "edgeR", "MAST"]
vals = [scVI_errs, scVI_ia_errs, deseq2_errs, edger_errs, mast_errs]

# Entry (row, col) is the p-value of the one-sided test that the row method's
# errors are lower than the column method's errors.
mat = [
    [a_better_b_sign(row_errs, col_errs) for col_errs in vals]
    for row_errs in vals
]

fig = ff.create_annotated_heatmap(z=mat, x=x, y=x).update_layout(
    title_text="P values for error test on null data (pval that line a better than col b)"
)

# save_fig(fig, filename="pbmc_null_lfc_err", do_cloud=DO_CLOUD)
iplot(fig, filename="pbmc_null_lfc_err_sign")

## Significant DE genes

In [None]:
# DE probabilities per model. The result is squeezed; a later cell indexes it
# as [pick, gene].

# Mean-field model (VAE).
de_probas_mf = estimate_de_proba(
    VAE,
    dataset=dataset,
    sizes=[SIZE],
    n_trainings=N_TRAININGS,
    n_picks=N_PICKS,
    mdl_params=mdl_params["mf"],
    train_params=train_params["mf"],
    train_fn_params=train_fn_params["mf"],
).squeeze()

# Inverse-autoregressive-flow model (IAVAE).
de_probas_ia = estimate_de_proba(
    IAVAE,
    dataset=dataset,
    sizes=[SIZE],
    n_trainings=N_TRAININGS,
    n_picks=N_PICKS,
    mdl_params=mdl_params["iaf"],
    train_params=train_params["iaf"],
    train_fn_params=train_fn_params["iaf"],
).squeeze()

In [None]:
def compute_nb_predicted(probas_arr):
    """Count, for each pick, how many genes `predict_de_genes` selects.

    `probas_arr` is indexed as ``probas_arr[pick, :]`` for the first
    ``N_PICKS`` picks; each row is passed to `predict_de_genes` with
    ``desired_fdr=Q0`` and the selected entries are summed.

    Returns a float array of length ``N_PICKS``.
    """
    counts_per_pick = np.zeros(N_PICKS)
    for pick_idx in range(N_PICKS):
        # presumably a boolean mask over genes -- verify in scvi.utils
        selected = predict_de_genes(probas_arr[pick_idx, :], desired_fdr=Q0)
        counts_per_pick[pick_idx] = selected.sum()
    return counts_per_pick

def naive_is_de(probas_arr, threshold=None):
    """Count, along the last axis, the p-values that are at most a threshold.

    NaN entries are treated as 0.0 and therefore always counted, which is the
    convention this function has always used.
    NOTE(review): counting NaN p-values as detections looks surprising --
    confirm it is intended.

    The input is no longer modified: NaNs are replaced in a copy, so the
    caller's array (e.g. the stored competitor p-values) keeps its NaNs.

    Parameters
    ----------
    probas_arr : array-like
        P-values; the count is taken over the last axis.
    threshold : float, optional
        Cut-off; defaults to the module-level ``Q0`` when omitted.

    Returns
    -------
    Number of entries ``<= threshold`` along the last axis.
    """
    if threshold is None:
        threshold = Q0
    pvals = np.asarray(probas_arr)
    # np.where builds a new array, leaving the caller's data untouched.
    pvals_no_nan = np.where(np.isnan(pvals), 0.0, pvals)
    return (pvals_no_nan <= threshold).sum(-1)

In [None]:
# scVI variants: number of genes selected per pick from the DE probabilities.
nb_predicted_scVI, nb_predicted_scVI_ia = (
    compute_nb_predicted(de_probas) for de_probas in (de_probas_mf, de_probas_ia)
)

# Competitors: number of p-values passing the cut-off, per pick.
nb_predicted_deseq2, nb_predicted_edger, nb_predicted_mast = (
    naive_is_de(other_predictions[method]['pval'])
    for method in ('deseq2', 'edger', 'mast')
)

In [None]:
# [min, max] of the number of predicted DE genes across picks, per method.
bounds_scVI, bounds_scVI_ia, bounds_deseq2, bounds_edger, bounds_mast = (
    [nb_predicted.min(), nb_predicted.max()]
    for nb_predicted in (
        nb_predicted_scVI,
        nb_predicted_scVI_ia,
        nb_predicted_deseq2,
        nb_predicted_edger,
        nb_predicted_mast,
    )
)

In [None]:
# One-row summary table with one column per method.
# NOTE(review): the "FP" value is max - min of the predicted-DE counts across
# picks (their spread), not a count of false positives -- confirm intended.
pd.DataFrame(
    {
        "FP": [
            upper - lower
            for lower, upper in (
                bounds_scVI,
                bounds_scVI_ia,
                bounds_deseq2,
                bounds_edger,
                bounds_mast,
            )
        ]
    },
    index=['MF', 'IAF', 'DESeq2', 'EdgeR', 'MAST'],
).T

# Old

In [None]:
# scVI_robustness = get_robustness(lfcs_scVI)
# scVI_ia_robustness = get_robustness(lfcs_scVI_ia)
# deseq2_robustness = get_robustness(lfcs_deseq2)
# edge_r_robustness = get_robustness(lfcs_edge_r)
# mast_robustness = get_robustness(lfcs_mast)

# trace1 = go.Box(y=scVI_robustness, name="scVI")
# trace2 = go.Box(y=scVI_ia_robustness, name="scVI IAF")
# trace3 = go.Box(y=deseq2_robustness, name="DeSeq2")
# trace4 = go.Box(y=edge_r_robustness, name="edgeR")
# trace5 = go.Box(y=mast_robustness, name="MAST")
# traces = [trace1, trace2, trace3, trace4, trace5]

# layout = go.Layout(title="Robustness on Null Real Data")

# fig = go.Figure(traces, layout=layout)
# # fig.show()
# iplot(fig, filename='robustness_null_ercc_dataset')

In [None]:
# from scipy.stats import mannwhitneyu, ttest_1samp
# import plotly.figure_factory as ff


# def a_better_b_sign(a, b):
#     _, p = mannwhitneyu(a, b, alternative="less")
#     return p


# vals = [
#     scVI_robustness,
#     scVI_ia_robustness,
#     deseq2_robustness,
#     edge_r_robustness,
#     mast_robustness,
# ]

# x = ["scVI", "scVI IAF", "DeSeq2", "edgeR", "MAST"]

# mat = [[a_better_b_sign(b, a) for a in vals] for b in vals]

# fig = ff.create_annotated_heatmap(z=mat, x=x, y=x)
# fig.update_layout(
#     title_text="P values for robustness test on null data (pval that line a better than col b)"
# )

# iplot(fig, filename='significance_robustness_null_ercc_dataset')