In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import plotly as py
import pandas as pd
from chart_studio.plotly import plot, iplot

# from plotly.offline import init_notebook_mode, iplot
from tqdm import tqdm_notebook

from scvi.dataset import PbmcDataset
from scvi.models import VAE, IAVAE
from scvi.inference import UnsupervisedTrainer
from scvi.utils import (
    demultiply,
    make_dir_if_necessary,
    predict_de_genes,
    save_fig,
    save_pickle,
)
from scvi_utils import (
    estimate_de_proba,
    estimate_lfc_density,
    estimate_lfc_mean,
    train_model,
    multi_train_estimates,
)
from R_interop import all_predictions, all_de_predictions


# --- Experiment configuration -------------------------------------------------
N_EPOCHS = 200  # training epochs, forwarded through `train_fn_params`
DELTA = 0.5  # LFC threshold forwarded as `delta` to `all_de_predictions`
# SIZES = [5, 10, 20, 30, 50, 100]
MODE = "cloud"  # not referenced by the visible cells
SIZE = 100  # default number of cells per population (`sz` in the density helpers)
SIZES = [SIZE]
N_SIZES = len(SIZES)

Q0 = 5e-2  # significance level / desired FDR used for every DE call
N_TRAININGS = 5  # not referenced by the visible cells
N_PICKS = 10  # forwarded as `n_picks` to `all_predictions`

np.random.seed(42)

# The scripts directory used to be a hard-coded absolute path; it can now be
# overridden with the CONQUER_SCRIPTS_PATH environment variable. The default
# is the original location, so existing setups are unaffected.
PATH_TO_SCRIPTS = os.environ.get(
    "CONQUER_SCRIPTS_PATH", "/home/ubuntu/conquer_comparison/scripts"
)
DIR_PATH = "lfc_estimates/pbmc"
make_dir_if_necessary(DIR_PATH)

# Labels of the two populations compared throughout the notebook.
label_a = 0
label_b = 4

In [None]:
import chart_studio.plotly as py

# SECURITY: credentials must never be committed to the notebook. They are read
# from the environment instead. The API key that was previously hard-coded
# here has been exposed and should be revoked/regenerated.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    py.sign_in(_plotly_username, _plotly_api_key)
else:
    # Local `fig.show()` cells do not call `sign_in`; only the `iplot` uploads do.
    print(
        "PLOTLY_USERNAME / PLOTLY_API_KEY are not set: skipping Chart Studio "
        "sign-in, `iplot` uploads will not be authenticated."
    )

# Import Dataset

In [None]:
dataset = PbmcDataset()
n_genes = dataset.nb_genes

# Count how many cells carry each label and plot the counts per cell type.
unique_elements, counts_elements = np.unique(
    dataset.labels.squeeze(), return_counts=True
)
df = pd.DataFrame({"counts": counts_elements, "cell_types": dataset.cell_types})
fig = px.scatter(df, x="cell_types", y="counts")
fig.show()

In [None]:
print("Cell types: ", dataset.cell_types)
print('Gene names: ', dataset.gene_names)

# Index the DE metadata by 'ENSG' and reorder its rows to follow
# `dataset.gene_names`, so that row i lines up with gene i of the dataset.
microarray_info = dataset.de_metadata.set_index('ENSG')
microarray_info = microarray_info.loc[dataset.gene_names]

display(dataset.de_metadata.head())
# `DataFrame.info()` prints its report itself and returns None; wrapping it
# in `print` only appended a stray "None" to the output.
dataset.de_metadata.info()

In [None]:
# Draw 3000 random cells as the held-out set (`TEST_INDICES` is later passed as
# `test_indices` in `train_params`) and dump their counts and labels to disk;
# the two paths are handed to `all_predictions` in the competitors section.
n_examples = len(dataset)
labels = dataset.labels.squeeze()
# interesting_indices = np.where((labels == label_a) | (labels == label_b))[0]
# TEST_INDICES = np.random.permutation(interesting_indices)[:800]
TEST_INDICES = np.random.permutation(len(dataset))[:3000]

x_test, y_test = dataset.X[TEST_INDICES, :], dataset.labels[TEST_INDICES, :].squeeze()
data_path = os.path.join(DIR_PATH, 'data.npy')
labels_path = os.path.join(DIR_PATH, 'labels.npy')

# Counts are densified and cast to int before being saved in binary .npy format.
np.save(
    data_path,
    np.array(x_test.todense()).squeeze().astype(int)
)
# NOTE(review): `np.savetxt` writes plain text even though the file is named
# `labels.npy` -- confirm this is the format the downstream reader expects.
np.savetxt(
    labels_path,
    y_test.squeeze()
)

## Train parameters

In [None]:
# Keyword arguments for each model variant, keyed by variant name:
#   "mf" / "iaf"       -> parameters used with VAE / IAVAE in the cells below
#   "*_k5"             -> same architecture, trained with k_importance_weighted=5
_MF_ARCH = {"n_hidden": 128, "n_layers": 1, "n_latent": 10}
_IAF_ARCH = {"n_hidden": 128, "n_layers": 1, "do_h": True, "n_latent": 10, "t": 4}

# Model-constructor arguments (each variant gets its own dict copy).
mdl_params = {
    "iaf": dict(_IAF_ARCH),
    "mf": dict(_MF_ARCH),
    "iaf_k5": dict(_IAF_ARCH),
    "mf_k5": dict(_MF_ARCH),
}

# Trainer arguments; every variant is evaluated on the same held-out cells.
train_params = {
    "iaf": dict(ratio_loss=True, test_indices=TEST_INDICES),
    "mf": dict(ratio_loss=True, test_indices=TEST_INDICES),
    "iaf_k5": dict(
        ratio_loss=True, test_indices=TEST_INDICES, k_importance_weighted=5
    ),
    "mf_k5": dict(
        ratio_loss=True, test_indices=TEST_INDICES, k_importance_weighted=5
    ),
}

# Arguments of the training call itself: identical for all variants.
train_fn_params = {
    variant: dict(n_epochs=N_EPOCHS, lr=1e-2)
    for variant in ("iaf", "mf", "iaf_k5", "mf_k5")
}

# Competitors


In [None]:
# Predictions of the competing methods (helpers imported from `R_interop`),
# computed from the dumped held-out cells for the (label_a, label_b) comparison.
# Later cells read the "mast", "deseq2" and "edger" entries of the result.
other_predictions = all_predictions(
    filename=os.path.join(DIR_PATH, "all_predictions.pickle"),
    n_genes=n_genes,
    n_picks=N_PICKS,
    sizes=SIZES,
    data_path=data_path,
    labels_path=labels_path,
    path_to_scripts=PATH_TO_SCRIPTS,
    label_a=label_a,
    label_b=label_b,
    all_nature=True
)

# Second pass with the significance level and LFC threshold; later cells read
# the "is_de" field of each method from this result.
other_predictions = all_de_predictions(
    other_predictions, significance_level=Q0, delta=DELTA
)

In [None]:
# # Modif DESeq2
# from R_interop import NDESeq2
# from tqdm import tqdm

# lfcs_deseq2 = np.zeros((N_SIZES, N_PICKS, n_genes))
# pvals_deseq2 = np.zeros((N_SIZES, N_PICKS, n_genes))
# for (size_ix, size) in enumerate(tqdm(SIZES)):
#     for exp in range(N_PICKS):
#         deseq_inference = NDESeq2(
#             A=size,
#             B=size,
#             data=data_path,
#             labels=labels_path,
#             cluster=(label_a, label_b),
#             path_to_scripts=PATH_TO_SCRIPTS,
#             lfc_threshold=DELTA
#         )
#         res_df = deseq_inference.fit()
#         lfcs_deseq2[size_ix, exp, :] = res_df["lfc"].values
#         pvals_deseq2[size_ix, exp, :] = res_df["padj"].values
# deseq_res = dict(lfc=lfcs_deseq2.squeeze(), pval=pvals_deseq2.squeeze())

# from scvi.utils import save_pickle

# print(deseq_res['pval'].shape)
# print(other_predictions['edger']['pval'].shape)

# other_predictions["deseq2"] = deseq_res
# save_pickle(data=other_predictions, filename=os.path.join(DIR_PATH, "all_predictions.pickle"))

# Experiments

## Microarray

In [None]:
from sklearn.preprocessing import StandardScaler

# Keep the `new_n_genes` genes with the largest variance across all cells.
new_n_genes = 110
# with_mean=False: only the per-gene variance (`var_`) is needed, no centering.
std_scaler = StandardScaler(with_mean=False)
std_scaler.fit(dataset.X.astype(np.float64))
# argsort is ascending, so reverse it before taking the top entries.
subset_genes = np.argsort(std_scaler.var_)[::-1][:new_n_genes]

# subset_genes = np.arange(n_genes)

### BDT : size 100

In [None]:
def bdt_densities(
    filename, mdl_class, dataset, mdl_params, train_params, train_fn_params, sz=SIZE
):
    """Estimate LFC densities between labels 0 and 4 for one sample size.

    Thin wrapper around `estimate_lfc_density` with a single pick
    (`n_picks=1`), `n_samples=100` and the label pair fixed to (0, 4).

    Parameters
    ----------
    filename : path forwarded to `estimate_lfc_density`.
    mdl_class : model class to train (VAE or IAVAE in this notebook).
    dataset : dataset forwarded to `estimate_lfc_density`.
    mdl_params, train_params, train_fn_params : dicts forwarded unchanged.
    sz : sample size; the only entry of `sizes` and the key read back.

    Returns
    -------
    The entry of the result stored under `sz`, squeezed.
    """
    densities_by_size = estimate_lfc_density(
        filename=filename,
        mdl_class=mdl_class,
        dataset=dataset,
        mdl_params=mdl_params,
        train_params=train_params,
        train_fn_params=train_fn_params,
        sizes=[sz],
        n_picks=1,
        label_a=0,
        label_b=4,
        n_samples=100,
    )
    return densities_by_size[sz].squeeze()


# LFC densities for SIZE (= 100) cells per population, mean-field model (VAE).
lfcs_mf = bdt_densities(
    filename=os.path.join(DIR_PATH, "bdt100MF_new.pickle"),
    mdl_class=VAE,
    dataset=dataset,
    mdl_params=mdl_params["mf"],
    train_params=train_params["mf"],
    train_fn_params=train_fn_params["mf"],
)

# Same experiment with the IAVAE model and the "iaf" parameter sets.
lfcs_ia = bdt_densities(
    filename=os.path.join(DIR_PATH, "bdt100IAF_new.pickle"),
    mdl_class=IAVAE,
    dataset=dataset,
    mdl_params=mdl_params["iaf"],
    train_params=train_params["iaf"],
    train_fn_params=train_fn_params["iaf"],
)

# lfcs_iwia = estimate_lfc_density(
#     IAVAE,
#     dataset=dataset,
#     mdl_params=mdl_params["iaf_k5"],
#     train_params=train_params["iaf_k5"],
#     train_fn_params=train_fn_params["iaf_k5"],
#     sizes=[SIZE],
#     n_picks=1,
#     label_a=label_a,
#     label_b=label_b
# )[SIZE].squeeze()

# lfcs_iwmf = estimate_lfc_density(
#     IAVAE,
#     dataset=dataset,
#     mdl_params=mdl_params["mf_k5"],
#     train_params=train_params["mf_k5"],
#     train_fn_params=train_fn_params["mf_k5"],
#     sizes=[SIZE],
#     n_picks=1,
#     label_a=label_a,
#     label_b=label_b
# )[SIZE].squeeze()

In [None]:
# Keep the size-100 densities over ALL genes, flattened to (-1, n_genes);
# `lfcs_mf` / `lfcs_ia` are overwritten by the size-10 runs further down.
lfcs_ia_100_all = lfcs_ia.reshape((-1, n_genes))
lfcs_mf_100_all = lfcs_mf.reshape((-1, n_genes))

In [None]:
from plotly.subplots import make_subplots

# Restrict the size-100 densities to the selected high-variance genes.
# FIX: the two assignments used to be crossed (`lfcs_mf_est` was built from
# `lfcs_ia` and `lfcs_ia_est` from `lfcs_mf`), so each panel of the figure
# showed the other model's estimates under the wrong title.
lfcs_mf_est = lfcs_mf.reshape((-1, n_genes))[:, subset_genes]
lfcs_ia_est = lfcs_ia.reshape((-1, n_genes))[:, subset_genes]
# Copies of the size-100 estimates, kept under separate names.
lfcs_mf_est_100 = lfcs_mf_est.copy()
lfcs_ia_est_100 = lfcs_ia_est.copy()
# Ground-truth LFC: negated microarray BDC_logFC for the same genes.
lfcs_gt = - microarray_info.BDC_logFC[subset_genes]

print(lfcs_mf_est.shape)
print(lfcs_ia_est.shape)
print(lfcs_gt.shape)

# One panel per model, sharing both axes.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Mean Field", "Inverse Autoregressive Flows"),
    shared_xaxes=True,
    shared_yaxes=True,
)


def add_plot(fig, lfcs_est_m, lfcs_est_err, row, col):
    """Add a predicted-vs-ground-truth LFC scatter to one subplot of `fig`.

    The x values are read from the notebook-level `lfcs_gt` at call time, so
    that variable must match the genes of `lfcs_est_m` when this is called.

    Parameters
    ----------
    fig : figure the trace is added to (modified in place).
    lfcs_est_m : y values of the markers.
    lfcs_est_err : error-bar half-lengths drawn around each marker.
    row, col : subplot position forwarded to `fig.add_trace`.
    """
    error_bars = dict(type="data", array=lfcs_est_err, visible=True)
    trace = go.Scatter(
        x=lfcs_gt,
        y=lfcs_est_m,
        error_y=error_bars,
        mode="markers",
    )
    fig.add_trace(trace, row=row, col=col)


# For each panel: markers with +/- 2 std error bars, then a dashed y = x
# reference segment. Tuples are (estimates, subplot column, segment half-width).
for _estimates, _column, _extent in ((lfcs_mf_est, 1, 5), (lfcs_ia_est, 2, 3)):
    add_plot(
        fig, _estimates.mean(0), 2.0 * _estimates.std(0), row=1, col=_column
    )
    fig.add_trace(
        go.Scatter(
            x=[-_extent, _extent],
            y=[-_extent, _extent],
            mode="lines",
            line=dict(color="black", width=4, dash="dash"),
        ),
        row=1,
        col=_column,
    )

fig.update_xaxes(title_text="Ground Truth LFC", row=1, col=1)
fig.update_yaxes(title_text="Predicted LFC", row=1, col=1)

fig.update_layout(
    height=600,
    width=1000,
    title_text="LFC estimation for {} sample cells B cells/DT cells".format(SIZE),
)
# iplot(fig, filename="pbmc_microarray_lfc_with_uncertainty_{}cells_BDT".format(SIZE), sharing="private")
fig.show()

### BDT Size 10

In [None]:
# Same experiment with only 10 cells per population.
# This overwrites the size-100 `lfcs_mf` / `lfcs_ia` variables.
lfcs_mf = bdt_densities(
    filename=os.path.join(DIR_PATH, "bdt10MF.pickle"),
    mdl_class=VAE,
    dataset=dataset,
    mdl_params=mdl_params["mf"],
    train_params=train_params["mf"],
    train_fn_params=train_fn_params["mf"],
    sz=10
)

# NOTE(review): "bdt10IAFF.pickle" looks like a typo for "IAF"; left as is
# because the string is passed as the file name of this run.
lfcs_ia = bdt_densities(
    filename=os.path.join(DIR_PATH, "bdt10IAFF.pickle"),
    mdl_class=IAVAE,
    dataset=dataset,
    mdl_params=mdl_params["iaf"],
    train_params=train_params["iaf"],
    train_fn_params=train_fn_params["iaf"],
    sz=10
)

In [None]:
# Size-10 estimates over all genes (the gene subset is applied when plotting).
# FIX: the two assignments used to be crossed, so the panel titled
# "Mean Field" actually displayed the IAF estimates.
lfcs_mf_est = lfcs_mf.reshape((-1, n_genes))
lfcs_ia_est = lfcs_ia.reshape((-1, n_genes))
lfcs_gt = - microarray_info.BDC_logFC[subset_genes]

# Only the mean-field panel is drawn, so the grid is 1x1 with a single title
# (the second title previously passed had no subplot to attach to).
fig = make_subplots(
    rows=1,
    cols=1,
    subplot_titles=("Mean Field",),
    shared_xaxes=True,
    shared_yaxes=True,
)

# Markers with +/- 2 std error bars for the selected genes.
add_plot(
    fig,
    lfcs_mf_est.mean(0)[subset_genes],
    2.0 * lfcs_mf_est.std(0)[subset_genes],
    row=1,
    col=1,
)
# Dashed y = x reference line.
fig.add_trace(
    go.Scatter(
        x=[-5, 5],
        y=[-5, 5],
        mode="lines",
        line=dict(color="black", width=4, dash="dash"),
    ),
    row=1,
    col=1,
)

fig.update_xaxes(title_text="Ground Truth LFC", row=1, col=1)
fig.update_yaxes(title_text="Predicted LFC", row=1, col=1)

fig.update_layout(
    height=600, width=1000, title_text="LFC estimation for {} sample cells B cells/DT cells".format(10)
)

# Uploads the figure to Chart Studio (requires a prior successful sign-in).
iplot(fig, filename="pbmc_microarray_lfc_with_uncertainty_{}cellsBDT".format(10), sharing="private")

### BDT Other techniques

In [None]:
# Ground-truth LFC over ALL genes (negated BDC_logFC); replaces the subset version.
lfcs_gt = -microarray_info.BDC_logFC

#### Computations

In [None]:
# Naive baseline: log2 ratio of the mean counts of (up to) 100 random held-out
# cells from each of the two populations.
where_a = np.flatnonzero(y_test == label_a)
where_b = np.flatnonzero(y_test == label_b)
idx_a = np.random.permutation(where_a)[:100]
idx_b = np.random.permutation(where_b)[:100]

h_a = x_test[idx_a].mean(axis=0)
h_b = x_test[idx_b].mean(axis=0)
# A zero mean gives +/-inf (clipped to +/-5 below); two zero means give NaN
# (set to 0 afterwards).
lfc_baseline = np.asarray(np.log2(h_a) - np.log2(h_b)).squeeze()
lfc_baseline = np.clip(lfc_baseline, -5, 5)
lfc_baseline[np.isnan(lfc_baseline)] = 0.0

In [None]:
# MAST outputs: sign-flipped LFC and standard deviation (sqrt of "var_lfc").
mast_predictions = other_predictions["mast"]
lfcs_mast = -mast_predictions["lfc"]
stds_mast = np.sqrt(mast_predictions["var_lfc"].squeeze())

# Missing estimates are replaced by 0 (both arrays are new objects, so the
# cached predictions are not modified here).
lfcs_mast[np.isnan(lfcs_mast)] = 0.0
stds_mast[np.isnan(stds_mast)] = 0.0

In [None]:
# DESeq2 LFCs are used with their original sign. Work on a copy: the array
# used to be aliased, so the NaN replacement below silently rewrote
# `other_predictions["deseq2"]["lfc"]` as well.
lfcs_deseq2 = other_predictions["deseq2"]["lfc"].copy()
lfcs_deseq2[np.isnan(lfcs_deseq2)] = 0.0

# EdgeR LFCs are sign-flipped (the negation already yields a new array).
lfcs_edger = -other_predictions["edger"]["lfc"]
lfcs_edger[np.isnan(lfcs_edger)] = 0.0

In [None]:
import statsmodels.api as sm
# Each method's point estimate is regressed on the microarray BDC_logFC.
# The plots compare estimates with `lfcs_gt = -BDC_logFC`, so every estimate
# is negated here to be expressed in the orientation of BDC_logFC itself.
# FIX: "EdgeR" and "Baseline" were the only entries left un-negated, which
# flipped the sign of their reported slope relative to the other methods
# (R^2 is unaffected by the sign).
mdls = [
    (-lfcs_mf_100_all.mean(0), "MF"),
    (-lfcs_ia_100_all.mean(0), "IAF"),
    (-lfcs_mast[-1, :], "MAST"),
    (-lfcs_deseq2[-1, :], "DESeq2"),
    (-lfcs_edger[-1, :], "EdgeR"),
    (-lfc_baseline, "Baseline"),
]

# The design matrix (intercept + BDC_logFC) is the same for every method.
X = sm.add_constant(microarray_info.BDC_logFC)

# name -> {"rsquared": adjusted R^2, "coef": slope on BDC_logFC}
reg_results = dict()
for preds, name in mdls:
    y = preds
    model = sm.OLS(y, X).fit()
    display(name, model.summary())
    reg_results[name] = dict(rsquared=model.rsquared_adj, coef=model.params.BDC_logFC)

#### Plots

In [None]:
# Methods for which a regression summary is available.
reg_results.keys()

In [None]:
# Scratch check of the two-decimal format string used in the legend entries below.
"{0:.2f}, {1:.2f}".format(1000.01022, 0.0100020)

In [None]:
# Point predictions of each method against the ground truth, restricted to
# `subset_genes`; legend entries carry the regression summary of each method.
layout = go.Layout(title_text="LFC point Predictions")
fig = go.Figure(layout=layout)

fig.add_traces(
    [
        go.Scatter(
            x=lfcs_gt[subset_genes],
            # FIX: was `lfcs_mf_est.mean(0)`, which at this point holds the
            # size-10 run over all genes: wrong experiment and wrong length
            # for the subset on the x axis. The size-100 densities are the
            # ones summarised in reg_results["MF"].
            y=lfcs_mf_100_all[:, subset_genes].mean(0),
            #             error_y=dict(type="data", array=2.0*stds_mast, visible=True),
            mode="markers",
            name="MF @R^2 : {0:.2f}, Slope: {1:.2f}".format(
                reg_results["MF"]["rsquared"], reg_results["MF"]["coef"]
            ),
        ),
        go.Scatter(
            x=lfcs_gt[subset_genes],
            y=lfcs_deseq2[-1, subset_genes],
            mode="markers",
            name="DESeq2 @R^2 : {0:.2f}, Slope: {1:.2f}".format(
                reg_results["DESeq2"]["rsquared"], reg_results["DESeq2"]["coef"]
            ),
        ),
        go.Scatter(
            x=lfcs_gt[subset_genes],
            # FIX: the baseline covers all genes; select the same subset as x.
            y=lfc_baseline[subset_genes],
            mode="markers",
            name="Baseline @R^2 : {0:.2f}, Slope: {1:.2f}".format(
                reg_results["Baseline"]["rsquared"], reg_results["Baseline"]["coef"]
            ),
        ),
        # Dashed y = x reference line, hidden from the legend.
        go.Scatter(
            x=[-5, 5],
            y=[-5, 5],
            mode="lines",
            line=dict(color="black", width=4, dash="dash"),
            name="Reference",
            showlegend=False
        ),
    ]
)


#         go.Scatter(
#             x=lfcs_gt[subset_genes],
#             y=lfcs_edger[-1, subset_genes],
#             mode="markers",
#             name="EdgeR",
#             text=[
#                 "R^2 : {0:.2f}, Slope: {1:.2f}".format(
#                     reg_results["EdgeR"]["rsquared"], reg_results["EdgeR"]["coef"]
#                 )
#             ],
#         ),

#         go.Scatter(
#             x=lfcs_gt[subset_genes],
#             y=lfcs_mast[-1, subset_genes],
#             #             error_y=dict(type="data", array=2.0*stds_mast, visible=True),
#             mode="markers",
#             name="MAST",
#             text=[
#                 "R^2 : {0:.2f}, Slope: {1:.2f}".format(
#                     reg_results["MAST"]["rsquared"], reg_results["MAST"]["coef"]
#                 )
#             ],
#         ),

# Render locally; the Chart Studio upload below is kept disabled.
fig.show()
# iplot(fig, filename="pbmc_microarray_diags", sharing="private")

**When all genes are taken into account, scVI clearly predicts the LFC better than its competitors.**

### CD: size 100

In [None]:
# List the available microarray columns before switching to the CD comparison.
dataset.de_metadata.info()

In [None]:
# Cell-type names, to look up which labels the CD comparison should use.
dataset.cell_types

In [None]:
def cd_densities(
    filename, mdl_class, dataset, mdl_params, train_params, train_fn_params, sz=SIZE
):
    """Estimate LFC densities between labels 1 and 3 for one sample size.

    Same wrapper as the BDT helper, with the label pair fixed to (1, 3):
    a single pick (`n_picks=1`) and `n_samples=100`.

    NOTE(review): the debug cells at the end of the notebook compare labels
    2 and 3 against CD_logFC -- confirm that (1, 3) is the intended pair here.

    Parameters
    ----------
    filename : path forwarded to `estimate_lfc_density`.
    mdl_class : model class to train.
    dataset : dataset forwarded to `estimate_lfc_density`.
    mdl_params, train_params, train_fn_params : dicts forwarded unchanged.
    sz : sample size; the only entry of `sizes` and the key read back.

    Returns
    -------
    The entry of the result stored under `sz`, squeezed.
    """
    densities_by_size = estimate_lfc_density(
        filename=filename,
        mdl_class=mdl_class,
        dataset=dataset,
        mdl_params=mdl_params,
        train_params=train_params,
        train_fn_params=train_fn_params,
        sizes=[sz],
        n_picks=1,
        label_a=1,
        label_b=3,
        n_samples=100,
    )
    return densities_by_size[sz].squeeze()

In [None]:
# CD comparison with SIZE cells per population, mean-field model only
# (the IAF run below is disabled). Overwrites `lfcs_mf`.
lfcs_mf = cd_densities(
    filename=os.path.join(DIR_PATH, "cd100MF23.pickle"),
    mdl_class=VAE,
    dataset=dataset,
    mdl_params=mdl_params["mf"],
    train_params=train_params["mf"],
    train_fn_params=train_fn_params["mf"],
)

# lfcs_ia = cd_densities(
#     filename=os.path.join(DIR_PATH, "cd100IAF12.pickle"),
#     mdl_class=IAVAE,
#     dataset=dataset,
#     mdl_params=mdl_params["mf"],
#     train_params=train_params["mf"],
#     train_fn_params=train_fn_params["mf"],
# )


In [None]:
# From here on `subset_genes` selects ALL genes (replaces the high-variance subset).
subset_genes = np.arange(n_genes)

In [None]:
# FIX: `lfcs_mf_est` used to be built from `lfcs_ia`, which at this point
# still holds the BDT size-10 IAF run (the CD IAF run is disabled), so the
# "Mean Field" panel compared an unrelated experiment with the CD ground
# truth. It is now built from the CD mean-field densities. No IAF estimate
# is derived here since no CD IAF model is trained.
lfcs_mf_est = lfcs_mf.reshape((-1, n_genes))[:, subset_genes]
lfcs_gt = - microarray_info.CD_logFC[subset_genes]

# Two-panel layout kept from the other figures; only the first one is filled.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Mean Field", "Inverse Autoregressive Flows"),
    shared_xaxes=True,
    shared_yaxes=True,
)

# NOTE(review): error bars are 0.05 * std here versus 2 * std in the BDT
# figures -- confirm this scaling is intentional.
add_plot(fig, lfcs_mf_est.mean(0), 0.05*lfcs_mf_est.std(0), row=1, col=1)
# Dashed y = x reference line.
fig.add_trace(
    go.Scatter(
        x=[-5, 5],
        y=[-5, 5],
        mode="lines",
        line=dict(color="black", width=4, dash="dash"),
    ),
    row=1,
    col=1,
)

# The former `row=2` axis updates were dropped: the grid has a single row.
fig.update_xaxes(title_text="Ground Truth LFC", row=1, col=1)
fig.update_yaxes(title_text="Predicted LFC", row=1, col=1)

fig.update_layout(
    height=600, width=1000, title_text="LFC estimation for {} sample CD cells".format(SIZE)
)

# iplot(fig, filename="pbmc_microarray_lfc_with_uncertainty_{}cellsCD".format(SIZE), sharing="private")
fig.show()

### CD: size 10

In [None]:
# lfcs_mf = cd_densities(
#     filename=os.path.join(DIR_PATH, "cd10MF.pickle"),
#     mdl_class=VAE,
#     dataset=dataset,
#     mdl_params=mdl_params["mf"],
#     train_params=train_params["mf"],
#     train_fn_params=train_fn_params["mf"],
#     sz=10
# )

# lfcs_ia = cd_densities(
#     filename=os.path.join(DIR_PATH, "cd10IAF.pickle"),
#     mdl_class=IAVAE,
#     dataset=dataset,
#     mdl_params=mdl_params["mf"],
#     train_params=train_params["mf"],
#     train_fn_params=train_fn_params["mf"],
#     sz=10
# )

In [None]:
# lfcs_mf_est = lfcs_ia.reshape((-1, n_genes))[:, subset_genes]
# lfcs_ia_est = lfcs_mf.reshape((-1, n_genes))[:, subset_genes]
# lfcs_gt = - microarray_info.BDC_logFC[subset_genes]

# fig = make_subplots(
#     rows=1,
#     cols=2,
#     subplot_titles=("Mean Field", "Inverse Autoregressive Flows"),
#     shared_xaxes=True,
#     shared_yaxes=True,
# )

# add_plot(fig, lfcs_mf_est.mean(0), 2.0*lfcs_mf_est.std(0), row=1, col=1)
# fig.add_trace(
#     go.Scatter(
#         x=[-5, 5],
#         y=[-5, 5],
#         mode="lines",
#         line=dict(color="black", width=4, dash="dash"),
#     ),
#     row=1,
#     col=1,
# )
# add_plot(fig, lfcs_ia_est.mean(0), 2.0*lfcs_ia_est.std(0), row=1, col=2)
# fig.add_trace(
#     go.Scatter(
#         x=[-5, 5],
#         y=[-5, 5],
#         mode="lines",
#         line=dict(color="black", width=4, dash="dash"),
#     ),
#     row=1,
#     col=2,
# )

# fig.update_xaxes(title_text="Ground Truth LFC", row=1, col=1)
# fig.update_xaxes(title_text="Ground Truth LFC", row=2, col=1)
# fig.update_yaxes(title_text="Predicted LFC", row=1, col=1)
# fig.update_yaxes(title_text="Predicted LFC", row=2, col=1)

# fig.update_layout(
#     height=600, width=1000, title_text="LFC estimation for {} sample CD cells".format(10)
# )

# iplot(fig, filename="pbmc_microarray_lfc_with_uncertainty_{}cellsCD".format(10), sharing="private")

## Overlap

Venn diagram of the genes called differentially expressed by each method.

In [None]:
# Files already present in the results directory.
os.listdir(DIR_PATH)

In [None]:
def _de_probas_size100(filename, mdl_class, variant):
    """Call `estimate_de_proba` with the settings shared by both models.

    One training, one pick of 100 cells per population, 300 samples, for the
    notebook-level (label_a, label_b) pair; the result is squeezed.
    `variant` selects the entries of the three parameter dictionaries.
    """
    return estimate_de_proba(
        filename=os.path.join(DIR_PATH, filename),
        mdl_class=mdl_class,
        dataset=dataset,
        mdl_params=mdl_params[variant],
        train_params=train_params[variant],
        train_fn_params=train_fn_params[variant],
        sizes=[100],
        n_trainings=1,
        n_picks=1,
        label_a=label_a,
        n_samples=300,
        label_b=label_b,
    ).squeeze()


# Mean-field model first, then IAF (same order as before).
de_probas_mf = _de_probas_size100("de_probas_mfV2.npy", VAE, "mf")
de_probas_iaf = _de_probas_size100("de_probas_iafV2.npy", IAVAE, "iaf")

In [None]:
# Turn the DE probabilities into gene calls at the desired FDR (Q0).
is_pred_de_mf = predict_de_genes(de_probas_mf, desired_fdr=Q0)
is_pred_de_iaf = predict_de_genes(de_probas_iaf, desired_fdr=Q0)

In [None]:
# Shape check of the DESeq2 p-value and LFC arrays.
print(other_predictions["deseq2"]["pval"].shape, other_predictions["deseq2"]["lfc"].shape)

In [None]:
# DE calls of the competing methods ("is_de" field of each prediction dict).
is_pred_deseq2 = other_predictions["deseq2"]["is_de"]
is_pred_edger = other_predictions["edger"]["is_de"]
is_pred_mast = other_predictions["mast"]["is_de"]

In [None]:
# from matplotlib_venn import venn3

# labels = np.arange(n_genes)
# de_genes_scvi = set(labels[is_pred_de_mf])
# de_genes_deseq2 = set(labels[is_pred_deseq2])
# de_genes_edger = set(labels[is_pred_edger])
# de_genes_mast = set(labels[is_pred_mast])

# venn3(subsets=[de_genes_scvi, de_genes_deseq2, de_genes_mast], set_labels=['scVI', 'DESeq2', "MAST"])
# plt.show()
# venn3(subsets=[de_genes_scvi, de_genes_deseq2, de_genes_edger], set_labels=['scVI', 'DESeq2', "EdgeR"])
# plt.show()

## FDR and FNR (sanity check)

In [None]:
# def get_fnr_fdr(y_true, y_pred):
#     return dict(
#         fnr=(y_true * (~y_pred)).sum() / (y_true).sum(),
#         fdr=((~y_true) * (y_pred)).sum() / (y_pred).sum(),
#     )


# res_df = pd.DataFrame(
#     dict(
#         MF=get_fnr_fdr(is_pred_de_mf, is_significant_de),
#         DESeq2=get_fnr_fdr(is_pred_deseq2, is_significant_de),
#         EdgeR=get_fnr_fdr(is_pred_edger, is_significant_de),
#         MAST=get_fnr_fdr(is_pred_mast, is_significant_de),
#     )
# ).T

# res_df.plot.bar()

## PR Curves (sanity check)

In [None]:
from sklearn.metrics import precision_recall_curve

# Per-gene scores where "higher = more likely DE": DE probabilities for scVI,
# negated p-values (row 0 of each array) for the competitors.
# NOTE(review): the LFC regressions read row -1 of the competitor arrays while
# row 0 is used here -- confirm which pick is intended.
preds_mf = de_probas_mf
preds_iaf = de_probas_iaf
preds_deseq2 = -other_predictions['deseq2']['pval'][0, :]
preds_edger = -other_predictions['edger']['pval'][0, :]
preds_mast = -other_predictions['mast']['pval'][0, :]

In [None]:
# Column overview of the microarray metadata (used to pick the p-value column).
dataset.de_metadata.info()

In [None]:
# Ground-truth DE genes: microarray adjusted p-value at most Q0.
# FIX: read the column from `microarray_info` (the metadata re-indexed to
# follow `dataset.gene_names`) rather than from the raw `de_metadata`, whose
# row order is not guaranteed to match the per-gene prediction arrays.
is_significant_de = (microarray_info["BDC_adj.P.Val"] <= Q0)
# * (dataset.de_metadata["BDC_logFC"].abs() >= DELTA)

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

def plot_pr(fig, preds, y_true, name):
    """Add the precision-recall curve of one method to `fig`.

    Parameters
    ----------
    fig : figure the curve is added to (modified in place).
    preds : per-gene scores, higher meaning "more likely DE"; may contain NaN.
    y_true : ground-truth DE indicator, aligned with `preds`.
    name : method name; the average precision is appended to the legend entry.

    Notes
    -----
    NaN scores are replaced by the smallest finite score so that they rank
    last. This is done on a copy and before any metric is computed: the
    original code computed the average precision on the raw scores (NaNs
    included) and overwrote the caller's array in place.
    """
    scores = np.array(preds, dtype=float)  # private copy, caller's data untouched
    missing = np.isnan(scores)
    if missing.any():
        scores[missing] = scores[~missing].min()

    average_precision = average_precision_score(y_true, scores)
    # Scores are passed positionally: the keyword name of this argument
    # differs between scikit-learn versions.
    precs, recs, _ = precision_recall_curve(y_true, scores)
    fig.add_trace(
        go.Scatter(
            x=recs,
            y=precs,
            name=name+'@AP: {0:0.2f}'.format(average_precision)
        )
    )
# One precision-recall curve per method on a single figure.
layout = go.Layout(
    title='Precision Recall Curves',
    xaxis={'title': 'Recall'},
    yaxis={'title': 'Precision'},
    width=800,
    height=600,
)
fig = go.Figure(layout=layout)
for _scores, _method in (
    (preds_mf, 'MF'),
    (preds_iaf, 'IAF'),
    (preds_deseq2, 'DESeq2'),
    (preds_edger, 'EdgeR'),
    (preds_mast, 'MAST'),
):
    plot_pr(fig=fig, preds=_scores, y_true=is_significant_de, name=_method)

# Uploads the figure to Chart Studio (requires a prior successful sign-in).
iplot(fig, filename="pbmc_microarray_pr_curves", sharing="private")

## Robustness

As Usual Graph

## Concordance

In [None]:
from sklearn.metrics import precision_recall_curve

# Rebuild the per-gene scores (higher = more likely DE) for the concordance
# analysis: DE probabilities for scVI, negated p-values (row 0) for the others.
preds_mf = de_probas_mf
preds_iaf = de_probas_iaf
preds_deseq2 = -other_predictions['deseq2']['pval'][0, :]
preds_edger = -other_predictions['edger']['pval'][0, :]
preds_mast = -other_predictions['mast']['pval'][0, :]

In [None]:
# Shape check: every score vector should have one entry per gene.
for _scores in (preds_mf, preds_iaf, preds_deseq2, preds_edger, preds_mast):
    print(_scores.shape)

### K best

In [None]:
K = 100


def get_K_best(preds):
    """Return the indices of the `K` highest scores, best first.

    `preds` is a 1-D array of scores; `K` is the notebook-level constant
    defined just above.
    """
    # Negating the scores turns the ascending argsort into a descending ranking.
    return np.argsort(-preds)[:K]

# Top-K gene indices of each method, best first.
best_mf = get_K_best(preds_mf)
best_iaf = get_K_best(preds_iaf)
best_deseq2 = get_K_best(preds_deseq2)
best_edger = get_K_best(preds_edger)
best_mast = get_K_best(preds_mast)

def get_aucc_couple(best1, best2, k_val=K):
    """Area under the concordance curve between two rankings.

    For every k in 1..k_val the number of genes shared by the top-k of both
    rankings is counted; the counts are summed and divided by the value the
    sum takes for two identical rankings, k_val * (k_val + 1) / 2.

    Parameters
    ----------
    best1, best2 : 1-D arrays of gene indices, best first.
    k_val : deepest rank taken into account.

    Returns
    -------
    Score in [0, 1]; exactly 1 when the first `k_val` entries agree at every
    depth.

    Notes
    -----
    The original loop stopped at k_val - 1 and normalized by k_val ** 2 / 2,
    so two identical rankings scored (k_val - 1) / k_val instead of 1.
    """
    k_vals = np.arange(1, k_val + 1)  # inclusive of k_val
    concordances = np.array(
        [len(np.intersect1d(best1[:k], best2[:k])) for k in k_vals]
    )
    return concordances.sum() / (k_val * (k_val + 1) / 2)

# Normalization check: the score of a ranking against itself (should be ~1).
print(get_aucc_couple(np.arange(K), np.arange(K)))

methods = [
    best_mf,
    best_iaf,
    best_deseq2,
    best_edger,
    best_mast,
]
# Display names, in the same order as `methods`; also used by the heatmap cell.
labels = [
    "MF",
    "IAF",
    "DESeq2",
    "EdgeR",
    "MAST",
]

# Symmetric matrix of pairwise scores with ones on the diagonal. Only the
# upper triangle is computed and then mirrored. The loop indices are not
# named `idx_a` / `idx_b` so that they do not overwrite the sample-index
# arrays of the baseline cell.
concs_mat = np.eye(len(methods))
for ix_first in tqdm_notebook(range(len(methods))):
    for ix_second in range(ix_first + 1, len(methods)):
        aucc = get_aucc_couple(methods[ix_first], methods[ix_second])
        concs_mat[ix_first, ix_second] = aucc
        concs_mat[ix_second, ix_first] = aucc

In [None]:
# import plotly.figure_factory as ff


# ff.create_dendrogram(X=concs_mat, labels=labels)

In [None]:
# X = np.random.rand(10, 1)
# names = ['Jack', 'Oxana', 'John', 'Chelsea', 'Mark', 'Alice', 'Charlie', 'Rob', 'Lisa', 'Lily']
# fig = ff.create_dendrogram(X, orientation='left', labels=names)
# fig.update_layout(width=800, height=800)
# fig.show()

In [None]:
import plotly.figure_factory as ff

# Annotated heatmap of the pairwise top-K concordance scores.
fig = ff.create_annotated_heatmap(concs_mat, x=labels, y=labels, colorscale='Viridis', showscale=True)
fig.show()

### Significance

In [None]:
# Full rankings (all genes, best first) instead of the top-K ones.
best_mf = np.argsort(-preds_mf)
best_iaf = np.argsort(-preds_iaf)
best_deseq2 = np.argsort(-preds_deseq2)
best_edger = np.argsort(-preds_edger)
best_mast = np.argsort(-preds_mast)

methods = [
    best_mf,
    best_iaf,
    best_deseq2,
    best_edger,
    best_mast,
]
# Display names, in the same order as `methods`; also used by the heatmap cell.
labels = [
    "MF",
    "IAF",
    "DESeq2",
    "EdgeR",
    "MAST",
]

# Symmetric matrix of pairwise scores over the whole ranking (k_val=n_genes),
# ones on the diagonal. Only the upper triangle is computed and mirrored.
# The loop indices are not named `idx_a` / `idx_b` so that they do not
# overwrite the sample-index arrays of the baseline cell.
concs_mat = np.eye(len(methods))
for ix_first in tqdm_notebook(range(len(methods))):
    for ix_second in range(ix_first + 1, len(methods)):
        aucc = get_aucc_couple(
            methods[ix_first], methods[ix_second], k_val=n_genes
        )
        concs_mat[ix_first, ix_second] = aucc
        concs_mat[ix_second, ix_first] = aucc

In [None]:
import plotly.figure_factory as ff

# Annotated heatmap of the pairwise full-ranking concordance scores.
fig = ff.create_annotated_heatmap(concs_mat, x=labels, y=labels, colorscale='Viridis', showscale=True)
fig.show()

# Debug

In [None]:
# Column overview of the microarray metadata (debug section).
dataset.de_metadata.info()

In [None]:
# Train one model of each kind directly, keeping the trainers so that their
# test set can be inspected in the following cells.
mf, mf_trainer = train_model(
    mdl_class=VAE,
    dataset=dataset,
    mdl_params=mdl_params["mf"],
    train_params=train_params["mf"],
    train_fn_params=train_fn_params["mf"],
)
iaf, iaf_trainer = train_model(
    mdl_class=IAVAE,
    dataset=dataset,
    mdl_params=mdl_params["iaf"],
    train_params=train_params["iaf"],
    train_fn_params=train_fn_params["iaf"]
)


In [None]:
# Latents, labels and 'scales' of the IAF model on its test set (100 samples).
# The cells below index `scales_iaf` as [sample, cell, gene].
z_iaf, labels_iaf, scales_iaf = iaf_trainer.test_set.get_latents(n_samples=100, other='scales', device="cpu")
from scvi.utils import plot_identity


In [None]:
# Cell-type names, to map label indices to populations.
dataset.cell_types

In [None]:
# Manual LFC between the first 20 test cells of each population, compared
# with the negated BDC2_logFC microarray values.
lfcs_gt = - microarray_info.BDC2_logFC[subset_genes]
where_a = np.where(labels_iaf == label_a)[0][:20]
where_b = np.where(labels_iaf == label_b)[0][:20]

scales_a = scales_iaf[:, where_a, :]
scales_b = scales_iaf[:, where_b, :]

# log2 ratio of the scales, averaged over samples (axis 0) and cells (axis 1).
lfc = np.log2(scales_a) - np.log2(scales_b)
lfc = lfc.mean((0, 1))
lfc = np.array(lfc)
# FIX: `plt.scatter(...)` and `plot_identity()` were fused on a single line,
# which is a syntax error.
plt.scatter(x=lfc, y=lfcs_gt)
plot_identity()
plt.show()

In [None]:
from scvi.utils import demultiply

In [None]:
# Number of test cells carrying label 2.
where_a = np.where(labels_iaf == 2)[0]
len(where_a)

In [None]:
# Number of test cells carrying label 3.
where_b = np.where(labels_iaf == 3)[0]
len(where_b)


In [None]:
# Manual LFC between labels 2 and 3 (up to 300 test cells each), compared
# with the negated CD_logFC microarray values.
lfcs_gt = - microarray_info.CD_logFC[subset_genes]
where_a = np.where(labels_iaf == 2)[0][:300]
where_b = np.where(labels_iaf == 3)[0][:300]

scales_a = scales_iaf[:, where_a, :]
scales_b = scales_iaf[:, where_b, :]

# `demultiply` comes from scvi.utils; its effect is not visible in this file.
scales_a, scales_b = demultiply(scales_a, scales_b, factor=2)

# log2 ratio of the scales, averaged over the first two axes.
lfc = np.log2(scales_a) - np.log2(scales_b)
lfc = lfc.mean((0, 1))
lfc = np.array(lfc)
plt.scatter(x=lfc, y=lfcs_gt)
plot_identity()

In [None]:
import statsmodels.api as sm

# Regress the manual LFC estimate on the ground truth (intercept + slope).
y = lfc
X = lfcs_gt
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

# Fitted line: ground truth on x, predicted estimate on y.
x_lin = np.linspace(-4, 4)
const = model.params.const
y_lin = model.params.CD_logFC * x_lin + const

# FIX: the scatter used to be drawn as (estimate, ground truth) while the
# fitted line lives in (ground truth, estimate) coordinates, so the line did
# not describe the plotted cloud. Both now use ground truth on the x axis.
# (The bare `model.summary()` in the middle of the cell displayed nothing and
# was dropped; the summary is shown by a dedicated cell below.)
plt.scatter(x=lfcs_gt, y=lfc)

plt.plot(x_lin, y_lin)
plot_identity()
plt.show()

In [None]:
# Fitted intercept and slope.
model.params

In [None]:
# Full regression report.
model.summary()

In [None]:
# Cell-type names, to read the `a=..., b=...` titles of the plots below.
dataset.cell_types

In [None]:
# Scan every ordered pair of labels and compare its manual LFC with the
# negated CD_logFC ground truth, one scatter plot per pair.
lfcs_gt = - microarray_info.CD_logFC[subset_genes]

# FIX: the loop variables used to be named `label_a` / `label_b`, which
# overwrote the notebook-level configuration of the same name; any cell
# re-run afterwards silently used the wrong populations.
for lbl_a in range(9):
    for lbl_b in range(9):
        if lbl_a == lbl_b:
            continue

        # First 20 test cells of each population.
        where_a = np.where(labels_iaf == lbl_a)[0][:20]
        where_b = np.where(labels_iaf == lbl_b)[0][:20]

        scales_a = scales_iaf[:, where_a, :]
        scales_b = scales_iaf[:, where_b, :]

        # log2 ratio of the scales, averaged over samples and cells.
        lfc = np.log2(scales_a) - np.log2(scales_b)
        lfc = lfc.mean((0, 1))
        lfc = np.array(lfc)
        plt.scatter(x=lfc, y=lfcs_gt)
        plt.title('a={}, b={}'.format(lbl_a, lbl_b))
        plot_identity()
        plt.show()

In [None]:
# Scan every ordered pair of labels and compare its manual LFC with the
# negated BDC2_logFC ground truth, one scatter plot per pair.
lfcs_gt = - microarray_info.BDC2_logFC[subset_genes]

# FIX: the loop variables used to be named `label_a` / `label_b`, which
# overwrote the notebook-level configuration of the same name; any cell
# re-run afterwards silently used the wrong populations.
for lbl_a in range(9):
    for lbl_b in range(9):
        if lbl_a == lbl_b:
            continue

        # First 20 test cells of each population.
        where_a = np.where(labels_iaf == lbl_a)[0][:20]
        where_b = np.where(labels_iaf == lbl_b)[0][:20]

        scales_a = scales_iaf[:, where_a, :]
        scales_b = scales_iaf[:, where_b, :]

        # log2 ratio of the scales, averaged over samples and cells.
        lfc = np.log2(scales_a) - np.log2(scales_b)
        lfc = lfc.mean((0, 1))
        lfc = np.array(lfc)
        plt.scatter(x=lfc, y=lfcs_gt)
        plt.title('a={}, b={}'.format(lbl_a, lbl_b))
        plot_identity()
        plt.show()