In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import plotly as py
import pandas as pd
from chart_studio.plotly import plot, iplot

# from plotly.offline import init_notebook_mode, iplot
from tqdm import tqdm_notebook

from scvi.dataset import PbmcDataset
from scvi.models import VAE, IAVAE
from scvi.inference import UnsupervisedTrainer
from scvi.utils import demultiply, make_dir_if_necessary, predict_de_genes
from scvi_utils import estimate_de_proba, estimate_lfc_density, estimate_lfc_mean
from R_interop import all_predictions


N_EPOCHS = 100
DELTA = 0.5
# SIZES = [5, 10, 20, 30, 50, 100]
SIZE = 100
SIZES = [SIZE]
N_SIZES = len(SIZES)

Q0 = 5e-2
N_TRAININGS = 1
N_PICKS = 10

np.random.seed(42)

DIR_PATH = 'lfc_estimates/pbmc'
make_dir_if_necessary(DIR_PATH)

# Import Dataset

In [None]:
DIR_PATH = "lfc_estimates/pbmc"
make_dir_if_necessary(DIR_PATH)

In [None]:
dataset = PbmcDataset()

unique_elements, counts_elements = np.unique(
    dataset.labels.squeeze(), return_counts=True
)

df = pd.DataFrame(dict(counts=counts_elements, cell_types=dataset.cell_types))
px.scatter(df, y="counts", x="cell_types")

n_genes = dataset.nb_genes

In [4]:
print(dataset.cell_types)

['B cells' 'CD14+ Monocytes' 'CD4 T cells' 'CD8 T cells' 'Dendritic Cells'
 'FCGR3A+ Monocytes' 'Megakaryocytes' 'NK cells' 'Other']


In [5]:
dataset.gene_names

array(['ENSG00000188976', 'ENSG00000187608', 'ENSG00000149527', ...,
       'ENSG00000160299', 'ENSG00000160305', 'ENSG00000160307'],
      dtype='<U64')

In [6]:
microarray_info = dataset.de_metadata.set_index('ENSG')
microarray_info = microarray_info.loc[dataset.gene_names]

In [7]:
display(dataset.de_metadata.head())
print(dataset.de_metadata.info())

Unnamed: 0.1,Unnamed: 0,ENSG,GS,CD_logFC,CD_AveExpr,CD_t,CD_P.Value,CD_adj.P.Val,CD_B,BDC_logFC,...,BDC_t,BDC_P.Value,BDC_adj.P.Val,BDC_B,BDC2_logFC,BDC2_AveExpr,BDC2_t,BDC2_P.Value,BDC2_adj.P.Val,BDC2_B
0,5,ENSG00000188976,NOC2L,0.248976,4.19323,1.308282,0.201059,0.478547,-5.465606,0.023141,...,0.199196,0.844018,0.936848,-6.964569,0.532702,5.985696,1.671825,0.116222,0.148453,-6.166109
1,7,ENSG00000187608,ISG15,-0.036463,5.769094,-0.21699,0.829737,0.919775,-6.27281,0.987879,...,3.449654,0.002391,0.019573,-2.095965,0.933572,6.303954,2.202136,0.044485,0.062851,-5.300666
2,36,ENSG00000149527,PLCH2,0.5056,3.225463,1.834575,0.076854,0.327638,-4.707168,-0.119499,...,-1.014781,0.321715,0.598441,-6.463128,-0.977612,4.232389,-3.119639,0.007349,0.012455,-3.569136
3,37,ENSG00000157881,PANK4,-0.093142,4.319701,-0.772409,0.446126,0.686981,-6.001137,0.024259,...,0.111785,0.912052,0.963969,-6.978668,0.183735,6.453772,1.306024,0.2121,0.252955,-6.668252
4,40,ENSG00000157873,TNFRSF14,0.073292,4.949553,0.353608,0.726192,0.865967,-6.233949,-0.377352,...,-2.465162,0.022361,0.106855,-4.215261,-0.075492,6.920026,-0.540324,0.597263,0.639609,-7.373536


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3346 entries, 0 to 3345
Data columns (total 21 columns):
Unnamed: 0        3346 non-null int64
ENSG              3346 non-null object
GS                3346 non-null object
CD_logFC          3346 non-null float64
CD_AveExpr        3346 non-null float64
CD_t              3346 non-null float64
CD_P.Value        3346 non-null float64
CD_adj.P.Val      3346 non-null float64
CD_B              3346 non-null float64
BDC_logFC         3346 non-null float64
BDC_AveExpr       3346 non-null float64
BDC_t             3346 non-null float64
BDC_P.Value       3346 non-null float64
BDC_adj.P.Val     3346 non-null float64
BDC_B             3346 non-null float64
BDC2_logFC        3346 non-null float64
BDC2_AveExpr      3346 non-null float64
BDC2_t            3346 non-null float64
BDC2_P.Value      3346 non-null float64
BDC2_adj.P.Val    3346 non-null float64
BDC2_B            3346 non-null float64
dtypes: float64(18), int64(1), object(2)
memory usage: 54

In [8]:
# n_examples = len(dataset)
# labels = dataset.labels.squeeze()
# interesting_indices = np.where((labels == 0) | (labels == 2))[0]
# TEST_INDICES = np.random.permutation(interesting_indices)[:1001]
TEST_INDICES = np.random.permutation(len(dataset))[:1500]

x_test, y_test = dataset.X[TEST_INDICES, :], dataset.labels[TEST_INDICES, :].squeeze()
data_path = os.path.join(DIR_PATH, 'data.npy')
labels_path = os.path.join(DIR_PATH, 'labels.npy')

np.save(
    data_path,
    np.array(x_test.todense()).squeeze().astype(int)
)
np.savetxt(
    labels_path,
    y_test.squeeze()
)

# Competitors


In [None]:
other_predictions = all_predictions(
    n_genes=n_genes, 
    n_picks=N_PICKS, 
    sizes=SIZES, 
    data_path=data_path, 
    labels_path=labels_path
)

# Experiments

## Microarray

In [9]:
label_a = 0
label_b = 4

In [30]:
# lfcs_mf = estimate_lfc_density(
#     VAE,
#     dataset=dataset,
#     mdl_params=dict(n_hidden=128, n_layers=1, n_latent=5),
#     train_params=dict(ratio_loss=True, test_indices=TEST_INDICES),
#     train_fn_params=dict(n_epochs=100, lr=1e-3),
#     sizes=[SIZE],
#     n_picks=1,
#     label_a=label_a,
#     label_b=label_b
# )[SIZE].squeeze()

lfcs_ia = estimate_lfc_density(
    IAVAE,
    dataset=dataset,
    mdl_params=dict(n_hidden=128, n_layers=1, do_h=True, n_latent=5, t=4),
    train_params=dict(ratio_loss=True, test_indices=TEST_INDICES),
    train_fn_params=dict(n_epochs=N_EPOCHS, lr=1e-3),
    sizes=[SIZE],
    n_picks=1,
    label_a=label_a,
    label_b=label_b
)[SIZE].squeeze()


EXPERIMENTAL: Posterior functionalities may not be working

INFO:scvi.models.iaf_encoder:Using Hidden State


training: 100%|██████████| 100/100 [03:09<00:00,  1.99s/it]
[8179.638957698171, 3031.582069955221, 1588.596713926734, 1514.8361920612615, 1497.5371212842988, 1481.3574620688835, 1472.2220086818788, 1469.0370007026486, 1459.5982204530296, 1454.2613808236472, 1447.9515544612234, 1445.908780725991, 1439.9637004573171, 1435.943389148247, 1431.1085904749427, 1428.8613906488185, 1428.6887445217226, 1423.503910715987, 1419.3284018911966, 1416.658759884718, 1413.6470694193026, 1415.9654704768484, 1407.8170850800304, 1404.8763353301258, 1403.4798956149962, 1402.227055247237, 1401.5968106897865, 1395.7811636575839, 1393.6676680402059, 1391.2130930830792, 1388.227311297161, 1391.8412981731135, 1385.971611209032, 1389.1702999952363, 1385.7031517959222, 1382.3956611447218, 1381.5522833103087, 1380.05598114758, 1379.2086032774391, 1374.2724296755907, 1374.0805723608994, 1372.9030985018103, 1370.449587938262, 1369.4242836556784, 1368.3873975800304, 1366.7373672113185, 1364.4972266220466, 1363.7375532

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [27]:
from sklearn.preprocessing import StandardScaler

new_n_genes = 500

std_scaler = StandardScaler(with_mean=False)
std_scaler.fit(dataset.X.astype(np.float64))
subset_genes = np.argsort(std_scaler.var_)[::-1][:new_n_genes]

In [37]:
print(lfcs_mf_est.shape)
print(lfcs_ia_est.shape)
print(lfcs_gt.shape)

(500, 3346)
(500, 3346)
(500,)


In [38]:
from plotly.subplots import make_subplots

lfcs_mf_est = lfcs_ia.reshape((-1, n_genes))[:, subset_genes]
lfcs_ia_est = lfcs_mf.reshape((-1, n_genes))[:, subset_genes]
lfcs_gt = - microarray_info.BDC_logFC[subset_genes]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Mean Field", "Inverse Autoregressive Flows"),
    shared_xaxes=True,
    shared_yaxes=True,
)


def add_plot(fig, lfcs_est_m, lfcs_est_err, row, col):
    fig.add_trace(
        go.Scatter(
            x=lfcs_gt,
            y=lfcs_est_m,
            error_y=dict(type="data", array=lfcs_est_err, visible=True),
            mode="markers",
        ),
        row=row,
        col=col,
    )
    return


add_plot(fig, lfcs_mf_est.mean(0), lfcs_mf_est.std(0), row=1, col=1)
fig.add_trace(
    go.Scatter(
        x=[-3, 3],
        y=[-3, 3],
        mode="lines",
        line=dict(color="black", width=4, dash="dash"),
    ),
    row=1,
    col=1,
)
add_plot(fig, lfcs_ia_est.mean(0), lfcs_ia_est.std(0), row=1, col=2)
fig.add_trace(
    go.Scatter(
        x=[-3, 3],
        y=[-3, 3],
        mode="lines",
        line=dict(color="black", width=4, dash="dash"),
    ),
    row=1,
    col=2,
)

fig.update_xaxes(title_text="Ground Truth LFC", row=1, col=1)
fig.update_xaxes(title_text="Ground Truth LFC", row=2, col=1)
fig.update_xaxes(title_text="Predicted LFC", row=1, col=1)
fig.update_xaxes(title_text="Predicted LFC", row=2, col=1)

fig.update_layout(
    height=600, width=1000, title_text="LFC estimation for {} sample cells".format(SIZE)
)
fig.show()

In [20]:
import statsmodels.api as sm
# 'BDC_logFC', y='lfc_MF'
y = microarray_info.lfc_MF
X = microarray_info.BDC_logFC

model = sm.OLS(y, X).fit()

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,lfc_MF,R-squared (uncentered):,0.498
Model:,OLS,Adj. R-squared (uncentered):,0.498
Method:,Least Squares,F-statistic:,3322.0
Date:,"Thu, 01 Aug 2019",Prob (F-statistic):,0.0
Time:,01:03:25,Log-Likelihood:,-4295.2
No. Observations:,3346,AIC:,8592.0
Df Residuals:,3345,BIC:,8599.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
BDC_logFC,1.0486,0.018,57.637,0.000,1.013,1.084

0,1,2,3
Omnibus:,314.86,Durbin-Watson:,1.856
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1812.619
Skew:,0.242,Prob(JB):,0.0
Kurtosis:,6.573,Cond. No.,1.0


## Overlap

Voronoi Graph

## Robustness

As Usual Graph