In [1]:
import glob
import pandas as pd
import numpy as np
import os
from IPython.display import HTML

In [2]:
!ls /home/pierre/scVI/results/models_comparison/oscar_runs/

brain_small			  cortex	    nb_dataset
corr_nb_dataset			  hemato	    pbmc
corr_zinb_dataset		  mixed_25_dataset  retina
corr_zinb_dataset_strong	  mixed_50_dataset  zinb_dataset
corr_zinb_dataset_strong_no_tune  mixed_75_dataset


In [3]:
data_path = '/home/pierre/scVI/results/models_comparison/oscar_runs'
dataset_name = 'hemato'
data_files = sorted(glob.glob(os.path.join(data_path, '{}/*.csv'.format(dataset_name))))
for data_file in data_files:
    print(data_file)
data_names = ['nb', 'zifa_full', 'zinb']

/home/pierre/scVI/results/models_comparison/oscar_runs/hemato/nb_hemato.csv
/home/pierre/scVI/results/models_comparison/oscar_runs/hemato/zifa_full_hemato.csv
/home/pierre/scVI/results/models_comparison/oscar_runs/hemato/zinb_hemato.csv


In [4]:
dfs = []
for data_name, f in zip(data_names, data_files):
    my_df = pd.read_csv(f, sep='\t')
    my_df.loc[:, 'model'] = data_name
    dfs.append(my_df)
df = pd.concat(dfs, axis=0)

In [5]:
metrics = ['ll_ll', 'imputation_median_imputation_score', 't_dropout_ks_stat', 't_ratio_ks_stat', 't_cv_ks_stat']
h1_hypothesis = ['larger', 'larger', 'larger', 'larger', 'larger']
h1_hypothesis_bis = ['smaller', 'smaller', 'smaller', 'smaller', 'smaller']

df_nb = df.loc[df.model=='nb', metrics]
df_zinb = df.loc[df.model=='zinb', metrics]


from statsmodels.stats.weightstats import ttest_ind

def get_pvals(gby, other_df):
    my_df = gby[metrics]
    assert my_df.shape[1] == len(metrics)
    pvals = []
    for idx, alternative in enumerate(h1_hypothesis):
        assert len(other_df.values[:, idx]) != len(h1_hypothesis), (len(other_df.values[:, idx]), len(h1_hypothesis))
        _, pval, _ = ttest_ind(other_df.values[:, idx], my_df.values[:, idx], alternative=alternative)
        pvals.append(pval)
    return np.array(pvals)

pvals_against_zinb = df.groupby('model').apply(get_pvals, other_df=df_zinb)
pvals_against_zinb = (pvals_against_zinb
         .apply(lambda x: pd.Series(x))
         .T)
pvals_against_zinb = pvals_against_zinb.rename(index={idx: met for (idx,met) in enumerate(metrics)})

pvals_against_nb = df.groupby('model').apply(get_pvals, other_df=df_nb)
pvals_against_nb = (pvals_against_nb
         .apply(lambda x: pd.Series(x))
         .T)
pvals_against_nb = pvals_against_nb.rename(index={idx: met for (idx,met) in enumerate(metrics)})

In [6]:
a = df.loc[df.model=='zifa_full', 'imputation_median_imputation_score']
b = df.loc[df.model=='zinb', 'imputation_median_imputation_score']
print(a.mean(), b.mean())
ttest_ind(a, b, alternative='smaller')

0.7520176231861114 0.7392812728881836


(1.2821455294853252, 0.8919702261801644, 18.0)

In [7]:
pvals_against_zinb

model,nb,zifa_full,zinb
ll_ll,5.763735e-18,1.008908e-18,0.5
imputation_median_imputation_score,0.9552212,0.8919702,0.5
t_dropout_ks_stat,0.1481071,0.02466888,0.5
t_ratio_ks_stat,0.4384888,0.09887299,0.5
t_cv_ks_stat,9.6332e-06,7.413363e-06,0.5


In [10]:
def get_summary(gby):
    res = {}
    res['mean'] = gby.mean()
    res['std'] = gby.std()
    res['pvals_against_nb'] = pvals_against_nb[gby.name]
    res['pvals_against_zi'] = pvals_against_zinb[gby.name]
    return pd.DataFrame(res).T

# df_summary = df.groupby('model')['ll_ll', 'imputation_median_imputation_score', 't_dropout_ks_stat',
#                                  't_ratio_ks_stat', 't_cv_ks_stat'].agg(['mean', 'std']).T
df_summary = df.groupby('model')[metrics].apply(get_summary)
df_summary


def my_styler(val):
    """
    Takes a scalar and returns a string with
    the css property `'color: red'` for negative
    strings, black otherwise.
    """
#     print(val)
    style = ['background-color: yellow' if v < 0.05 else '' for v in val]
    return style

df_summary = df_summary.stack().unstack(1).sort_index(level=1).swaplevel()
style = df_summary.style.apply(my_styler, subset=['pvals_against_nb', 'pvals_against_zi'], axis=1)
style
# df_summary.loc[pd.IndexSlice[pd.IndexSlice[:, 'pvals'], :]].style.background_gradient()
# df_summary.style.apply(my_styler, subset=pd.IndexSlice[:, 'pvals'], axis=0)
# df_summary = df_summary.style.apply(my_styler, subset=pd.IndexSlice[pd.IndexSlice[:, 'pvals'], :], axis=1).render()
# df_summary.pivot(index=['model', ['mean', 'std', 'pvals']], columns=metrics)
# style

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,pvals_against_nb,pvals_against_zi
Unnamed: 0_level_1,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
imputation_median_imputation_score,nb,0.756274,0.0206598,0.5,0.955221
imputation_median_imputation_score,zifa_full,0.752018,0.0227349,0.333255,0.89197
imputation_median_imputation_score,zinb,0.739281,0.021677,0.0447788,0.5
ll_ll,nb,905.122,3.99411,0.5,5.76373e-18
ll_ll,zifa_full,904.77,2.00317,0.40328,1.00891e-18
ll_ll,zinb,993.396,7.32048,1.0,0.5
t_cv_ks_stat,nb,0.20475,0.0993521,0.5,9.6332e-06
t_cv_ks_stat,zifa_full,0.262333,0.0633221,0.930198,7.41336e-06
t_cv_ks_stat,zinb,0.39475,0.0329376,0.99999,0.5
t_dropout_ks_stat,nb,0.17625,0.12616,0.5,0.148107


In [9]:
# save as png
import imgkit
imgkit.from_string(style, os.path.join(data_path, '{}.png'.format(dataset_name)))

TypeError: expected string or bytes-like object

In [None]:
# save as latex table

In [None]:
text_file = open(os.path.join(data_path, 'latex_table.txt'), "w")
text_file.write(df_summary.to_latex())
text_file.close()