# Differential Analysis - Compare model imputation with standard imputation

- load real NA predictions
- leave all other values as they were
- compare the model's predictions for real NAs with the standard method (draws from a shifted normal distribution)

In [None]:
from pathlib import Path
from collections import namedtuple

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import pingouin as pg

import statsmodels.stats.multitest


import vaep
import vaep.analyzers
import vaep.io.datasplits
import vaep.imputation
import vaep.stats

import vaep.nb

# Notebook-wide logger, obtained from vaep's `setup_nb_logger` helper.
logger = vaep.logging.setup_nb_logger()

In [None]:
# catch passed parameters
# Bind `args` first so that the name `args` itself is contained in the
# snapshot of global names taken on the next line.
args = None
# Snapshot (copy) of the global names that exist before the parameter cell.
# NOTE(review): presumably `vaep.nb.get_params` uses this to tell the
# parameters apart from pre-existing globals -- confirm in its implementation.
args = dict(globals()).keys()

## Parameters

In [None]:
# Experiment folder; result files of this notebook are written below it.
folder_experiment = "runs/appl_ald_data/proteinGroups"
folder_data: str = ''  # specify data directory if needed
fn_rawfile_metadata = "data/single_datasets/raw_meta.csv"
# CSV file with the clinical data (read with its first column as index).
fn_clinical_data = "data/single_datasets/ald_metadata_cli.csv"
# Passed as `target` to the differential analysis.
target: str = 'kleiner'
# Comma-separated covariates; split into a list when the config is built.
covar:str = 'age,bmi,gender_num,nas_steatosis_ordinal'

# File format handed to `DataSplits.from_folder`.
file_format = "pkl"
# Model identifier; used in the prediction file name and in output names.
model_key = 'vae'
# Passed as `value_name` to the differential analysis.
value_name='intensity'

In [None]:
# Gather the notebook parameters through vaep's helper, handing over the
# snapshot of global names taken before the parameter cell.
params = vaep.nb.get_params(
    args,
    globals=globals(),
    remove=True,
)
params

In [None]:
# Build the run configuration from the collected parameters.
args = vaep.nb.Config()
# File and folder parameters are stored as `Path` objects.
args.fn_rawfile_metadata = Path(params["fn_rawfile_metadata"])
args.fn_clinical_data = Path(params["fn_clinical_data"])
args.folder_experiment = Path(params["folder_experiment"])
# NOTE(review): presumably adds the derived paths used further down
# (`args.data`, `args.out_preds`, `args.out_figures`) -- confirm in
# `vaep.nb.add_default_paths`.
args = vaep.nb.add_default_paths(args, folder_data=params["folder_data"])
# Comma-separated covariate string -> list of covariate names.
args.covar = params["covar"].split(',')
# Copy the parameters onto the config.
# NOTE(review): the attributes assigned above are expected to keep their
# converted values -- confirm `update_from_dict` does not overwrite them.
args.update_from_dict(params)
args

# Data

## MS proteomics

In [None]:
# Load the data splits stored in the folder `args.data`.
data = vaep.io.datasplits.DataSplits.from_folder(args.data,
                                                 file_format=args.file_format)

In [None]:
# Combine the training, validation and test splits into one object holding
# all observed values.
observed = pd.concat(
    objs=[
        data.train_X,
        data.val_y,
        data.test_y,
    ]
)
observed

## Clinical data

In [None]:
# covar = ['age', 'bmi', 'gender_num', 'abstinent_num', 'nas_steatosis_ordinal']
# covar_steatosis = ['age', 'bmi', 'gender_num', 'abstinent_num', 'kleiner', 'nas_inflam']

In [None]:
# Read the clinical table and keep the rows labelled by the first index
# level of the observed data.
df_clinic = (
    pd.read_csv(args.fn_clinical_data, index_col=0)
    .loc[observed.index.levels[0]]
)
# Accessor object for the clinical column names.
cols_clinic = vaep.pandas.get_columns_accessor(df_clinic)
df_clinic.describe()

## ALD study approach using all measurements

In [None]:
# Thresholds for selecting the data used in the ALD-study style analysis.
DATA_COMPLETENESS: float = 0.6
# NOTE(review): not passed to `select_raw_data` in this cell -- confirm
# whether this constant is still needed.
MIN_N_PROTEIN_GROUPS: int = 200
# A fraction, therefore annotated as float (was wrongly annotated as int).
FRAC_PROTEIN_GROUPS: float = 0.622

# Select from the unstacked observed data using the thresholds above;
# returns the selected data together with the applied cutoffs.
ald_study, cutoffs = vaep.analyzers.diff_analysis.select_raw_data(
    observed.unstack(),
    data_completeness=DATA_COMPLETENESS,
    frac_protein_groups=FRAC_PROTEIN_GROUPS,
)

ald_study

In [None]:
# Count the non-missing entries per column of the unstacked observed data
# and store the counts in the experiment folder.
freq_feat = observed.unstack().notna().sum().rename('frequency')
fname = args.folder_experiment / 'freq_features_observed.csv'
logger.info(fname)
freq_feat.to_csv(fname)
freq_feat

In [None]:
# Plot the unstacked observed data together with the selection cutoffs.
vaep.plotting.plot_cutoffs(
    observed.unstack(),
    feat_completness_over_samples=cutoffs.feat_completness_over_samples,
    min_feat_in_sample=cutoffs.min_feat_in_sample,
)

In [None]:
# Standard method: impute the selected data with vaep's shifted-normal helper.
pred_real_na_imputed_normal = vaep.imputation.impute_shifted_normal(ald_study)

## load model predictions for (real) missing data

In [None]:
# Show the entries of the prediction folder.
[*args.out_preds.iterdir()]

In [None]:
# File name pattern of the stored predictions; the placeholder is filled
# with the model key.
template = 'pred_real_na_{}.csv'
fname = args.out_preds.joinpath(template.format(args.model_key))
fname

In [None]:
# Load the model predictions from the single csv file selected above.
pred_real_na = (
    vaep.analyzers.compare_predictions.load_single_csv_pred_file(fname)
)
pred_real_na.sample(3)

In [None]:
# Three stacked histograms on a shared x-axis:
# observed values (top), model predictions (middle) and the
# shifted-normal imputation (bottom).
fig, axes = plt.subplots(3, figsize=(10, 15), sharex=True)
ax = axes[1]
ax = pred_real_na.hist(ax=ax)
ax.set_title(f'real na imputed using {args.model_key}')
ax.set_ylabel('count measurments')

ax = axes[0]
ax = observed.hist(ax=ax)
ax.set_title('observed measurments')
ax.set_ylabel('count measurments')

ax = axes[2]
ax = pred_real_na_imputed_normal.hist(ax=ax)
# Plain string: the title has no placeholders, so no f-prefix is needed.
ax.set_title('real na imputed using shifted normal distribution')
ax.set_ylabel('count measurments')

# Save the figure into the figures folder, named after the model key.
vaep.savefig(fig, name=f'real_na_obs_vs_default_vs_{args.model_key}', folder=args.out_figures)

# Differential analysis

## Model imputation

In [None]:
# Observed values plus the model predictions, unstacked into a table.
df = pd.concat(
    objs=[observed, pred_real_na]
).unstack()
df

In [None]:
# After adding the predictions, no entry of the table may be missing.
assert not df.isna().any().any(), "DataFrame has missing entries"

Targets - Clinical variables

In [None]:
# Differential analysis on the model-imputed data.
# `value_name` is read from `args` like the other parameters (`target`,
# `covar`, `model_key`): the parameters were collected with
# `get_params(..., remove=True)`, so the bare global may no longer exist.
scores = vaep.stats.diff_analysis.analyze(
    df_proteomics=df,
    df_clinic=df_clinic,
    target=args.target,
    covar=args.covar,
    value_name=args.value_name,
)

# Put the model key as first column level so that the results of several
# imputation methods can be joined side by side.
scores.columns = pd.MultiIndex.from_product([[args.model_key], scores.columns],
                                            names=('model', 'var'))
scores

## Shifted normal distribution

In [None]:
# Selected data (stacked) plus the shifted-normal imputations, unstacked
# into a table.
df = pd.concat(
    objs=[ald_study.stack(), pred_real_na_imputed_normal]
).unstack()
df

In [None]:
# Differential analysis on the data imputed with the shifted normal
# distribution.
# `value_name` is read from `args` like the other parameters (`target`,
# `covar`): the parameters were collected with
# `get_params(..., remove=True)`, so the bare global may no longer exist.
_scores = vaep.stats.diff_analysis.analyze(
    df_proteomics=df,
    df_clinic=df_clinic,
    target=args.target,
    covar=args.covar,
    value_name=args.value_name,
)
# Label the results with the imputation method as first column level.
_scores.columns = pd.MultiIndex.from_product([['random shifted_imputation'], _scores.columns],
                                            names=('model', 'var'))
_scores

# Combine scores

In [None]:
# Place the results of both imputation approaches side by side.
scores = scores.join(other=_scores)

In [None]:
# Summary statistics of the combined scores.
scores.describe()

In [None]:
# Store the combined scores as a pickle file in the experiment folder.
fname = args.folder_experiment.joinpath(
    f'diff_analysis_scores_{args.model_key}.pkl'
)
scores.to_pickle(fname)
fname