# Compare predictions between model and RSN

- see differences in imputation for diverging cases
- dumps top5

In [1]:
import logging
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import njab
import pandas as pd
import seaborn

import pimmslearn
import pimmslearn.analyzers
import pimmslearn.imputation
import pimmslearn.io.datasplits

# set up the notebook logger via the pimmslearn helper and only show
# warnings (or worse) emitted by the 'fontTools' logger
logger = pimmslearn.logging.setup_nb_logger()
logging.getLogger('fontTools').setLevel(logging.WARNING)

# default figure size (width, height) for all figures of this notebook
plt.rcParams['figure.figsize'] = [4, 2.5]  # [16.0, 7.0] , [4, 3]
pimmslearn.plotting.make_large_descriptors(7)

# catch passed parameters
# `args` is bound to None first so that the name 'args' itself is part of the
# snapshot of global names taken on the next line -- do not remove it.
# NOTE(review): the snapshot is later handed to `pimmslearn.nb.get_params`,
# presumably to tell notebook parameters apart from pre-existing globals -- confirm.
args = None
args = dict(globals()).keys()

## Parameters

In [2]:
# Default parameters. The following (injected) parameters cell re-assigns
# some of these names and therefore takes precedence.
folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups'  # root folder used to build input and output paths
fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv"  # csv file from which the target column is read
make_plots = True  # create histograms and swarmplots of diverging results
model_key = 'VAE'
sample_id_col = 'Sample ID'  # column of the clinical data identifying the samples
target = 'kleiner'  # column of the clinical data holding the target variable
cutoff_target: int = 2  # => for binarization target >= cutoff_target
out_folder = 'diff_analysis'  # sub-folder of folder_experiment for the outputs
file_format = 'csv'  # file format passed on when loading the data splits
baseline = 'RSN'  # default is RSN, but could be any other trained model
template_pred = 'pred_real_na_{}.csv'  # fixed, do not change
ref_method_score = None  # filepath to reference method score

In [3]:
# Parameters
# These assignments override the defaults set in the previous cell.
cutoff_target = 0.5
make_plots = False
ref_method_score = None
folder_experiment = "runs/alzheimer_study"
target = "AD"
baseline = "PI"
out_folder = "diff_analysis"
fn_clinical_data = "runs/alzheimer_study/data/clinical_data.csv"


In [4]:
# Collect the notebook parameters and derive all output locations from them.
params = pimmslearn.nb.get_params(args, globals=globals())

args = pimmslearn.nb.Config()
args.folder_experiment = Path(params["folder_experiment"])

# all results of this notebook live below <experiment>/<out_folder>/<target>
diff_analysis_root = (args.folder_experiment
                      / params["out_folder"]
                      / params["target"])
args = pimmslearn.nb.add_default_paths(args, out_root=diff_analysis_root)

# folder with the score dumps, one file per method
args.folder_scores = (args.folder_experiment
                      / params["out_folder"]
                      / params["target"]
                      / 'scores')

# add the remaining parameters to the configuration
args.update_from_dict(params)
args

root - INFO     Removed from global namespace: folder_experiment


root - INFO     Removed from global namespace: fn_clinical_data


root - INFO     Removed from global namespace: make_plots


root - INFO     Removed from global namespace: model_key


root - INFO     Removed from global namespace: sample_id_col


root - INFO     Removed from global namespace: target


root - INFO     Removed from global namespace: cutoff_target


root - INFO     Removed from global namespace: out_folder


root - INFO     Removed from global namespace: file_format


root - INFO     Removed from global namespace: baseline


root - INFO     Removed from global namespace: template_pred


root - INFO     Removed from global namespace: ref_method_score


root - INFO     Already set attribute: folder_experiment has value runs/alzheimer_study


root - INFO     Already set attribute: out_folder has value diff_analysis


{'baseline': 'PI',
 'cutoff_target': 0.5,
 'data': PosixPath('runs/alzheimer_study/data'),
 'file_format': 'csv',
 'fn_clinical_data': 'runs/alzheimer_study/data/clinical_data.csv',
 'folder_experiment': PosixPath('runs/alzheimer_study'),
 'folder_scores': PosixPath('runs/alzheimer_study/diff_analysis/AD/scores'),
 'make_plots': False,
 'model_key': 'VAE',
 'out_figures': PosixPath('runs/alzheimer_study/figures'),
 'out_folder': PosixPath('runs/alzheimer_study/diff_analysis/AD'),
 'out_metrics': PosixPath('runs/alzheimer_study'),
 'out_models': PosixPath('runs/alzheimer_study'),
 'out_preds': PosixPath('runs/alzheimer_study/preds'),
 'ref_method_score': None,
 'sample_id_col': 'Sample ID',
 'target': 'AD',
 'template_pred': 'pred_real_na_{}.csv'}

Write outputs to excel

In [5]:
# registry of the files written by this notebook: file name -> posix path
files_out = {}

# the excel writer stays open across the following cells and is closed
# once the last sheet was written
fname = args.out_folder / 'diff_analysis_compare_DA.xlsx'
writer = pd.ExcelWriter(fname)
files_out.update({fname.name: fname.as_posix()})
logger.info("Writing to excel file: %s", fname)

root - INFO     Writing to excel file: runs/alzheimer_study/diff_analysis/AD/diff_analysis_compare_DA.xlsx


## Load scores
List dump of scores:

In [6]:
# keep only the pickled score dumps found in the scores folder
score_dumps = list(
    filter(lambda path: path.suffix == '.pkl',
           Path(args.folder_scores).iterdir()))
score_dumps

[PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_RF.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_DAE.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_TRKNN.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_QRILC.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_PI.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_None.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_VAE.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_Median.pkl'),
 PosixPath('runs/alzheimer_study/diff_analysis/AD/scores/diff_analysis_scores_CF.pkl')]

Load scores from dumps:

In [7]:
# one table of scores per dump, combined side by side (column-wise)
# NOTE: unpickling executes arbitrary code -- only load dumps you created yourself
scores_per_dump = list(map(pd.read_pickle, score_dumps))
scores = pd.concat(scores_per_dump, axis=1)
scores

Unnamed: 0_level_0,model,RF,RF,RF,RF,RF,RF,RF,RF,DAE,DAE,...,Median,Median,CF,CF,CF,CF,CF,CF,CF,CF
Unnamed: 0_level_1,var,SS,DF,F,p-unc,np2,-Log10 pvalue,qvalue,rejected,SS,DF,...,qvalue,rejected,SS,DF,F,p-unc,np2,-Log10 pvalue,qvalue,rejected
protein groups,Source,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
A0A024QZX5;A0A087X1N8;P35237,AD,1.041,1,7.713,0.006,0.039,2.220,0.018,True,1.063,1,...,0.039,True,1.149,1,8.060,0.005,0.040,2.300,0.015,True
A0A024QZX5;A0A087X1N8;P35237,age,0.002,1,0.014,0.906,0.000,0.043,0.940,False,0.004,1,...,0.966,False,0.009,1,0.066,0.797,0.000,0.099,0.862,False
A0A024QZX5;A0A087X1N8;P35237,Kiel,0.224,1,1.656,0.200,0.009,0.700,0.323,False,0.272,1,...,0.532,False,0.317,1,2.224,0.138,0.012,0.862,0.234,False
A0A024QZX5;A0A087X1N8;P35237,Magdeburg,0.429,1,3.180,0.076,0.016,1.118,0.150,False,0.500,1,...,0.343,False,0.508,1,3.565,0.061,0.018,1.218,0.121,False
A0A024QZX5;A0A087X1N8;P35237,Sweden,1.598,1,11.833,0.001,0.058,3.146,0.003,True,1.751,1,...,0.016,True,1.889,1,13.254,0.000,0.065,3.456,0.001,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S4R3U6,AD,1.218,1,2.663,0.104,0.014,0.982,0.193,False,4.195,1,...,0.829,False,2.594,1,4.930,0.028,0.025,1.560,0.064,False
S4R3U6,age,1.004,1,2.194,0.140,0.011,0.853,0.244,False,0.209,1,...,0.194,False,0.575,1,1.092,0.297,0.006,0.527,0.425,False
S4R3U6,Kiel,1.706,1,3.729,0.055,0.019,1.260,0.115,False,4.417,1,...,0.289,False,2.757,1,5.240,0.023,0.027,1.635,0.055,False
S4R3U6,Magdeburg,1.378,1,3.013,0.084,0.016,1.075,0.163,False,5.004,1,...,0.631,False,3.702,1,7.036,0.009,0.036,2.062,0.024,True


If reference dump is provided, add it to the scores

In [8]:
if args.ref_method_score:
    # relabel the 'None' column key of the reference scores so that it can be
    # told apart from the 'None' scores loaded above
    scores_reference = pd.read_pickle(args.ref_method_score)
    scores_reference = scores_reference.rename({'None': 'None (100%)'}, axis=1)
    scores = scores.join(scores_reference)
    logger.info(f'Added reference method scores from {args.ref_method_score}')

### Load frequencies of observed features

In [9]:
# frequency of each feature, stored in the experiment folder
fname = args.folder_experiment / 'freq_features_observed.csv'
freq_feat = pd.read_csv(fname, index_col=0)
# use two-level column labels so that the frame can be joined to the scores
freq_feat.columns = pd.MultiIndex.from_arrays([['data'], ['frequency']])
freq_feat

Unnamed: 0_level_0,data
Unnamed: 0_level_1,frequency
protein groups,Unnamed: 1_level_2
A0A024QZX5;A0A087X1N8;P35237,186
A0A024R0T9;K7ER74;P02655,195
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8,174
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503,196
A0A075B6H7,91
...,...
Q9Y6R7,197
Q9Y6X5,173
Q9Y6Y8;Q9Y6Y8-2,197
Q9Y6Y9,119


### Assemble qvalues

In [10]:
# q-values of the target variable for all methods; the feature frequency is
# appended as an additional index level named 'frequency'
qvalues = (scores
           .loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'qvalue']]
           .join(freq_feat)
           .set_index(('data', 'frequency'), append=True))
qvalues.index = qvalues.index.set_names('frequency', level=-1)

# dump as pickle and add as a sheet to the excel file
fname = args.out_folder / 'qvalues_target.pkl'
files_out[fname.name] = fname.as_posix()
qvalues.to_pickle(fname)
qvalues.to_excel(writer, sheet_name='qvalues_all')
qvalues

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,RF,DAE,TRKNN,QRILC,PI,None,VAE,Median,CF
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,qvalue,qvalue,qvalue,qvalue,qvalue,qvalue,qvalue,qvalue,qvalue
protein groups,Source,frequency,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
A0A024QZX5;A0A087X1N8;P35237,AD,186,0.018,0.017,0.023,0.078,0.428,0.043,0.022,0.039,0.015
A0A024R0T9;K7ER74;P02655,AD,195,0.075,0.078,0.071,0.081,0.109,0.092,0.068,0.087,0.085
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8,AD,174,0.551,0.442,0.394,0.477,0.268,0.586,0.397,0.832,0.548
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503,AD,196,0.398,0.372,0.396,0.454,0.692,0.404,0.375,0.418,0.363
A0A075B6H7,AD,91,0.009,0.029,0.048,0.078,0.400,0.027,0.018,0.124,0.012
...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6R7,AD,197,0.292,0.283,0.289,0.304,0.317,0.307,0.283,0.315,0.283
Q9Y6X5,AD,173,0.306,0.353,0.205,0.137,0.217,0.501,0.344,0.455,0.229
Q9Y6Y8;Q9Y6Y8-2,AD,197,0.162,0.157,0.160,0.172,0.184,0.174,0.157,0.178,0.157
Q9Y6Y9,AD,119,0.517,0.967,0.472,0.857,0.857,0.651,0.690,0.667,0.779


### Assemble pvalues

In [11]:
# uncorrected p-values of the target variable for all methods; the feature
# frequency is appended as an additional index level named 'frequency'
pvalues = (scores
           .loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'p-unc']]
           .join(freq_feat)
           .set_index(('data', 'frequency'), append=True))
pvalues.index = pvalues.index.set_names('frequency', level=-1)

# dump as pickle and add as a sheet to the excel file
fname = args.out_folder / 'pvalues_target.pkl'
files_out[fname.name] = fname.as_posix()
pvalues.to_pickle(fname)
pvalues.to_excel(writer, sheet_name='pvalues_all')
pvalues

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,RF,DAE,TRKNN,QRILC,PI,None,VAE,Median,CF
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,p-unc,p-unc,p-unc,p-unc,p-unc,p-unc,p-unc,p-unc,p-unc
protein groups,Source,frequency,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
A0A024QZX5;A0A087X1N8;P35237,AD,186,0.006,0.006,0.008,0.031,0.268,0.015,0.008,0.012,0.005
A0A024R0T9;K7ER74;P02655,AD,195,0.032,0.035,0.031,0.032,0.043,0.037,0.030,0.033,0.039
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8,AD,174,0.417,0.312,0.264,0.327,0.138,0.432,0.269,0.736,0.420
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503,AD,196,0.263,0.249,0.266,0.304,0.547,0.254,0.250,0.259,0.241
A0A075B6H7,AD,91,0.003,0.011,0.020,0.031,0.243,0.008,0.006,0.053,0.004
...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6R7,AD,197,0.175,0.175,0.175,0.175,0.175,0.175,0.175,0.175,0.175
Q9Y6X5,AD,173,0.185,0.233,0.113,0.063,0.104,0.344,0.224,0.291,0.133
Q9Y6Y8;Q9Y6Y8-2,AD,197,0.083,0.083,0.083,0.083,0.083,0.083,0.083,0.083,0.083
Q9Y6Y9,AD,119,0.381,0.953,0.334,0.782,0.766,0.505,0.576,0.520,0.690


### Assemble rejected features

In [12]:
# decisions ('rejected') for the target variable for all methods; the feature
# frequency is appended as an additional index level named 'frequency'
da_target = (scores
             .loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'rejected']]
             .join(freq_feat)
             .set_index(('data', 'frequency'), append=True))
da_target.index = da_target.index.set_names('frequency', level=-1)

fname = args.out_folder / 'equality_rejected_target.pkl'
files_out[fname.name] = fname.as_posix()
da_target.to_pickle(fname)

# count the decisions per method (the 'rejected' column level is dropped)
decisions_by_method = da_target.droplevel(-1, axis=1)
count_rejected = njab.pandas.combine_value_counts(decisions_by_method)
count_rejected.to_excel(writer, sheet_name='count_rejected')
count_rejected

Unnamed: 0,RF,DAE,TRKNN,QRILC,PI,None,VAE,Median,CF
False,973,937,936,994,1029,1054,943,1069,945
True,448,484,485,427,392,367,478,352,476


### Tabulate rejected decisions by method:

In [13]:
# ! A feature counts as "common" if every method provides a decision for it.
# ! This uses implicitly that RSN is not available for some protein groups.
# ! TODO: make an explicit list of the protein groups available in the
# !       original data (313 in the original study).
mask_common = ~da_target.isna().any(axis=1)
decisions_common = da_target.loc[mask_common].droplevel(-1, axis=1)
count_rejected_common = njab.pandas.combine_value_counts(decisions_common)
count_rejected_common.to_excel(writer, sheet_name='count_rejected_common')
count_rejected_common

Unnamed: 0,RF,DAE,TRKNN,QRILC,PI,None,VAE,Median,CF
False,973,937,936,994,1029,1054,943,1069,945
True,448,484,485,427,392,367,478,352,476


### Tabulate rejected decisions by method for newly included features (if available)

In [14]:
# features lacking a decision for at least one method (complement of mask_common)
decisions_new = da_target.loc[~mask_common].droplevel(-1, axis=1)
count_rejected_new = njab.pandas.combine_value_counts(decisions_new)
count_rejected_new.to_excel(writer, sheet_name='count_rejected_new')
count_rejected_new

Unnamed: 0,RF,DAE,TRKNN,QRILC,PI,None,VAE,Median,CF


### Tabulate rejected decisions by method for all features

In [15]:
# decisions of all methods for all features go to their own sheet
da_target.to_excel(
    writer,
    sheet_name='equality_rejected_all',
)
logger.info("Written to sheet 'equality_rejected_all' in excel file.")
da_target

root - INFO     Written to sheet 'equality_rejected_all' in excel file.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,RF,DAE,TRKNN,QRILC,PI,None,VAE,Median,CF
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejected,rejected,rejected,rejected,rejected,rejected,rejected,rejected,rejected
protein groups,Source,frequency,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
A0A024QZX5;A0A087X1N8;P35237,AD,186,True,True,True,False,False,True,True,True,True
A0A024R0T9;K7ER74;P02655,AD,195,False,False,False,False,False,False,False,False,False
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8,AD,174,False,False,False,False,False,False,False,False,False
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503,AD,196,False,False,False,False,False,False,False,False,False
A0A075B6H7,AD,91,True,True,True,False,False,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6R7,AD,197,False,False,False,False,False,False,False,False,False
Q9Y6X5,AD,173,False,False,False,False,False,False,False,False,False
Q9Y6Y8;Q9Y6Y8-2,AD,197,False,False,False,False,False,False,False,False,False
Q9Y6Y9,AD,119,False,False,False,False,False,False,False,False,False


Tabulate the number of features with the same decision across all methods (`True`)
versus those with a decision that varies by method (`False`)

In [16]:
# a decision is the "same" if no method rejects or if all methods reject
rejected_by_none = da_target.sum(axis=1) == 0
rejected_by_all = da_target.all(axis=1)
da_target_same = rejected_by_none | rejected_by_all
da_target_same.value_counts()

True    1,097
False     324
Name: count, dtype: int64

List frequency of features with varying decisions

In [17]:
# index of the features for which the methods disagree
feat_idx_w_diff = da_target_same.loc[~da_target_same].index
# show the frequency only; the duplicated 'frequency' index level is dropped
feat_idx_w_diff.to_frame()[['frequency']].droplevel(-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency
protein groups,Source,Unnamed: 2_level_1
A0A024QZX5;A0A087X1N8;P35237,AD,186
A0A075B6H7,AD,91
A0A075B6I0,AD,194
A0A075B6J9,AD,156
A0A075B6Q5,AD,104
...,...,...
Q9UPU3,AD,163
Q9UQ52,AD,188
Q9Y281;Q9Y281-3,AD,51
Q9Y6C2,AD,119


take only those with different decisions

In [18]:
# q-values of the features with diverging decisions; every sheet is sorted by
# the q-value of the 'None' column.
qvalues_diff = qvalues.loc[feat_idx_w_diff]
sort_key = ('None', 'qvalue')

# The writer is closed in any case, so that the sheets written so far are
# saved and the file handle is released even if one of the writes fails.
try:
    qvalues_diff.sort_values(sort_key).to_excel(writer, sheet_name='qvalues_diff')

    (qvalues_diff
     .loc[mask_common]  # mask automatically aligned
     .sort_values(sort_key)
     .to_excel(writer, sheet_name='qvalues_diff_common')
     )

    # Check explicitly for an empty selection instead of relying on the
    # IndexError raised when writing an empty frame to excel.
    qvalues_diff_new = qvalues_diff.loc[~mask_common]
    if qvalues_diff_new.empty:
        print("No new features or no new ones (with diverging decisions.)")
    else:
        (qvalues_diff_new
         .sort_values(sort_key)
         .to_excel(writer, sheet_name='qvalues_diff_new')
         )
finally:
    writer.close()

No new features or no new ones (with diverging decisions.)


## Plots for inspecting imputations (for diverging decisions)

In [19]:
# Stop the execution of the notebook here if no plots were requested. All
# tables were already written by the cells above.
if not args.make_plots:
    logger.warning("No plots requested.")
    import sys  # local import: only needed for the early exit
    sys.exit(0)



SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Load target

In [None]:
# Load the target column of the clinical data, indexed by the sample ID.
# The index column is selected by name: a positional `index_col=0` refers to
# the first of the *selected* columns in file order, which would be the target
# column whenever it precedes the sample ID column in the file.
target = pd.read_csv(args.fn_clinical_data,
                     index_col=args.sample_id_col,
                     usecols=[args.sample_id_col, args.target])
# samples without target information cannot be assigned to a group
target = target.dropna()
target

In [None]:
# keep the original target values for the cross-tabulation below
target_to_group = target.copy()

# binarize the target at the cutoff and use readable group labels
group_labels = {False: f'{args.target} < {args.cutoff_target}',
                True: f'{args.target} >= {args.cutoff_target}'}
target = target >= args.cutoff_target
target = target.replace(group_labels).astype('category')
pd.crosstab(target.squeeze(), target_to_group.squeeze())

## Measurements

In [None]:
# load the data splits and combine them into one table (via `unstack`)
data = pimmslearn.io.datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)
splits_to_combine = [data.train_X, data.val_y, data.test_y]
data = pd.concat(splits_to_combine).unstack()
data

Plot all new protein groups (pgs) that are significant for at least one method and are not already dumped.

In [None]:
# features outside the common set which are rejected by at least one method
feat_new_abundant = da_target.loc[~mask_common].any(axis=1)
feat_new_abundant = (feat_new_abundant[feat_new_abundant]
                     .index
                     .get_level_values(0))
feat_new_abundant

In [None]:
# features to plot: diverging decisions plus the new, at least once rejected ones
feat_sel = (feat_idx_w_diff
            .get_level_values(0)
            .union(feat_new_abundant))
len(feat_sel)

In [None]:
# restrict the data to the selected features (columns)
# NOTE: `data` is overwritten, the unfiltered table is no longer available below
data = data.loc[:, feat_sel]
data

- RSN predictions are based on the mean and std of all samples (N=455), as in the original study
- VAE also trained on all samples (self supervised)
One could also reduce the selected data to only the samples with a valid target marker,
but this was not done in the original study which considered several different target markers.

RSN : shifted per sample, not per feature!

Load all prediction files and reshape

In [None]:
# exclude 'None' as this is without imputation (-> data)
model_keys = [k for k in qvalues.columns.get_level_values(0) if k != 'None']
pred_paths = [
    args.out_preds / args.template_pred.format(method)
    for method in model_keys]
pred_paths

In [None]:
load_single_csv_pred_file = pimmslearn.analyzers.compare_predictions.load_single_csv_pred_file

# load the predictions of each method, then combine them with one column per method
pred_real_na = {}
for method in model_keys:
    fname = args.out_preds / args.template_pred.format(method)
    print(f"missing values pred. by {method}: {fname}")
    pred_real_na.update({method: load_single_csv_pred_file(fname)})
pred_real_na = pd.DataFrame(pred_real_na)
pred_real_na

After loading the imputations, reduce to the target samples only (samples with a target score)

In [None]:
# select samples with target information
data = data.loc[target.index]
pred_real_na = pred_real_na.loc[target.index]

# assert len(data) == len(pred_real_na)

In [None]:
# first selected feature, used as an example in the next cell
idx = feat_sel[0]

In [None]:
# non-missing values of the example feature
feat_observed = data[idx].dropna()
feat_observed

In [None]:
# Output folder for the swarmplots (one pdf per selected feature); it is
# created including missing parent folders and may already exist.
# Open question: plot all features which are significant for at least one method?
folder = args.out_folder / 'intensities_for_diff_in_DA_decision'
folder.mkdir(parents=True, exist_ok=True)

In [None]:
# common intensity range of observed and imputed values, used as shared
# y-axis limits for all plots
intensities = [data.stack(), pred_real_na.stack()]
min_y_int, max_y_int = pimmslearn.plotting.data.get_min_max_iterable(intensities)
min_max = (min_y_int, max_y_int)

# name of the (single) target column
target_name = target.columns[0]

min_max, target_name

## Compare with target annotation

In [None]:
# labels somehow?
# target.replace({True: f' >={args.cutoff_target}', False: f'<{args.cutoff_target}'})


def get_centered_label(method, n, q):
    """Build a two-line tick label: method name above its statistics.

    Parameters
    ----------
    method : str
        Name shown on the first line.
    n : int
        Number of values, formatted with a thousands separator.
    q : float
        q-value, formatted with three decimals.

    Returns
    -------
    str
        Both lines joined by a newline. The shorter line is padded with
        trailing spaces to the length of the longer one.
    """
    model_str = f'{method}'
    stats_str = f'(N={n:,d}, q={q:.3f})'
    if len(model_str) > len(stats_str):
        stats_str = f"{stats_str:<{len(model_str)}}"
    else:
        model_str = f"{model_str:<{len(stats_str)}}"
    return f'{model_str}\n{stats_str}'


def get_single_qvalue(feat, method):
    """Return the q-value of feature `feat` for `method` as a float.

    Selecting by the feature name alone returns a one-element Series as the
    index of `qvalues` has further levels. It is squeezed to a scalar first,
    as calling `float` on a one-element Series is deprecated in pandas.
    """
    return float(qvalues.loc[feat, (method, 'qvalue')].squeeze())


for i, idx in enumerate(feat_sel):
    print(f"Swarmplot {i:<3}: {idx}:")
    fig, ax = plt.subplots()

    # dummy plots, just to get the Path objects
    tmp_dot = ax.scatter([1, 2], [3, 4], marker='X')
    new_mk, = tmp_dot.get_paths()
    tmp_dot.remove()

    feat_observed = data[idx].dropna()

    # q-value based on the observed values only (no imputation)
    q_observed = get_single_qvalue(idx, 'None')
    key = get_centered_label(method='observed',
                             n=len(feat_observed),
                             q=q_observed)
    to_plot = {key: feat_observed}
    for method in model_keys:
        try:
            pred = pred_real_na.loc[pd.IndexSlice[:,
                                                  idx], method].dropna().droplevel(-1)
            if len(pred) == 0:
                # in case no values was imputed -> qvalue is as based on measured
                key = get_centered_label(method=method,
                                         n=len(pred),
                                         q=q_observed)
            elif qvalues.loc[idx, (method, 'qvalue')].notna().all():
                key = get_centered_label(method=method,
                                         n=len(pred),
                                         q=get_single_qvalue(idx, method))
            elif qvalues.loc[idx, (method, 'qvalue')].isna().all():
                logger.info(f"NA qvalues for {idx}: {method}")
                continue
            else:
                raise ValueError("Unknown case.")
            to_plot[key] = pred
        except KeyError:
            print(f"No missing values for {idx}: {method}")
            continue

    # long format: one row per value with its group (label) and target class
    to_plot = pd.DataFrame.from_dict(to_plot)
    to_plot.columns.name = 'group'
    groups_order = to_plot.columns.to_list()
    to_plot = to_plot.stack().to_frame('intensity').reset_index(-1)
    to_plot = to_plot.join(target.astype('category'), how='inner')
    to_plot = to_plot.astype({'group': 'category'})

    ax = seaborn.swarmplot(data=to_plot,
                           x='group',
                           y='intensity',
                           order=groups_order,
                           dodge=True,
                           hue=args.target,
                           size=2,
                           ax=ax)
    first_pg = idx.split(";")[0]
    ax.set_title(
        f'Imputation for protein group {first_pg} with target {target_name} (N= {len(data):,d} samples)')

    _ = ax.set_ylim(min_y_int, max_y_int)
    _ = ax.locator_params(axis='y', integer=True)
    _ = ax.set_xlabel('')
    # fix the tick positions before the tick labels are set
    _xticks = ax.get_xticks()
    ax.xaxis.set_major_locator(
        matplotlib.ticker.FixedLocator(_xticks)
    )
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                           horizontalalignment='right')

    # the first two collections are drawn with the 'X' marker
    # NOTE(review): assumes these belong to the 'observed' group (one per
    # target class) -- confirm.
    _ = ax.collections[0].set_paths([new_mk])
    _ = ax.collections[1].set_paths([new_mk])

    # relabel the last two collections and add empty scatters as legend
    # handles for the 'X' marker
    label_target_0, label_target_1 = ax.collections[-2].get_label(), ax.collections[-1].get_label()
    _ = ax.collections[-2].set_label(f'imputed, {label_target_0}')
    _ = ax.collections[-1].set_label(f'imputed, {label_target_1}')
    _obs_label0 = ax.scatter([], [], color='C0', marker='X', label=f'observed, {label_target_0}')
    _obs_label1 = ax.scatter([], [], color='C1', marker='X', label=f'observed, {label_target_1}')
    # the two scatters above were appended to ax.collections, therefore the
    # relabelled collections are now at positions -4 and -3
    _ = ax.legend(
        handles=[_obs_label0, _obs_label1, *ax.collections[-4:-2]],
        fontsize=5, title_fontsize=5, markerscale=0.4,)
    fname = (folder /
             f'{first_pg}_swarmplot.pdf')
    files_out[fname.name] = fname.as_posix()
    pimmslearn.savefig(
        fig,
        name=fname)
    # close this figure explicitly to free its memory
    plt.close(fig)

Saved files:

In [None]:
# mapping of file name -> path for the files registered by this notebook
files_out