In [None]:
# Usage text: how to execute this notebook with papermill (one -p flag per
# parameter) and then convert the executed notebook to HTML with nbconvert.
# The literal is not a raw string, so every backslash immediately followed by a
# newline below is a line continuation and is dropped from the string value.
# `usage` is not referenced by any other cell visible in this notebook.
usage = """Run with papermill:
     
papermill srsnv_report.ipynb output_srsnv_report.ipynb \
    -p report_name <> \
    -p model_file <> \
    -p params_file <> \
    -p output_roc_plot <> \
    -p output_LoD_plot <> \
    -p output_cm_plot <> \
    -p output_precision_recall_qual <> \
    -p output_qual_density <> \
    -p output_obsereved_qual_plot <> \
    -p output_ML_qual_hist <> \
    -p output_qual_per_feature <> \
    -p output_bepcr_hists <> \
    -p output_bepcr_fpr <> \
    -p output_bepcr_recalls <>
Then convert to html

jupyter nbconvert --to html output_srsnv_report.ipynb --no-input --output srsnv_report.html"""

In [1]:
import pandas as pd
import os
from IPython.display import Image, HTML, display
import joblib
import json

# Let pandas render up to 200 rows before truncating displayed frames.
pd.set_option("display.max_rows", 200)

In [None]:
# papermill parameters
# Every parameter defaults to None and is meant to be supplied with `-p` at run
# time; the validation cell below raises ValueError for any that is still None.
report_name = None  # label interpolated into the report title
model_file = None  # path handed to joblib.load
params_file = None  # path to a JSON file, parsed with json.load
output_roc_plot = None  # image path without extension; '.png' is appended
output_LoD_plot = None  # image path without extension; '.png' is appended
output_cm_plot = None  # image path without extension; '.png' is appended
output_precision_recall_qual = None  # image path without extension; '.png' is appended
output_qual_density = None  # image path without extension; '.png' is appended
output_obsereved_qual_plot = None  # image path without extension; '.png' is appended
output_ML_qual_hist = None  # image path without extension; '.png' is appended
output_qual_per_feature = None  # path prefix; '<feature name>.png' is appended per model feature
output_bepcr_hists = None  # path prefix; '<category suffix>.png' is appended per category
output_bepcr_fpr = None  # image path without extension; '.png' is appended
output_bepcr_recalls = None  # image path without extension; '.png' is appended

In [None]:
# Fail fast when any papermill parameter was left at its None default.
required_inputs = (
    "report_name",
    "model_file",
    "params_file",
    "output_roc_plot",
    "output_LoD_plot",
    "output_cm_plot",
    "output_precision_recall_qual",
    "output_qual_density",
    "output_obsereved_qual_plot",
    "output_ML_qual_hist",
    "output_qual_per_feature",
    "output_bepcr_hists",
    "output_bepcr_fpr",
    "output_bepcr_recalls",
)

# Capture the cell namespace once, at top level, so the lookup below reads the
# same mapping the parameters were assigned into.
cell_namespace = locals()
missing = [name for name in required_inputs if cell_namespace[name] is None]

if missing:
    raise ValueError(f"Following inputs missing:\n{(os.linesep).join(missing)}")

In [None]:
# Load the trained model and the JSON parameters that accompany it.
# NOTE(review): joblib.load unpickles its input, which can run arbitrary code;
# only point model_file at an artifact from a trusted source.
model = joblib.load(model_file)
with open(params_file, encoding="utf-8") as params_handle:
    params = json.load(params_handle)

In [None]:
# Render the report heading, labelled with the data set name passed in by papermill.
report_heading = HTML(f'<font size="6">SRSNV report - {report_name} set</font>')
display(report_heading)

* This report contains an analysis of the SRSNV model training.
* We train a binary classifier per read.
* The probabilities are translated to quality: quality = -10*log10(probability). 
* The quality is used as a threshold for discriminating true and false variants.

In [None]:
print('Info and model parameters:')

# Echo a fixed subset of the loaded parameters, one "key: value" line each.
params_for_print = ('train_set_size', 'test_set_size', 'model_parameters')
for param_key in params_for_print:
    param_value = params[param_key]
    print(f"{param_key}: {param_value}")

# Feature names as recorded on the fitted model object.
model_features = model.feature_names_in_
print(f"Model features: \n{model_features}")

# ROC curve

In [None]:
# The parameter holds the path without its extension; the image is the cell's output value.
roc_plot_png = output_roc_plot + '.png'
Image(roc_plot_png, width=600)

## LoD

We calculate the residual SNV rate as follows:

error rate in test data = # errors / # bases sequenced

where:

\# errors = # of single substitution snps > filter threshold

\# bases sequenced = # of bases aligned * % mapq60 * ratio_of_bases_in_coverage_range * read_filter_correction_factor * recall\[threshold\]

and:

\# of bases aligned = mean_coverage * bases in region * downsampling factor

downsampling factor = % of the featuremap reads sampled for the test set


In [None]:
# The parameter holds the path without its extension; the image is the cell's output value.
lod_plot_png = output_LoD_plot + '.png'
Image(lod_plot_png, width=800)

# Training metrics

In [None]:
# Section heading, then the confusion-matrix image as the cell's output value.
section_title = 'model confusion matrix'
section_heading = HTML(f'<font size="6">{section_title}</font>')
display(section_heading)

cm_plot_png = output_cm_plot + '.png'
Image(cm_plot_png, width=400)

In [None]:
# Section heading, then the precision/recall plot followed by the qual density plot.
section_title = 'Precision and recall vs quality'
display(HTML(f'<font size="6">{section_title}</font>'))

for plot_prefix in (output_precision_recall_qual, output_qual_density):
    display(Image(plot_prefix + '.png', width=600))

In [None]:
# Section heading, then the ML qual histogram followed by the observed-qual plot.
section_title = 'Model qual metrics: hists by class, calibration'
display(HTML(f'<font size="6">{section_title}</font>'))

for plot_prefix in (output_ML_qual_hist, output_obsereved_qual_plot):
    display(Image(plot_prefix + '.png', width=600))


In [None]:
# The balanced-ePCR section is shown only when the model's feature list contains
# both strand-ratio category features.
bepcr_features = ('strand_ratio_category_end', 'strand_ratio_category_start')
if all(feature in model.feature_names_in_ for feature in bepcr_features):
    display(HTML('<font size="6">Balanced ePCR</font>'))

    # Per-category histograms: a category whose image file is absent is skipped.
    for suffix in ('mixed_cs', 'mixed_non_cs', 'non_mixed_cs', 'non_mixed_non_cs'):
        hist_png = output_bepcr_hists + suffix + '.png'
        if not os.path.isfile(hist_png):
            continue
        display(Image(hist_png, width=600))

    # These two plots are displayed without an existence check.
    for plot_prefix in (output_bepcr_fpr, output_bepcr_recalls):
        display(Image(plot_prefix + '.png', width=600))

In [None]:
display(HTML('<font size="6">qual per feature</font>'))

# One image per model feature; a feature whose image file is absent is skipped.
for feature_name in model.feature_names_in_:
    feature_png = output_qual_per_feature + feature_name + '.png'
    if not os.path.isfile(feature_png):
        continue
    display(Image(feature_png, width=600))
