In [None]:
# Usage reminder: how to execute this notebook with papermill and then render it to HTML.
# The -p names must match the variables in the papermill parameters cell exactly
# (including the "obsereved" spelling), because papermill injects them by name.
# NOTE: this is a regular (non-raw) string, so every backslash that is immediately
# followed by a newline is dropped together with that newline; the stored papermill
# command is therefore one long line rather than the multi-line layout shown here.
usage = """Run with papermill:
     
papermill srsnv_report.ipynb output_srsnv_report.ipynb \
    -p report_name <> \
    -p model_file <> \
    -p params_file <> \
    -p output_roc_plot <> \
    -p output_LoD_plot <> \
    -p output_LoD_qual_plot <> \
    -p output_cm_plot <> \
    -p output_obsereved_qual_plot <> \
    -p output_ML_qual_hist <> \
    -p output_qual_per_feature <> \
    -p output_bepcr_hists <> \
    -p output_bepcr_fpr <> \
    -p output_bepcr_recalls <>
Then convert to html

jupyter nbconvert --to html output_srsnv_report.ipynb --no-input --output srsnv_report.html"""

In [1]:
import pandas as pd
import os
from IPython.display import Image, HTML, display
import joblib
import json

pd.options.display.max_rows = 200

import matplotlib.pyplot as plt
import matplotlib.image as mpimg



In [None]:
# papermill parameters
# Every value defaults to None and is expected to be injected by papermill; the
# validation cell below raises if any of them is still None.

# Required by the validation cell; not otherwise referenced in the cells visible here.
report_name = None
# Path handed to joblib.load to read the trained model.
model_file = None
# Path of a JSON file with the run parameters; its base name (up to the first dot)
# becomes the label used in plot titles.
params_file = None
# The output_* values below are image path prefixes WITHOUT the ".png" extension.
# The notebook appends ".png" itself and derives the train-set image by replacing
# "test" with "train" in the path.
output_roc_plot = None
output_LoD_plot = None
output_LoD_qual_plot = None
output_cm_plot = None
# Required by the validation cell; not displayed by any cell visible here.
# The spelling is part of the parameter name and must match the papermill call.
output_obsereved_qual_plot = None
output_ML_qual_hist = None
# Prefix that is concatenated with each model feature name to locate a per-feature plot.
output_qual_per_feature = None
# Prefix that is concatenated with each category suffix to locate a per-category plot.
output_bepcr_hists = None
output_bepcr_fpr = None
output_bepcr_recalls = None

In [None]:
# Validate that every required papermill parameter was supplied.
required_inputs = (
    "report_name",
    "model_file",
    "params_file",
    "output_roc_plot",
    "output_LoD_plot",
    "output_LoD_qual_plot",
    "output_cm_plot",
    "output_obsereved_qual_plot",
    "output_ML_qual_hist",
    "output_qual_per_feature",
    "output_bepcr_hists",
    "output_bepcr_fpr",
    "output_bepcr_recalls",
)
# A parameter that still holds its None default was not injected.
# At notebook top level globals() is the same namespace the original locals() call saw.
missing = [name for name in required_inputs if globals()[name] is None]

if missing:
    raise ValueError(f"Following inputs missing:\n{(os.linesep).join(missing)}")

In [None]:
# Load the trained model and the JSON parameters of the run.
# NOTE(review): joblib.load is pickle-based, so model_file must come from a trusted source.
loaded_model = joblib.load(model_file)
# Models saved from CV arrive as a list; the report uses the first entry.
model = loaded_model[0] if isinstance(loaded_model, list) else loaded_model
with open(params_file, encoding="utf-8") as params_fh:
    params = json.load(params_fh)

In [None]:
def display_test_train(image_path, titlestr):
    """Show the test and train versions of a saved plot side by side.

    Parameters
    ----------
    image_path : str
        Path of the test-set image without the ".png" extension. The train-set
        path is derived by replacing every "test" substring with "train".
    titlestr : str
        Text used as the overall figure title.
    """
    panels = (
        ("Test", image_path + '.png'),
        ("Train", image_path.replace("test", "train") + '.png'),
    )
    # Read both images before creating the figure (test first, then train).
    pixel_arrays = [mpimg.imread(png_path) for _, png_path in panels]

    fig, axes = plt.subplots(1, 2, figsize=(20, 10), constrained_layout=True)
    for axis, (panel_title, _), pixels in zip(axes, panels, pixel_arrays):
        axis.imshow(pixels)
        axis.axis('off')
        axis.set_title(panel_title, fontsize=20)

    fig.suptitle(titlestr, fontsize=24, y=0.95)
    plt.show()

# Label for plot titles: the text after the last '/' in params_file, cut at its first '.'.
dataname = params_file.rpartition('/')[2].partition('.')[0]


In [None]:
# Report heading (a plain string: there is nothing to interpolate).
display(HTML('<font size="6">SRSNV pipeline report </font>'))

* This report contains an analysis of the SRSNV model training.
* We train a binary classifier per read.
* The probabilities are translated to quality: quality = -10*log10(probability). 
* The quality is used as a threshold for discriminating true and false variants.

## Residual SNV rate vs Retention and LoD simulation

We calculate the residual SNV rate as follows:
```
error rate in test data = # errors / # bases sequenced
```
where:
```
# errors = # of single substitution snps > filter thresh
# bases sequenced = # of bases aligned * % mapq60 * ratio_of_bases_in_coverage_range *
                    read_filter_correction_factor * recall[threshold]
```
and: 
```
# of bases aligned = mean_coverage * bases in region * downsampling factor
downsampling factor = % of the featuremap reads sampled for test set
```

In [None]:
# Show the LoD simulation image for the test set, then its train-set counterpart
# (the same path with "test" replaced by "train").
for split_label, lod_png in (
    ("Test", output_LoD_plot + '.png'),
    ("Train", output_LoD_plot.replace("test", "train") + '.png'),
):
    display(HTML(f'<font size="6">{split_label} LoD simulation </font>'))
    display(Image(filename=lod_png, width=800, height=800))

In [None]:
# Test/train LoD vs. ML qual plots, titled with the dataset label on a second line.
display_test_train(output_LoD_qual_plot, f"LoD vs. ML qual \n{dataname}")

In [None]:
# Test/train ROC curves, titled with the dataset label on a second line.
display_test_train(output_roc_plot, f"ROC curve \n{dataname}")

# Training metrics

In [None]:
# Section heading followed by the test/train confusion matrix images.
title = 'Confusion matrix'
display(HTML('<font size="4">{}</font>'.format(title)))
display_test_train(output_cm_plot, dataname)

In [None]:
# ML qual histograms by class for the test and train sets.
title = 'ML qual hists by class'
display(HTML('<font size="4">{}</font>'.format(title)))
display_test_train(output_ML_qual_hist, dataname)

# Per-category histograms: a category is shown only when its test image exists on disk.
display(HTML('<font size="4">Stratified by category </font>'))
subset_data_list = [
    'mixed_cycle_skip',
    'mixed_non_cycle_skip',
    'non_mixed_cycle_skip',
    'non_mixed_non_cycle_skip',
    'cycle_skip',
    'non_cycle_skip',
]

for suffix in subset_data_list:
    image_path = output_bepcr_hists + suffix
    if not os.path.isfile(image_path + '.png'):
        continue
    display_test_train(image_path, dataname)

# Calibration plots per category.
display(HTML('<font size="4">ML qual calibration by category </font>'))
display_test_train(output_bepcr_fpr, dataname)

# Recall plots per category.
display(HTML('<font size="4">Recall rate by category </font>'))
display_test_train(output_bepcr_recalls, dataname)

In [None]:
# One test/train figure per model feature; features without a saved test image are skipped.
display(HTML('<font size="4">Feature distribution per label</font>'))
for feature_name in model.feature_names_in_:
    feature_plot_prefix = output_qual_per_feature + feature_name
    if not os.path.isfile(feature_plot_prefix + '.png'):
        continue
    display_test_train(feature_plot_prefix, dataname)


In [None]:

# Summarise the inputs of the training run as recorded in the params file.
display(HTML('<font size="4">Input parameters: </font>'))

# Entries stored under 'model_parameters', one bullet per key.
for param_name, param_value in params['model_parameters'].items():
    print(f"    * {param_name}: {param_value}")

# Top-level entries of the params file to list; a missing key raises KeyError.
params_for_print = [
    'numerical_features',
    'categorical_features',
    'train_set_size',
    'test_set_size',
]
for key in params_for_print:
    value = params[key]
    # List values get one sub-bullet per element; anything else is printed inline.
    if isinstance(value, list):
        print(f"    * {key}:")
        for element in value:
            print(f"        - {element}")
    else:
        print(f"    * {key}: {value}")
