In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nexusplt as nxp
from ugvc.reports.report_utils import ShortReportUtils, parse_config
from ugvc.reports.report_data_loader import ReportDataLoader
from IPython.display import display, Markdown

# Notebook-wide pandas display defaults: floats are rendered as percentages
# with two decimals, and column text is never truncated.
pd.set_option('display.float_format', '{:,.2%}'.format)
pd.options.display.max_colwidth = None

In [None]:
# Parse the report configuration into the parameter mapping and the list of
# parameter names.
parameters, param_names = parse_config('var_report.config')

# globals
image_dir = 'plots'
(image_prefix,
 reference_version,
 h5outfile,
 trained_w_gt,
 verbosity,
 pipeline_version) = (parameters[key] for key in ('image_prefix', 'reference_version', 'h5outfile',
                                                    'trained_w_gt', 'verbosity', 'pipeline_version'))

# Helper object used by every analysis cell below.
report_utils = ShortReportUtils(image_dir, image_prefix, verbosity, h5outfile)

# Verbose reports (verbosity > 1) are marked as internal in the title.
report_title = f"# Variant Calling report v{pipeline_version}"
if verbosity > 1:
    report_title += " (INTERNAL usage)"
display(Markdown(report_title))

## 1. Input Parameters

In [None]:
# load data
concordance_path = parameters['h5_concordance_file']
data_loader = ReportDataLoader(concordance_file=concordance_path,
                               reference_version=reference_version)

# One concordance table per data source, keyed by the source label used
# throughout the report.
data = {}
data['Trained wo gt'] = data_loader.load_concordance_df()
data['whole genome'] = data_loader.load_whole_genome_concordance_df()

# Source registry maintained by the loader; read after both loads above.
sources = data_loader.sources

In [None]:
# Add the mean well-mapped coverage to the parameter table when the
# whole-genome concordance table carries that column.
if 'well_mapped_coverage' in data['whole genome'].columns:
    parameters['mean_var_depth'] = '{:.2f}'.format(data['whole genome']['well_mapped_coverage'].mean())
    # Guarded append: re-running this cell must not add a duplicate row label,
    # which would duplicate the row after reindex() below.
    if 'mean_var_depth' not in param_names:
        param_names.append('mean_var_depth')

# Best effort: the truth sample name is optional, so a failure to read it must
# not abort the report. Catch Exception instead of using a bare `except:` so
# KeyboardInterrupt / SystemExit still propagate, and show the reason in
# internal (verbose) reports rather than hiding it completely.
try:
    args = pd.read_hdf(data_loader.sources['Trained wo gt'][0], 'input_args', mode='r')
    parameters['truth_sample_name'] = args['truth_sample_name'][0]
except Exception as err:
    if verbosity > 1:
        print(f"truth_sample_name not available: {err!r}")

# Keep only the parameters listed in param_names, in that order, then persist
# and show the table.
parameters_df = pd.DataFrame.from_dict(parameters, orient='index', columns=['value']).reindex(param_names)
parameters_df.to_hdf(h5outfile, key="parameters")
display(parameters_df)

## 2. Performance over all Data
The concordance between the variant calling results and the ground truth sample is presented below.
* Red line - precision and recall over different tree-scores.
* Red dot - precision and recall values for the chosen threshold.
* Black dot - precision and recall after filtering systematic errors (SEC)

In [None]:
pd.set_option('display.float_format', '{:,.2%}'.format)

# SNP and Indel are always reported; verbose reports add the detailed
# indel / homopolymer breakdown.
# NOTE(review): the labels jump from '>10,<=14' to '>15,<=19' — confirm how
# hmer length 15 is binned by report_utils.
categories = ['SNP', 'Indel']
if verbosity > 1:
    categories += ['non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
                   'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8',
                   'hmer Indel >8,<=10', 'hmer Indel >10,<=14', 'hmer Indel >15,<=19', 'hmer Indel >=20']

df = report_utils.basic_analysis(data, categories, sources, 'all_data', 'sec_data')
display(df)

In [None]:
# Verbose reports only: performance stratified by base, (A,T) part.
if verbosity > 1:
    for heading in ("## 2.1 Stratified by base", "#### (A,T)"):
        display(Markdown(heading))
    categories = [
        'SNP', 'Indel',
        'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10',
        'hmer Indel >10,<=14', 'hmer Indel >15,<=19', 'hmer Indel >=20',
    ]
    # The result is kept so a later cell can merge it with the (G,C) table.
    opt_tab_a_t = report_utils.base_stratification_analysis(data, categories, sources, ('A', 'T'))

In [None]:
# Verbose reports only: performance stratified by base, (G,C) part.
if verbosity > 1:
    display(Markdown("#### (G,C)"))
    categories = [
        'SNP', 'Indel',
        'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10',
    ]
    # The result is kept so a later cell can merge it with the (A,T) table.
    opt_tab_g_c = report_utils.base_stratification_analysis(data, categories, sources, ('G', 'C'))

In [None]:
# Verbose reports only: combine the (A,T) and (G,C) tables, then store and
# show just the 'whole genome' columns.
if verbosity > 1:
    df = report_utils.merge_table_dictionaries_into_df(opt_tab_a_t, opt_tab_g_c, sources)
    whole_genome_per_base = df[['whole genome']]
    whole_genome_per_base.to_hdf(h5outfile, key="all_data_per_base")
    display(whole_genome_per_base)

## 3. Performance over UG high confidence regions

Variant calling performance excluding genomic areas where UG performance is poor, i.e.:
- Homopolymers - runs of length 11 bp and above, padded with four bases around the genomic coordinates,
- AT-rich regions - bases where the GC content of the surrounding 40 bases is lower than 5%,
- Tandem repeats,
- Low mapping quality - regions that are covered by at least 20 reads, but less than 10% of these reads are aligned with mapping quality > 20,
- High coverage variability - regions with coverage that is highly variable between samples (std/mean > 0.5)

In [None]:
pd.set_option('display.float_format', '{:,.2%}'.format)

# The 'whole genome' table is restricted to rows where ug_hcr is True; every
# other source is taken as is. All entries are copies of the originals.
filtData = {
    name: (data[name].query("ug_hcr==True") if name == 'whole genome' else data[name]).copy()
    for name in sources
}

# SNP and Indel are always reported; verbose reports add the detailed breakdown.
categories = ['SNP', 'Indel']
if verbosity > 1:
    categories += ['non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
                   'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8',
                   'hmer Indel >8,<=10']

df = report_utils.basic_analysis(filtData, categories, sources, 'ug_hcr', 'ug_hcr_sec_data')
display(df)

In [None]:
# Verbose reports only: homozygous genotyping accuracy on the tables currently
# held in filtData; only the 'whole genome' columns are stored and shown.
if verbosity > 1:
    display(Markdown("### 3.1 Homozygous genotyping accuracy"))
    display(Markdown(
        "The precision and recall of called homozygous variants (where the variant was not classified as False Negative)"
    ))
    categories = [
        'SNP', 'Indel', 'non-hmer Indel',
        'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10',
    ]
    df = report_utils.homozygous_genotyping_analysis(filtData, categories, sources)
    whole_genome_homozygous = df[['whole genome']]
    whole_genome_homozygous.to_hdf(h5outfile, key="ug_hcr_homozygous")
    display(whole_genome_homozygous)

In [None]:
# Verbose reports only: performance on rows with well_mapped_coverage >= 20
# whose 'mappability.0' flag is set.
if verbosity > 1:
    display(Markdown("## 4. Performance over regions with coverage>=20 and excluding areas with mappability"))
    if 'well_mapped_coverage' not in data['whole genome'].columns:
        print("No coverage data available")
    else:
        # NOTE(review): the column check is made on 'whole genome' only, but
        # the filter is applied to every source — confirm all tables carry
        # both columns.
        filtData = {
            name: data[name][(data[name]['well_mapped_coverage'] >= 20) & (data[name]['mappability.0'])]
            for name in sources
        }

        categories = [
            'SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
            'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10',
        ]
        df = report_utils.basic_analysis(filtData, categories, sources, 'good_cvg_data')
        display(df)

In [None]:
# Verbose reports only: homozygous genotyping accuracy on the coverage-filtered
# tables. The coverage-column check mirrors the one used when filtData is
# rebuilt for section 4: without that column filtData still holds the tables
# of the previous section, and analysing it here would store an unrelated
# result under the "good_cvg_data_homozygous" key.
if verbosity > 1 and 'well_mapped_coverage' in data['whole genome'].columns:
    display(Markdown("### 4.1 Homozygous genotyping accuracy"))
    # Spelling of "classified" fixed; this text is shown in the report.
    display(Markdown("The precision and recall of called homozygous variants (where the variant was not classified as False Negative)"))
    categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
    df = report_utils.homozygous_genotyping_analysis(filtData, categories, sources)
    # Only the 'whole genome' columns are persisted and displayed.
    df[['whole genome']].to_hdf(h5outfile, key="good_cvg_data_homozygous")
    display(df[['whole genome']])

In [None]:
# Compares the models trained with and without ground truth.
# Runs only when the 'trained_w_gt' parameter is not None and verbosity > 1.
if verbosity > 1 and trained_w_gt is not None:
    section_intro = """
##  5. Trained with and without Ground Truth
<ul>
<li><b>Trained wo gt - Trained without ground truth </b></li>
Random forest model trained on chromosome 9 using known variants in dbSNP and on common fp variants
<li><b>Trained with gt - Trained with ground truth</b></li>
Simple threshold model trained on chromosome 9 using its own ground truth
</ul>
"""
    display(Markdown(section_intro))

    # NOTE(review): the loader method is named "...without_gt..." while its
    # result is stored under 'Trained with gt' — confirm this is intended.
    data['Trained with gt'] = data_loader.load_trained_without_gt_concordance_df(trained_w_gt)
    sources = data_loader.sources
    categories = [
        'SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
        'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10',
    ]
    opt_tab, opt_res, perf_curve = report_utils.get_performance(data, categories, sources)

    # Only the two trained models are plotted and tabulated.
    compared = ('Trained wo gt', 'Trained with gt')
    report_utils.plot_performance(perf_curve, opt_res, categories, img='all.primary',
                                  sources={name: sources[name] for name in compared},
                                  legend=True)

    pd.set_option('display.float_format', '{:,.2%}'.format)
    # From here on `sources` holds only the two compared entries.
    sources = {name: sources[name] for name in compared}
    df = pd.concat([opt_tab[name] for name in sources], axis=1, keys=list(sources))
    df.to_hdf(h5outfile, key="trained_w_wo_gt")
    display(df)

In [None]:
# NOTE(review): this cell repeats the section-5 cell directly above it with a
# shorter category list. It shows the same heading again and writes the same
# image name ('all.primary') and HDF key ("trained_w_wo_gt"), so its output
# replaces the previous cell's — confirm whether both cells are needed.
if trained_w_gt is not None and verbosity > 1:
    intro_lines = [
        "",
        "##  5. Trained with and without Ground Truth",
        "<ul>",
        "<li><b>Trained wo gt - Trained without ground truth </b></li>",
        "Random forest model trained on chromosome 9 using known variants in dbSNP and on common fp variants",
        "<li><b>Trained with gt - Trained with ground truth</b></li>",
        "Simple threshold model trained on chromosome 9 using its own ground truth",
        "</ul>",
        "",
    ]
    display(Markdown("\n".join(intro_lines)))

    # The returned frame is discarded; this cell does not update `data`.
    data_loader.load_trained_without_gt_concordance_df(trained_w_gt)
    categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
    opt_tab, opt_res, perf_curve = report_utils.get_performance(data, categories, sources)

    # Only the two trained models are plotted and tabulated.
    wo_gt_source = sources['Trained wo gt']
    with_gt_source = sources['Trained with gt']
    report_utils.plot_performance(perf_curve, opt_res, categories, img='all.primary',
                                  sources={'Trained wo gt': wo_gt_source,
                                           'Trained with gt': with_gt_source},
                                  legend=True)

    pd.set_option('display.float_format', '{:,.2%}'.format)
    sources = {'Trained wo gt': wo_gt_source, 'Trained with gt': with_gt_source}
    tables = [opt_tab[s] for s in sources]
    df = pd.concat(tables, axis=1, keys=list(sources))
    df.to_hdf(h5outfile, key="trained_w_wo_gt")
    display(df)

In [None]:
if ('well_mapped_coverage' in data['whole genome'].columns) and (trained_w_gt is not None) and verbosity > 1:
    filtData = {}
    d = data['Trained with gt']
    filtData['Trained with gt'] = d[(d['well_mapped_coverage'] >= 20) &
                                    (d['mappability.0'])]

    categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
                  'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10']
    optTab, opt_res, perf_curve = report_utils.get_performance(filtData, categories,
                                                               {'Trained with gt': sources['Trained with gt']})

    %matplotlib agg
    d = optTab['Trained with gt'][['max recall', 'recall', 'precision']]
    labels = ['SNP', 'Indel', 'nhmer', 'nhmer w/o LCR', 'hmer 2-4', 'hmer 5-8', 'hmer 4', 'hmer 5', 'hmer 6', 'hmer 7',
              'hmer 8', 'hmer 9-10']
    fig = plt.figure()
    ax = d.plot()
    plt.xticks(np.arange(len(d.index)), rotation=30, ha='right')
    ax.set_xticklabels(labels)
    plt.ylim([0.4, 1.05])
    plt.grid()
    plt.title('Cvg>20X, Trained variant calls')
    plt.tight_layout()
    nxp.save(fig, image_prefix + 'summary', 'png', outdir=image_dir)
    plt.close(fig)