In [None]:
import pandas as pd
from ugvc.reports.report_utils import ReportUtils
from ugvc.reports.report_data_loader import ReportDataLoader
from IPython.display import display, Markdown

# Render floats in displayed tables as percentages with two decimals (0.5 -> 50.00%).
pd.set_option('display.float_format', '{:,.2%}'.format)
# Do not truncate long cell contents when displaying DataFrames.
pd.options.display.max_colwidth = None

In [None]:
# Default values for the report parameters.
# NOTE(review): presumably overridden by injected parameters when the notebook
# is executed by a pipeline - confirm against the caller.

# Run identification
run_id = "NA"
pipeline_version = "NA"
truth_sample_name = "NA"
reference_version = "hg38"

# Input concordance file and output HDF5 file for the report tables
h5_concordance_file = "comp.h5"
h5outfile = "var_report.h5"

# Report options: verbosity > 1 adds the detailed sections, > 2 adds the indel analysis
verbosity = 3
exome_column_name = "exome.twist"

# Optional model information; left out of the parameters table when None
model_name = None
model_pkl = None

In [None]:
# Parameters shown in the report, in display order.
# pipeline_version and verbosity are stored as strings.
parameters = {
    'h5_concordance_file': h5_concordance_file,
    'run_id': run_id,
    'pipeline_version': str(pipeline_version),
    'verbosity': str(verbosity),
    'reference_version': reference_version,
    'truth_sample_name': truth_sample_name,
    'h5outfile': h5outfile,
    'model_pkl': model_pkl,
    'model_name': model_name
}

# Drop parameters that were not provided (value is None), keeping the order above.
parameters = {name: value for name, value in parameters.items() if value is not None}
param_names = list(parameters)

report_utils = ReportUtils(verbosity, h5outfile)

# Report title; the detailed report (verbosity > 1) is marked as internal.
title = f"# Variant Calling report {pipeline_version}"
if verbosity > 1:
    title += " (INTERNAL usage)"
display(Markdown(title))

## 1. Input Parameters <a class="anchor" id="input_parameters"></a>

In [None]:
# Load the concordance DataFrame from the concordance HDF5 file given in the parameters.
data_loader = ReportDataLoader(
    concordance_file=parameters['h5_concordance_file'],
    reference_version=reference_version,
    exome_column_name=exome_column_name,
)
data = data_loader.load_concordance_df()

In [None]:
# Add the mean of the 'well_mapped_coverage' column to the reported parameters, when available.
if 'well_mapped_coverage' in data.columns:
    parameters['mean_var_depth'] = f"{data['well_mapped_coverage'].mean():.2f}"
    # Keep the cell idempotent: appending the name again on a re-run would make
    # reindex() below emit a duplicated 'mean_var_depth' row.
    if 'mean_var_depth' not in param_names:
        param_names.append('mean_var_depth')

# One row per parameter, ordered as in param_names; saved to the report HDF5 file and displayed.
parameters_df = pd.DataFrame.from_dict(parameters, orient='index', columns=['value']).reindex(param_names)
parameters_df.to_hdf(h5outfile, key="parameters")
display(parameters_df)

In [None]:
anchor = report_utils.get_anchor("top")
display(Markdown(f"## Table of Contents {anchor}"))

# Build the table of contents as a list of markdown lines and join them once.
# Nested entries appear only for verbosity > 1; the indel analysis only for verbosity > 2.
toc_parts = [
    "* [Input Parameters](#input_parameters)\n",
    "* [All data](#all_data)\n",
]
if verbosity > 1:
    toc_parts += [
        "\t* [General accuracy](#gen_acc_all_data)\n",
        "\t* [Error types](#err_types_all_data)\n",
        "\t* [Stratified analysis](#all_data_bases)\n",
        "\t* [Homozygous genotyping](#all_data_hom)\n",
    ]

toc_parts.append("* [UG high confidence regions](#ug_hcr)\n")
if verbosity > 1:
    toc_parts += [
        "\t* [General accuracy](#gen_acc_ug_hcr)\n",
        "\t* [Error types](#err_types_ug_hcr)\n",
        "\t* [Homozygous genotyping](#ug_hcr_hom)\n",
    ]

toc_parts.append("* [Exome](#exome)\n")
if verbosity > 1:
    toc_parts += [
        "\t* [General accuracy](#gen_acc_exome)\n",
        "\t* [Error types](#err_types_exome)\n",
        "\t* [Indel error examples](#exome_indel_errors)\n",
        "\t* [snp errors examples](#exome_snp_errors)\n",
    ]

if verbosity > 2:
    toc_parts += [
        "* [Indel analysis](#indel_analysis)\n",
        "\t* [All data](#all_data_indel_analysis)\n",
        "\t* [UG HCR](#ug_hcr_indel_analysis)\n",
        "\t* [Exome](#exome_indel_analysis)\n",
    ]

tc = "".join(toc_parts)
display(Markdown(tc))

## 2. Performance over all Data <a class="anchor" id="all_data"></a> 
The concordance between the variant calling results and the ground truth sample is presented below.
* Red line - precision and recall over different tree-scores.
* Red dot - precision and recall values for the chosen threshold.
* Black dot - precision and recall after filtering systematic errors (SEC).

[top](#top)

In [None]:
pd.options.display.float_format = '{:,.2%}'.format

# SNP and Indel are always reported; the detailed report (verbosity > 1) adds
# the non-hmer and per-hmer-length indel categories.
categories = ['SNP', 'Indel']
if verbosity > 1:
    categories += ['non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
    categories += [f'hmer Indel {hmer_len}' for hmer_len in range(4, 9)]
    categories += ['hmer Indel >8,<=10', 'hmer Indel >10,<=12', 'hmer Indel >12,<=14',
                   'hmer Indel >15,<=19', 'hmer Indel >=20']

report_utils.basic_analysis(data, categories, 'all_data', 'sec_data')

In [None]:
# Detailed report only: accuracy stratified by base, (A,T) part.
if verbosity > 1:
    anchor = report_utils.get_anchor('all_data_bases')
    # Level-3 heading ("###"), matching the other numbered sub-sections (2.2, 3.1, 5.1);
    # the previous "##" put this sub-section at the same level as its parent section 2.
    display(Markdown(f"### 2.1 Stratified by base {anchor}"))
    display(Markdown("#### (A,T)"))
    categories = ['SNP', 'Indel', 'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10',
                  'hmer Indel >10,<=12', 'hmer Indel >12,<=14', 'hmer Indel >15,<=19', 'hmer Indel >=20']
    # at_df is combined with the (G,C) table in the following cell.
    at_df = report_utils.base_stratification_analysis(data, categories, ('A', 'T'))

In [None]:
# Detailed report only: accuracy stratified by base, (G,C) part.
if verbosity > 1:
    display(Markdown("#### (G,C)"))
    categories = [
        'SNP',
        'Indel',
        'hmer Indel <=4',
        'hmer Indel >4,<=8',
        'hmer Indel >8,<=10',
    ]
    gc_df = report_utils.base_stratification_analysis(data, categories, ('G', 'C'))

    # Stack the (A,T) table (at_df, computed in the previous cell) on top of the (G,C) one
    # and store the combined table in the report HDF5 file.
    base_strat_df = pd.concat([at_df, gc_df])
    # NOTE(review): return value is discarded, so this presumably modifies base_strat_df in place - confirm.
    report_utils.make_multi_index(base_strat_df)
    base_strat_df.to_hdf(h5outfile, key="all_data_per_base")

In [None]:
# Detailed report only: homozygous genotyping accuracy over all data.
if verbosity > 1:
    anchor = report_utils.get_anchor('all_data_hom')
    display(Markdown(f"### 2.2 Homozygous genotyping accuracy {anchor}"))
    display(Markdown("The precision and recall of called homozygous variants (where the variant was not classified as False Negative)"))
    categories = [
        'SNP', 'Indel', 'non-hmer Indel',
        'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10',
        'hmer Indel >10,<=12', 'hmer Indel >12,<=14', 'hmer Indel >15,<=19', 'hmer Indel >=20',
    ]
    report_utils.homozygous_genotyping_analysis(data, categories, "all_data_homozygous")

## 3. Performance over UG high confidence regions <a class="anchor" id="ug_hcr"></a>

Variant calling performance excluding genomic areas where UG performance is poor, i.e.:
- Homopolymers - runs of length 11 bp and above, padded with four bases around the genomic coordinates,
- AT-rich regions - bases where the GC content of the surrounding 40 bases is lower than 5%,
- Tandem repeats,
- Low mapping quality - regions that are covered by at least 20 reads, but less than 10% of these reads are aligned with mapping quality > 20,
- High coverage variability - regions with coverage that is highly variable between samples (std/mean > 0.5)

[top](#top)

In [None]:
pd.options.display.float_format = '{:,.2%}'.format

# Restrict the analysis to variants flagged as inside the UG high confidence regions.
ug_hcr_data = data.query("ug_hcr==True").copy()

# SNP and Indel are always reported; the detailed report (verbosity > 1) adds
# the non-hmer and per-hmer-length indel categories.
categories = ['SNP', 'Indel']
if verbosity > 1:
    categories += ['non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
    categories += [f'hmer Indel {hmer_len}' for hmer_len in range(4, 9)]
    categories += ['hmer Indel >8,<=10']

report_utils.basic_analysis(ug_hcr_data, categories, 'ug_hcr', 'ug_hcr_sec_data')

In [None]:
# Detailed report only: homozygous genotyping accuracy within the UG high confidence regions.
if verbosity > 1:
    anchor = report_utils.get_anchor('ug_hcr_hom')
    display(Markdown(f"### 3.1 Homozygous genotyping accuracy {anchor}"))
    display(Markdown("The precision and recall of called homozygous variants (where the variant was not classified as False Negative)"))
    categories = [
        'SNP', 'Indel', 'non-hmer Indel',
        'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10',
    ]
    report_utils.homozygous_genotyping_analysis(ug_hcr_data, categories, "ug_hcr_homozygous")

## 4. Performance over exome <a class="anchor" id="exome"></a>
Performance over exome regions (defined by the `exome_column_name` parameter).

[top](#top)

In [None]:
# Keep only the variants flagged in the exome column (boolean mask over the rows).
exome_mask = data[exome_column_name]
exome_data = data[exome_mask].copy()

# SNP and Indel are always reported; the detailed report (verbosity > 1) adds indel sub-categories.
categories = ['SNP', 'Indel']
if verbosity > 1:
    categories += ['non-hmer Indel', 'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10']

report_utils.basic_analysis(exome_data, categories, 'exome', 'exome_sec_data')


In [None]:
# Detailed report only: list the individual erroneous exome variants.
if verbosity > 1:
    anchor = report_utils.get_anchor('exome_indel_errors')

    # Columns to show for each variant; only those present in this dataset are used.
    candidate_columns = ['alleles', 'call', 'base', 'gt_ultima', 'gt_ground_truth', 'ad',
                         'max_vaf', 'ug_hcr', 'mappability.0', 'hmer_length']
    present_columns = [x for x in candidate_columns if x in exome_data.columns]

    # Indel errors: rows flagged 'fp' whose filter is PASS, or rows flagged 'fn'.
    indel_errors = exome_data['indel'] & ((exome_data['fp'] & (exome_data['filter'] == 'PASS')) | exome_data['fn'])
    hmer_indel_errors = indel_errors & (exome_data['hmer_length'] > 0)
    non_hmer_indel_errors = indel_errors & (exome_data['hmer_length'] == 0)
    # SNP errors: non-indel rows that are not 'tp' and whose filter is PASS.
    snp_errors = ~exome_data['tp'] & ~exome_data['indel'] & (exome_data['filter'] == 'PASS')

    # Scope the display options to this cell. Setting them globally leaked the
    # '{:.2f}' float format (and unlimited rows) into every later cell, so the
    # accuracy tables of the following sections were no longer shown as percentages.
    with pd.option_context('display.max_rows', None, 'display.float_format', '{:.2f}'.format):
        display(Markdown(f'### Hmer INDEL Errors: {anchor}'))
        display(exome_data[hmer_indel_errors][present_columns])
        display(Markdown(f'### Non-hmer INDEL Errors: {anchor}'))
        display(exome_data[non_hmer_indel_errors][present_columns])
        anchor = report_utils.get_anchor('exome_snp_errors')
        display(Markdown(f'### SNP Errors: {anchor}'))
        # Only the first 20 SNP errors are shown.
        display(exome_data[snp_errors][present_columns].head(20))

In [None]:
# Section 5 (detailed report only): shown when both the coverage and mappability columns exist.
d = data
if 'well_mapped_coverage' in d.columns and 'mappability.0' in d.columns and verbosity > 1:
    anchor = report_utils.get_anchor('well_mapped_coverage')
    display(Markdown(f"## 5. Performance over regions with coverage>=20 and excluding areas with mappability {anchor}"))
    # Keep rows with well-mapped coverage of at least 20 whose 'mappability.0' flag is set.
    good_cov_data = d[(d['well_mapped_coverage'] >= 20) & (d['mappability.0'])].copy()
    categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
                  'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10']
    df = report_utils.basic_analysis(good_cov_data, categories, 'good_cvg_data')

    display(Markdown("### 5.1 Homozygous genotyping accuracy"))
    # Spelling fixed ("classified"), matching the same sentence in sections 2.2 and 3.1.
    display(Markdown("The precision and recall of called homozygous variants (where the variant was not classified as False Negative)"))
    categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
    report_utils.homozygous_genotyping_analysis(good_cov_data, categories, "good_cvg_data_homozygous")

In [None]:
# Section 6 (detailed report only): shown when the data has a 'callable' column.
d = data
if 'callable' in d.columns and verbosity > 1:
    anchor = report_utils.get_anchor('callable')
    display(Markdown(f"## 6. Performance over callable regions {anchor}"))
    # Keep only the rows flagged as callable.
    callable_data = d[d['callable']].copy()
    categories = (
        ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
        + [f'hmer Indel {hmer_len}' for hmer_len in range(4, 9)]
        + ['hmer Indel >8,<=10', 'hmer Indel >10,<=12', 'hmer Indel >12,<=14',
           'hmer Indel >15,<=19', 'hmer Indel >=20']
    )
    df = report_utils.basic_analysis(callable_data, categories, 'callable_data')
  

In [None]:
# Section 7 (verbosity > 2 only): indel analysis over all data, UG-HCR and exome.
# Uses ug_hcr_data and exome_data computed in the earlier sections.
if verbosity > 2:
    anchor = report_utils.get_anchor('indel_analysis')
    # Introductory text; "line plot" was misspelled as "lint plot" in the recall bullet.
    display(Markdown(f"## 7. Indel analysis\n {anchor}"
    "For each factor, show the following histograms\n"
    "* Number of false positive variants per bin\n"
    "* Number of true positive variants per bin\n"
    "* Number of false negative variants per bin\n"
    "* precision per bin (as line plot)\n"
    "* recall per bin (as line plot)\n\n"
    "Notes:\n"
    "* Insertions / Deletions are displayed separately (two colors)\n"
    "* homopolymer deletions and non-homopolymer deletions are displayed separately (separate plots)\n\n"
    "[top](#top)"))
    anchor = report_utils.get_anchor('all_data_indel_analysis')
    display(Markdown(f"### 7.1 All data {anchor}"))
    report_utils.indel_analysis(data, 'wg')
    anchor = report_utils.get_anchor('ug_hcr_indel_analysis')
    display(Markdown(f"### 7.2 UG-HCR {anchor}"))
    report_utils.indel_analysis(ug_hcr_data, 'ug-hcr')
    anchor = report_utils.get_anchor('exome_indel_analysis')
    display(Markdown(f"### 7.3 exome {anchor}"))
    report_utils.indel_analysis(exome_data, 'exome')