## Variant Calling Report v1.2.6
## 1. Input Parameters

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nexusplt as nxp
from ugvc.reports.report_utils import ShortReportUtils, parse_config
from IPython.display import display, Markdown

# Notebook-wide pandas display settings: floats are rendered as percentages
# with two decimals, and cell contents are never truncated.
pd.set_option('display.float_format', '{:,.2%}'.format)
pd.options.display.max_colwidth = None



In [2]:
# Read the report configuration and unpack the entries used by later cells.
parameters, param_names = parse_config('var_report.config')
image_dir = 'plots'
image_prefix, reference_version, sources, h5outfile, trained_w_gt = (
    parameters[option]
    for option in ('image_prefix', 'reference_version', 'sources', 'h5outfile', 'trained_w_gt')
)

# The "concordance" table of the concordance file is registered as the 'Trained wo gt' source.
data = {
    'Trained wo gt': pd.read_hdf(parameters['h5_concordance_file'], key="concordance", mode='r'),
}
report_utils = ShortReportUtils(image_dir, image_prefix)

In [3]:
# Columns shared by both supported references; the reference-specific annotation
# columns are appended below, and rename_dict maps them to the common names
# ('LCR', 'mappability.0', 'ug_hcr') used by the rest of the report.
common_columns = ['indel', 'hmer_indel_length', 'tree_score',
                  'filter', 'blacklst', 'classify', 'classify_gt',
                  'indel_length', 'hmer_indel_nuc', 'ref',
                  'gt_ground_truth', 'well_mapped_coverage']
if reference_version == "hg38":
    columns_to_select = common_columns + ['mappability.0', 'ug_hcr', 'LCR-hs38']
    rename_dict = {'LCR-hs38': 'LCR'}
elif reference_version == "hg19":
    columns_to_select = common_columns + ['LCR-hg19_tab_no_chr',
                                          'mappability.hg19.0_tab_no_chr',
                                          'ug_hcr_hg19_no_chr']
    rename_dict = {'LCR-hg19_tab_no_chr': 'LCR',
                   'mappability.hg19.0_tab_no_chr': 'mappability.0',
                   'ug_hcr_hg19_no_chr': 'ug_hcr'}
else:
    # Without a column list the load below cannot work; fail with a clear message
    # instead of an opaque "'NoneType' object is not iterable" further down.
    raise ValueError(
        "Unsupported reference_version {!r}; expected 'hg38' or 'hg19'".format(reference_version))


# Load the concordance data for the entire genome: every table in the file except
# '/concordance' and '/input_args' is read, trimmed to the selected columns that
# it actually contains, and concatenated.
# The store is only read, so open it read-only (the HDFStore default is append mode).
with pd.HDFStore(parameters['h5_concordance_file'], mode='r') as hdf:
    keys = hdf.keys()
    wg_dfs = []
    for k in keys:
        if k in ('/concordance', '/input_args'):
            continue
        tmp = pd.read_hdf(hdf, k)
        tmp = tmp[[x for x in columns_to_select if x in tmp.columns]]
        wg_dfs.append(tmp)
wg_df = pd.concat(wg_dfs).rename(columns=rename_dict)

# Apply the same column renaming to the sources loaded so far, then register the
# whole-genome table as an additional source.
for s in sources:
    data[s] = data[s].rename(columns=rename_dict)
data['whole genome'] = wg_df

sources['whole genome'] = (parameters['h5_concordance_file'], "all")

In [None]:
# Report the mean of 'well_mapped_coverage' over the whole-genome table when that column exists.
if 'well_mapped_coverage' in data['whole genome'].columns:
    parameters['mean_var_depth'] = '{:.2f}'.format(data['whole genome']['well_mapped_coverage'].mean())
    param_names.append('mean_var_depth')

# Prefer the truth sample name stored under 'input_args' in the concordance file;
# if it cannot be read, keep the value from the config file.
try:
    args = pd.read_hdf(sources['Trained wo gt'][0], 'input_args', mode='r')
    parameters['truth_sample_name'] = args['truth_sample_name'][0]
except Exception as err:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    if 'truth_sample_name' not in parameters:
        raise KeyError(
            "'truth_sample_name' could not be read from 'input_args' "
            "and is not set in the config file") from err

param_names.append('truth_sample_name')

# Persist the parameter table (rows ordered as in param_names) and show it.
prmdf = pd.DataFrame.from_dict(parameters, orient='index', columns=['value']).reindex(param_names)
prmdf.to_hdf(h5outfile, key="parameters")

prmdf

## 2. Performance over all Data
The concordance between the variant calling results and the ground truth sample is presented below.
* Red line - precision and recall over different tree-scores.
* Red dot - precision and recall values for the chosen threshold.
* Black dot - precision and recall after filtering systematic errors (SEC).

In [None]:
# Build the "after systematic-error correction (SEC)" view of the whole-genome data.
# data_SEC stays empty (falsy) when the table has no 'blacklst' column.
sec_df = data['whole genome'].copy()
data_SEC = {}
if 'blacklst' in sec_df.columns:
    is_sec = sec_df['blacklst'].apply(report_utils.has_sec)
    sec_df.loc[is_sec, 'filter'] = "SEC"

    # SEC-flagged true positives are relabelled as false negatives ...
    sec_tp = is_sec & (sec_df['classify_gt'] == 'tp')
    sec_df.loc[sec_tp, 'classify_gt'] = "fn"

    # ... and SEC-flagged false positives are dropped.
    sec_fp = is_sec & (sec_df['classify_gt'] == 'fp')
    sec_df_new = sec_df[~sec_fp]

    data_SEC = {
        'whole genome': sec_df_new,
        'Trained wo gt': data['Trained wo gt'].copy(),
    }

In [None]:

# Performance on all data for the primary variant categories; the plot shows
# only the whole-genome source.
categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
optTab1, opt_res, perf_curve = report_utils.get_performance(data, categories, sources)
SEC_opt_tab1 = None
SEC_opt_tab2 = None

# The SEC results are passed to the plot only when SEC data is available.
sec_kwargs = {}
if data_SEC:
    SEC_opt_tab1, SEC_opt_res, SEC_perf_curve = report_utils.get_performance(data_SEC, categories, sources)
    sec_kwargs['opt_res_sec'] = SEC_opt_res

report_utils.plot_performance(perf_curve,
                              opt_res,
                              categories,
                              img='all.primary',
                              sources={'whole genome': sources['whole genome']},
                              **sec_kwargs)

In [None]:
# Performance on all data for the individual hmer-length categories; the plot
# shows only the whole-genome source.
categories = ['hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10']
optTab2, opt_res, perf_curve = report_utils.get_performance(data, categories, sources)

# The SEC results are passed to the plot only when SEC data is available.
sec_kwargs = {}
if data_SEC:
    SEC_opt_tab2, SEC_opt_res, SEC_perf_curve = report_utils.get_performance(data_SEC, categories, sources)
    sec_kwargs['opt_res_sec'] = SEC_opt_res

# Use the '.hmers' tag, as the other hmer plots of this report do ('hicvg.hmers',
# 'all.hmers'). The previous 'all.primary' tag was the one already used for the
# primary-category plot. NOTE(review): img is presumably the name the plot is
# saved under - confirm in ShortReportUtils.plot_performance.
report_utils.plot_performance(perf_curve,
                              opt_res,
                              categories,
                              img='all.hmers',
                              sources={'whole genome': sources['whole genome']},
                              **sec_kwargs)

In [None]:
pd.options.display.float_format = '{:,.2%}'.format

# Stack the primary and hmer tables per source, then place the sources side by side.
optTab = {s: pd.concat([optTab1[s], optTab2[s]]) for s in sources}
df = pd.concat(list(optTab.values()), axis=1, keys=list(sources))

# Only the whole-genome columns are stored and displayed.
wg_table = df[['whole genome']]
wg_table.to_hdf(h5outfile, key="all_data")

if data_SEC:
    df_SEC = pd.concat([SEC_opt_tab1['whole genome'], SEC_opt_tab2['whole genome']])
    df_SEC.to_hdf(h5outfile, key="sec_data")
    side_by_side = pd.concat([df['whole genome'], df_SEC],
                             keys=['Whole genome', 'After filtering systematic errors'],
                             axis=1)
    display(side_by_side)
else:
    display(wg_table)

### 2.1 Homozygous genotyping accuracy

The precision and recall of called homozygous variants (where the variant was not classified as a False Negative).

In [None]:
categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']

# Keep rows whose ground-truth genotype is (1, 1) and whose 'classify' label is not 'fn'.
hmzData = {}
for s in sources:
    d = data[s]
    truth_is_hom = d['gt_ground_truth'] == (1, 1)
    not_missed = d['classify'] != 'fn'
    hmzData[s] = d[truth_is_hom & not_missed]

optTab, opt_res, perf_curve = report_utils.get_performance(hmzData, categories, sources)
df = pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))

# Only the whole-genome columns are stored and displayed.
wg_table = df[['whole genome']]
wg_table.to_hdf(h5outfile, key="all_data_homozygous")

display(wg_table)

### 2.2 Stratified by base

#### (A,T)

In [None]:
categories = ['SNP', 'Indel', 'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10', 'hmer Indel 8']

# Keep non-indel rows whose 'ref' is one of the two bases, plus rows with a
# positive hmer indel length whose hmer nucleotide is one of the two bases.
b = ('A', 'T')
baseData = {}
for s in sources:
    d = data[s]
    ref_on_base = (d['ref'] == b[0]) | (d['ref'] == b[1])
    hmer_on_base = (d['hmer_indel_nuc'] == b[0]) | (d['hmer_indel_nuc'] == b[1])
    non_indel_rows = (d['indel'] == False) & ref_on_base
    hmer_rows = (d['hmer_indel_length'] > 0) & hmer_on_base
    baseData[s] = d[non_indel_rows | hmer_rows]

optTab1, opt_res, perf_curve = report_utils.get_performance(baseData, categories, sources)

# Tag every row label with the base pair so the tables of both base pairs can be stacked later.
for s in sources:
    relabel = {a: '{} ({}/{})'.format(a, *b) for a in optTab1[s].index}
    optTab1[s].rename(index=relabel, inplace=True)

report_utils.plot_performance(perf_curve, opt_res, categories, sources={'whole genome': sources['whole genome']})

#### (G,C)

In [None]:
categories = ['SNP', 'Indel', 'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10', 'hmer Indel 6']

# Keep non-indel rows whose 'ref' is one of the two bases, plus rows with a
# positive hmer indel length whose hmer nucleotide is one of the two bases.
b = ('C', 'G')
baseData = {}
for s in sources:
    d = data[s]
    ref_on_base = (d['ref'] == b[0]) | (d['ref'] == b[1])
    hmer_on_base = (d['hmer_indel_nuc'] == b[0]) | (d['hmer_indel_nuc'] == b[1])
    non_indel_rows = (d['indel'] == False) & ref_on_base
    hmer_rows = (d['hmer_indel_length'] > 0) & hmer_on_base
    baseData[s] = d[non_indel_rows | hmer_rows]

optTab2, opt_res, perf_curve = report_utils.get_performance(baseData, categories, sources)

# Tag every row label with the base pair so the tables of both base pairs can be stacked later.
for s in sources:
    relabel = {a: '{} ({}/{})'.format(a, *b) for a in optTab2[s].index}
    optTab2[s].rename(index=relabel, inplace=True)

report_utils.plot_performance(perf_curve, opt_res, categories, sources={'whole genome': sources['whole genome']})

In [None]:
# Stack the two per-base tables for each source, then place the sources side by side.
optTab = {s: pd.concat([optTab1[s], optTab2[s]]) for s in sources}
df = pd.concat(list(optTab.values()), axis=1, keys=list(sources))

# Only the whole-genome columns are stored and displayed.
wg_table = df[['whole genome']]
wg_table.to_hdf(h5outfile, key="all_data_per_base")
display(wg_table)

## 3. Performance over UG high confidence regions

Variant calling performance excluding genomic areas where UG performance is poor, i.e.:
- Homopolymers - runs of length 11 bp and above, padded with four bases around the genomic coordinates,
- AT-rich regions - bases where the GC content of the surrounding 40 bases is lower than 5%,
- Tandem repeats,
- Low mapping quality - regions that are covered by at least 20 reads, but less than 10% of these reads are aligned with mapping quality > 20,
- High coverage variability - regions with coverage that is highly variable between samples (std/mean > 0.5)

In [None]:
# Only the whole-genome table is restricted to rows with ug_hcr == True;
# every other source is copied unchanged.
filtData = {
    s: (data[s].query("ug_hcr==True") if s == 'whole genome' else data[s]).copy()
    for s in sources
}


In [None]:
# Build the "after systematic-error correction (SEC)" view of the UG-HCR data.
# data_SEC stays None (falsy) when the table has no 'blacklst' column.
sec_df = filtData['whole genome'].copy()
data_SEC = None
if 'blacklst' in sec_df.columns:
    is_sec = sec_df['blacklst'].apply(report_utils.has_sec)
    sec_df.loc[is_sec, 'filter'] = "SEC"

    # SEC-flagged true positives are relabelled as false negatives ...
    sec_tp = is_sec & (sec_df['classify_gt'] == 'tp')
    sec_df.loc[sec_tp, 'classify_gt'] = "fn"

    # ... and SEC-flagged false positives are dropped.
    sec_fp = is_sec & (sec_df['classify_gt'] == 'fp')
    sec_df_new = sec_df[~sec_fp]

    data_SEC = {
        'whole genome': sec_df_new,
        'Trained wo gt': filtData['Trained wo gt'].copy(),
    }

In [None]:
# Performance on the UG-HCR data for the primary variant categories; the plot
# shows only the whole-genome source.
categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']
optTab1, opt_res, perf_curve = report_utils.get_performance(filtData, categories, sources)

# The SEC results are passed to the plot only when SEC data is available.
sec_kwargs = {}
if data_SEC:
    SEC_opt_tab1, SEC_opt_res, SEC_perf_curve = report_utils.get_performance(data_SEC, categories, sources)
    sec_kwargs['opt_res_sec'] = SEC_opt_res

report_utils.plot_performance(perf_curve, opt_res, categories,
                              img='all.primary',
                              sources={'whole genome': sources['whole genome']},
                              **sec_kwargs)

In [None]:
# Performance on the UG-HCR data for the individual hmer-length categories; the
# plot shows only the whole-genome source.
categories = ['hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10']
optTab2, opt_res, perf_curve = report_utils.get_performance(filtData, categories, sources)

# The SEC results are passed to the plot only when SEC data is available.
sec_kwargs = {}
if data_SEC:
    SEC_opt_tab2, SEC_opt_res, SEC_perf_curve = report_utils.get_performance(data_SEC, categories, sources)
    sec_kwargs['opt_res_sec'] = SEC_opt_res

# Use the '.hmers' tag, as the other hmer plots of this report do ('hicvg.hmers',
# 'all.hmers'). The previous 'all.primary' tag was the one already used for the
# primary-category plot. NOTE(review): img is presumably the name the plot is
# saved under - confirm in ShortReportUtils.plot_performance.
report_utils.plot_performance(perf_curve, opt_res, categories,
                              img='all.hmers',
                              sources={'whole genome': sources['whole genome']},
                              **sec_kwargs)

In [None]:
pd.options.display.float_format = '{:,.2%}'.format

# Stack the primary and hmer tables per source, then place the sources side by side.
optTab = {s: pd.concat([optTab1[s], optTab2[s]]) for s in sources}
df = pd.concat(list(optTab.values()), axis=1, keys=list(sources))

# Only the whole-genome columns are stored and displayed.
wg_table = df[['whole genome']]
wg_table.to_hdf(h5outfile, key="ug_hcr")

if data_SEC:
    df_SEC = pd.concat([SEC_opt_tab1['whole genome'], SEC_opt_tab2['whole genome']])
    df_SEC.to_hdf(h5outfile, key="ug_hcr_sec_data")
    side_by_side = pd.concat([df['whole genome'], df_SEC],
                             keys=['Whole genome', 'After filtering systematic errors'],
                             axis=1)
    display(side_by_side)
else:
    display(wg_table)

### 3.1 Homozygous genotyping accuracy

In [None]:
categories = ['SNP', 'Indel', 'non-hmer Indel', 'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10']

# Keep rows whose ground-truth genotype is (1, 1) and whose 'classify' label is not 'fn'.
hmzData = {}
for s in sources:
    d = filtData[s]
    truth_is_hom = d['gt_ground_truth'] == (1, 1)
    not_missed = d['classify'] != 'fn'
    hmzData[s] = d[truth_is_hom & not_missed]

optTab, opt_res, perf_curve = report_utils.get_performance(hmzData, categories, sources)
df = pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))

# Only the whole-genome columns are stored and displayed.
wg_table = df[['whole genome']]
wg_table.to_hdf(h5outfile, key="ug_hcr_homozygous")
display(wg_table)

## 4. Performance over regions with coverage>=20 and excluding areas with low mappability

In [None]:
if 'well_mapped_coverage' not in data['whole genome'].columns:
    print("No coverage data available")
else:
    # Keep rows with well-mapped coverage of at least 20 whose 'mappability.0' flag is set.
    filtData = {}
    for s in sources:
        d = data[s]
        enough_coverage = d['well_mapped_coverage'] >= 20
        filtData[s] = d[enough_coverage & (d['mappability.0'])]

    # Same evaluation for the two category sets; each plot gets its own image tag
    # and shows only the whole-genome source.
    category_sets = (
        (['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8'],
         'hicvg.primary'),
        (['hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10'],
         'hicvg.hmers'),
    )
    tables = []
    for categories, img_tag in category_sets:
        table, opt_res, perf_curve = report_utils.get_performance(filtData, categories, sources)
        report_utils.plot_performance(perf_curve,
                                      opt_res,
                                      categories,
                                      img=img_tag,
                                      sources={'whole genome': sources['whole genome']})
        tables.append(table)
    optTab1, optTab2 = tables

    # Stack both tables per source; only the whole-genome columns are stored and displayed.
    optTab = {s: pd.concat([optTab1[s], optTab2[s]]) for s in sources}
    df = pd.concat(list(optTab.values()), axis=1, keys=list(sources))
    wg_table = df[['whole genome']]
    wg_table.to_hdf(h5outfile, key="good_cvg_data")
    display(wg_table)

### 4.1 Homozygous genotyping accuracy

In [None]:
categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8']

# filtData only holds the coverage-filtered tables when coverage data exists.
# Without this guard the cell would reuse the filtData left over from the UG-HCR
# section and store it under the "good_cvg_data_homozygous" key.
if 'well_mapped_coverage' in data['whole genome'].columns:
    # Keep rows whose ground-truth genotype is (1, 1) and whose 'classify' label is
    # not 'fn' - the same selection as the other homozygous-accuracy cells, which
    # filter on 'classify' rather than 'classify_gt'.
    hmzData = {}
    for s in sources:
        d = filtData[s]
        hmzData[s] = d[(d['gt_ground_truth'] == (1, 1)) & (d['classify'] != 'fn')]
    optTab, opt_res, perf_curve = report_utils.get_performance(hmzData, categories, sources)
    df = pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))

    # Only the whole-genome columns are stored and displayed.
    df[['whole genome']].to_hdf(h5outfile, key="good_cvg_data_homozygous")
    display(df[['whole genome']])
else:
    print("No coverage data available")

In [None]:
# A block comparing the models trained with and without ground truth. Only shown if h5_model_file is provided in the config file

In [None]:
if trained_w_gt is not None:
    display(Markdown("""
##  5. Trained with and without Ground Truth
<ul>
<li><b>Trained wo gt - Trained without ground truth </b></li>
Random forest model trained on chromosome 9 using known variants in dbSNP and on common fp variants
<li><b>Trained with gt - Trained with ground truth</b></li>
Simple threshold model trained on chromosome 9 using its own ground truth
</ul>
"""))
    # Register the "trained with ground truth" source and load its table with
    # the same column renaming as the other sources.
    sources['Trained with gt'] = (trained_w_gt, "scored_concordance")
    with_gt_df = pd.read_hdf(trained_w_gt, "scored_concordance", mode='r')
    data['Trained with gt'] = with_gt_df.rename(columns=rename_dict)

    # Same evaluation for the two category sets; each plot compares the two
    # trained models and carries a legend.
    category_sets = (
        (['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8'],
         'all.primary'),
        (['hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10'],
         'all.hmers'),
    )
    tables = []
    for categories, img_tag in category_sets:
        table, opt_res, perf_curve = report_utils.get_performance(data, categories, sources)
        report_utils.plot_performance(perf_curve, opt_res, categories, img=img_tag,
                                      sources={'Trained wo gt': sources['Trained wo gt'],
                                               'Trained with gt': sources['Trained with gt']},
                                      legend=True)
        tables.append(table)
    optTab1, optTab2 = tables

    pd.options.display.float_format = '{:,.2%}'.format

    # From here on `sources` holds only the two trained models.
    sources = {'Trained wo gt': sources['Trained wo gt'], 'Trained with gt': sources['Trained with gt']}
    optTab = {s: pd.concat([optTab1[s], optTab2[s]]) for s in sources}
    df = pd.concat(list(optTab.values()), axis=1, keys=list(sources))
    df.to_hdf(h5outfile, key="trained_w_wo_gt")
    display(df)

In [None]:
if ('well_mapped_coverage' in data['whole genome'].columns) and (trained_w_gt is not None):
    filtData = {}
    d = data['Trained with gt']
    filtData['Trained with gt'] = d[(d['well_mapped_coverage'] >= 20) &
                                    (d['mappability.0'])]

    categories = ['SNP', 'Indel', 'non-hmer Indel', 'non-hmer Indel w/o LCR', 'hmer Indel <=4', 'hmer Indel >4,<=8',
                  'hmer Indel 4', 'hmer Indel 5', 'hmer Indel 6', 'hmer Indel 7', 'hmer Indel 8', 'hmer Indel >8,<=10']
    optTab, opt_res, perf_curve = report_utils.get_performance(filtData, categories, {'Trained with gt': sources['Trained with gt']})

    %matplotlib agg
    d = optTab['Trained with gt'][['max recall', 'recall', 'precision']]
    labels = ['SNP', 'Indel', 'nhmer', 'nhmer w/o LCR', 'hmer 2-4', 'hmer 5-8', 'hmer 4', 'hmer 5', 'hmer 6', 'hmer 7',
              'hmer 8', 'hmer 9-10']
    fig = plt.figure()
    ax = d.plot()
    plt.xticks(np.arange(len(d.index)), rotation=30, ha='right')
    ax.set_xticklabels(labels)
    plt.ylim([0.4, 1.05])
    plt.grid()
    plt.title('Cvg>20X, Trained variant calls')
    plt.tight_layout()
    nxp.save(fig, image_prefix + 'summary', 'png', outdir=image_dir)
    plt.close(fig)