In [None]:
from IPython.display import Image, display

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import os
import sys

# Make a local VariantCalling checkout importable, looking first under the
# user's home directory and then at the filesystem root.
# expanduser("~") is used instead of os.environ["HOME"] so that an unset HOME
# does not abort the notebook with a KeyError before the second candidate is tried.
for path in [
    os.path.join(os.path.expanduser("~"), "proj/VariantCalling"),
    "/VariantCalling",
]:
    # Index 1 keeps sys.path[0] in place while taking precedence over later entries.
    if os.path.isdir(path) and path not in sys.path:
        sys.path.insert(1, path)
from ugvc.mrd.balanced_strand_utils import (
    BalancedStrandAdapterVersions,
    HistogramColumnNames,
    STRAND_RATIO_LOWER_THRESH,
    STRAND_RATIO_UPPER_THRESH,
    MIN_TOTAL_HMER_LENGTHS_IN_LOOPS,
    MAX_TOTAL_HMER_LENGTHS_IN_LOOPS,
    _assert_adapter_version_supported
)

In [None]:
# Width passed to every IPython.display.Image rendered in this report
IMAGE_WIDTH = 800

In [None]:
# input parameters
# The None defaults are placeholders (presumably overridden by whatever executes
# this notebook - TODO confirm). The validation cell below requires
# adapter_version, statistics_h5, trimmer_histogram_png and strand_ratio_category_png.
adapter_version = None  # looked up against BalancedStrandAdapterVersions.<member>.value keys
statistics_h5 = None  # HDF5 file read with pd.read_hdf / pd.HDFStore
trimmer_histogram_png = None  # required image, shown in the QC plots section
strand_ratio_png = None  # optional image, skipped when falsy
strand_ratio_category_png = None  # required image, shown in the QC plots section
strand_ratio_category_concordance_png = None  # optional image, skipped when None
# The four thresholds below are only interpolated into the descriptive text in this notebook
sr_lower = STRAND_RATIO_LOWER_THRESH
sr_upper = STRAND_RATIO_UPPER_THRESH
min_total_hmer_lengths_in_loops = MIN_TOTAL_HMER_LENGTHS_IN_LOOPS
max_total_hmer_lengths_in_loops = MAX_TOTAL_HMER_LENGTHS_IN_LOOPS
illustration_file = None  # optional image, skipped when None

In [None]:
# Fail fast and name every required parameter that was left unset, so a
# misconfigured run can be diagnosed from the error message alone.
required_inputs = {
    "statistics_h5": statistics_h5,
    "adapter_version": adapter_version,
    "trimmer_histogram_png": trimmer_histogram_png,
    "strand_ratio_category_png": strand_ratio_category_png,
}
missing_inputs = [name for name, value in required_inputs.items() if value is None]
if missing_inputs:
    raise ValueError(
        f"Missing required input parameters: {', '.join(missing_inputs)}"
    )

In [None]:
# Check the adapter version before any of the version-keyed lookups below
# (presumably raises for unsupported values - confirm in ugvc.mrd.balanced_strand_utils).
_assert_adapter_version_supported(adapter_version)

# Main statistics

In [None]:
# Read the headline statistics table and drop entries with missing values
shortlist_raw = pd.read_hdf(statistics_h5, key="stats_shortlist")
df_sorter_stats_shortlist = shortlist_raw.dropna()

# Render every value with two decimals, then pad with blank lines as a separator
shortlist_styler = df_sorter_stats_shortlist.style.format("{:.2f}")
display(shortlist_styler)
print("\n\n\n\n")

- "MIXED read mean coverage" is the coverage of reads where both loops were detected as MIXED
- "% MIXED (both tags) of all reads" is the fraction of reads where both loops were detected as MIXED, out of all the reads
- "% MIXED reads (both tags) where end was reached" is the fraction of reads where both loops were detected as MIXED, out of the reads where the read end was reached so that the end loop could be measured

# QC plots

In [None]:
# Category barplot first, then the caption matching the adapter version
display(Image(strand_ratio_category_png, width=IMAGE_WIDTH))

# Caption fragments; each starts with a newline so they can be concatenated directly
category_intro = "\nThis barplot shows the ratio of each category type in the data according to the spec in the top of the file."
per_loop_note = "\nThe categories are reported separately for the start- and end-loops."
end_reached_note = "\nThe end loop breakdown is shown only for the reads that reached the end loop."
two_loop_caption = category_intro + per_loop_note + end_reached_note

caption = {
    BalancedStrandAdapterVersions.LA_v5.value: category_intro,
    BalancedStrandAdapterVersions.LA_v6.value: (
        category_intro
        + "\nThe end loop breakdown is only for the reads that reached the end loop."
    ),
    BalancedStrandAdapterVersions.LA_v5and6.value: two_loop_caption,
    BalancedStrandAdapterVersions.LA_v7.value: two_loop_caption,
}
print(caption[adapter_version])
print("\n\n\n\n")

In [None]:
# LA_v5 and LA_v6 share one caption; adapter versions absent from this mapping
# get no strand-ratio plot at all.
single_loop_caption = "\nThis barplot shows the MINUS/PLUS ratio distribution for all the reads."
caption = {
    BalancedStrandAdapterVersions.LA_v5.value: single_loop_caption,
    BalancedStrandAdapterVersions.LA_v6.value: single_loop_caption,
    BalancedStrandAdapterVersions.LA_v5and6.value: "\nThis barplot shows the MINUS/PLUS ratio distribution for all the reads, for each loop separately.",
}

# Show the plot only when a caption exists for this adapter and an image was supplied
has_caption = adapter_version in caption
if has_caption and strand_ratio_png:
    display(Image(strand_ratio_png, width=IMAGE_WIDTH))
    print(caption[adapter_version])
    print("\n\n\n\n")

In [None]:
# The concordance plot is optional and only has captions for two-loop adapters.
if strand_ratio_category_concordance_png is not None:
    display(Image(strand_ratio_category_concordance_png, width=IMAGE_WIDTH))
    caption = {
        BalancedStrandAdapterVersions.LA_v5and6.value: """
These plots show the concordance between the strand ratio categories of the start-loop and end-loop. Each loop is assigned a category separately, and the concordance is plotted. The top plot includes all the reads, including those with END_UNREACHED, while the bottom includes reads where the end was reached only.""",
        BalancedStrandAdapterVersions.LA_v7.value: """
These plots show the concordance between the strand ratio categories of the start-loop and end-loop. Each loop is assigned a category separately, and the concordance is plotted. The top plot includes all the reads, including those with END_UNREACHED, while the bottom includes reads where the end was reached only.""",
    }
    # Guard the lookup: an adapter version without a caption here would
    # otherwise abort the report with a KeyError after the image is shown.
    if adapter_version in caption:
        print(caption[adapter_version])
print("\n\n\n\n")

In [None]:
# Trimmer histogram of the raw homopolymer calls, with a caption per adapter version
display(Image(trimmer_histogram_png, width=IMAGE_WIDTH))
caption = {
    BalancedStrandAdapterVersions.LA_v5.value: """
This plot shows the homopolymers called in the A and T hmers in the start loop.""",
    BalancedStrandAdapterVersions.LA_v6.value: """
This plot shows the homopolymers called in the A and T hmers in the end loop.""",
    BalancedStrandAdapterVersions.LA_v5and6.value: """
This plot shows the homopolymers called in the A and T hmers in the start loop (left) and end loop (right).""",
    BalancedStrandAdapterVersions.LA_v7.value: """
This plot shows the homopolymers called in the A, T, G and C hmers in the start loop (left) and in the T, G, C, A hmers in the end loop (right).
The loops are expected to yield:
- A signal of [1 1 1 1], AGCT and GCAT for the start and end loops, for MIXED reads
- A signal of [0 2 0 2], TTCC and CCTT for the start and end loops, for MINUS-only reads
- A signal of [2 0 2 0], AAGG and GGAA for the start and end loops, for PLUS-only reads
""",
}
print(caption[adapter_version])
print("\n\n\n\n")

# About PPM-Seq

Identifying single nucleotide variants (SNVs) is fundamental to genomics. While consensus mutation calling, requiring multiple variant-containing reads to call genetic variation, is often used, it is unsuitable for calling rare SNVs, such as in circulating tumor DNA or somatic mosaicism, where often only a single supporting read is available. Paired Plus and Minus strand Sequencing (PPM-Seq), a PCR-free library preparation technology that uniquely leverages the Ultima Genomics clonal amplification process, overcomes this challenge. Here, DNA denaturation is not required prior to clonal amplification so both native strands are clonally amplified on many sequencing beads, allowing for a linear increase in duplex recovery and scalable duplex coverage without requiring unique molecular identifiers or redundant sequencing.

In PPM-Seq, modified Ultima Genomics adapters containing mismatched homopolymers are used to detect reads that are the result of the mixture of the two native DNA strands. While some reads are amplicons of only the Plus or Minus strands and are generally of typical UG read SNV accuracy, the so-called Mixed reads exhibit much lower error rates, well below 1E-6, facilitating the accurate detection of rare SNVs. Artifactual mutations manifesting on one strand only are common sources of error in SNV detection from NGS. While beads that are amplicons of Plus or Minus strand only are exposed to these artifacts that would appear as high-quality reads, in Mixed beads they create an inconsistent signal that translates into a low quality base or read, preventing them from being read as false positive SNVs. 

This report is generated from preprocessing of the PPM-Seq sequencing data, and is intended to be used as a QC report for the library prep and sequencing run. The distribution of the MINUS/PLUS ratio and the assignment of reads to categories (MIXED/MINUS/PLUS/UNDETERMINED) are shown, along with the raw calls.

In [None]:
# The schematic is optional; render it only when a path was supplied
if illustration_file is not None:
    illustration = Image(illustration_file, width=IMAGE_WIDTH)
    display(illustration)

# PPM-Seq adapter version

In [None]:
# Spec text shared by every 6A-6A loop adapter (LA_v5, LA_v6 and PPM-Seq-v1).
# It is defined once so the three descriptions cannot drift apart; the thresholds
# are interpolated from the notebook's input parameters.
hmer_loop_spec = f"""- 0A and 6T for MINUS-only reads
- 6A and 0T for PLUS-only reads
- 3A and 3T for 50% MINUS - 50% PLUS reads

In practice, homopolymer errors are allowed according to the spec below.

The MINUS/PLUS strand ratio is calculated as:
MINUS/PLUS = T_hmer / (A_hmer + T_hmer)

The sum of the lengths of the hmers is:
MINUS+PLUS = A_hmer + T_hmer

Values with {min_total_hmer_lengths_in_loops}<=MINUS+PLUS<={max_total_hmer_lengths_in_loops} are considered as valid, the rest are denoted as UNDETERMINED.
Following that filter, the MINUS/PLUS strand ratio is interpreted to read categories as follows:
- MINUS-only: MINUS/PLUS = 1
- PLUS-only: MINUS/PLUS = 0
- MIXED: {sr_lower} <= MINUS/PLUS <= {sr_upper}
- UNDETERMINED: (0 < MINUS/PLUS < {sr_lower}) or ({sr_upper} < MINUS/PLUS < 1)"""

# One description per adapter version, keyed by the enum member's value.
adapter_version_desc = {
    BalancedStrandAdapterVersions.LA_v5.value: f"""
The LA_v5 (LA=Loop Adapter) adapter is used in this sample. It is composed of a 6A-6A loop in the start of the read, so that reads are expected to ideally yield:
{hmer_loop_spec}
""",
    BalancedStrandAdapterVersions.LA_v6.value: f"""
The LA_v6 (LA=Loop Adapter) adapter is used in this sample. It is composed of a 6A-6A loop in the end of the read, so that reads are expected to ideally yield:
{hmer_loop_spec}

Additionally, since the loop is at the end of the reads it is not necessarily reached, in which case the read is annotated as END_UNREACHED.
""",
    BalancedStrandAdapterVersions.LA_v5and6.value: f"""
The PPM-Seq-v1 adapter is used in this sample. It is composed of a 6A-6A loop in the start and in the end of the read, so that reads are expected to ideally yield in each loop:
{hmer_loop_spec}

Additionally, since the end loop is at the end of the reads it is not necessarily reached, in which case the loop is annotated as END_UNREACHED.""",
    BalancedStrandAdapterVersions.LA_v7.value: """
The PPM-Seq-v2 adapter is used in this sample. It is composed of an AAGG-AAGG loop in the start and a GGAA-GGAA loop in the end of the read, so that reads are expected to ideally yield in each loop:
- TTCC and CCTT for MINUS-only reads
- AAGG and GGAA for PLUS-only reads
- AGCT and GCAT for 50% MINUS - 50% PLUS reads

Up to 2 homopolymer errors are allowed, as long as the distance from the second best fit is at least 4.

Additionally, since the end loop is at the end of the reads it is not necessarily reached, in which case the loop is annotated as END_UNREACHED.""",
}
print(adapter_version_desc[adapter_version])
print("\n\n\n\n")

# Detailed statistics

In [None]:
# Tables already rendered in the "Main statistics" section. That section reads the
# shortlist from key "stats_shortlist", so that name must be skipped here; the
# "sorter_stats_shortlist" name from the previous skip list is kept as well.
already_shown = {"stats_shortlist", "sorter_stats_shortlist"}

# Collect the keys first so the store is closed before the tables are read one by one
with pd.HDFStore(statistics_h5) as s:
    keys = s.keys()
for key in sorted(keys):
    # Compare without the leading "/" so both "/name" and "name" forms match
    table_name = key.lstrip("/")
    if table_name in already_shown:
        continue
    print(f"\n\nStatistics table:{key.replace('/', ' ')}")
    df = pd.read_hdf(statistics_h5, key=key)
    if table_name == "trimmer_histogram":
        # Largest normalized counts first, so head(50) shows the most frequent entries
        df = df.sort_values(
            HistogramColumnNames.COUNT_NORM.value, ascending=False
        ).reset_index(drop=True)
    display(df.head(50))
print("\n\n\n\n")