# Introduction

In [None]:
from IPython.display import Image, display

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import os
import sys

# Make the VariantCalling checkout importable: try a checkout under the user's
# home directory first, then a root-level "/VariantCalling" directory.
# os.path.expanduser("~") resolves to $HOME when it is set and, unlike
# os.environ["HOME"], does not raise KeyError when it is not.
for path in [
    os.path.join(os.path.expanduser("~"), "proj/VariantCalling"),
    "/VariantCalling",
]:
    # Only existing directories are added, and never twice; inserting at index 1
    # leaves the current first entry of sys.path in place.
    if os.path.isdir(path) and path not in sys.path:
        sys.path.insert(1, path)
from ugvc.mrd.balanced_strand_utils import (
    BalancedStrandAdapterVersions,
    BalancedCategories,
    HistogramColumnNames,
    STRAND_RATIO_LOWER_THRESH,
    STRAND_RATIO_UPPER_THRESH,
    MIN_TOTAL_HMER_LENGTHS_IN_TAGS,
    MAX_TOTAL_HMER_LENGTHS_IN_TAGS,
    _assert_adapter_version_supported
)

In [None]:
# Value passed as the `width` argument of every IPython `Image` displayed in this notebook.
IMAGE_WIDTH = 800

In [None]:
# input parameters
# The None placeholders below must be overridden before the notebook is run;
# a later cell raises ValueError if a required one is still None.
# NOTE(review): presumably injected by a notebook parameterization tool - confirm.

# Key used to select the per-version description and captions
# (one of the BalancedStrandAdapterVersions `.value` entries). Required.
adapter_version = None
# HDF5 file holding the statistics tables read with pandas. Required.
statistics_h5 = None
# Images embedded in the report via IPython `Image`. Required.
trimmer_histogram_png = None
strand_ratio_png = None
strand_ratio_category_png = None
# Optional image; displayed only when it is not None.
strand_ratio_category_concordance_png = None
# Bounds interpolated into the description text as the MIXED range of LIG/HYB.
sr_lower = STRAND_RATIO_LOWER_THRESH
sr_upper = STRAND_RATIO_UPPER_THRESH
# Bounds interpolated into the description text as the valid range of LIG+HYB.
min_total_hmer_lengths_in_tags = MIN_TOTAL_HMER_LENGTHS_IN_TAGS
max_total_hmer_lengths_in_tags = MAX_TOTAL_HMER_LENGTHS_IN_TAGS
# Optional illustration image; displayed only when it is not None.
illustration_file = None

In [None]:
# Fail fast when a required input was left at its None placeholder, and name
# every missing parameter so the caller knows exactly what to supply.
required_inputs = {
    "statistics_h5": statistics_h5,
    "adapter_version": adapter_version,
    "trimmer_histogram_png": trimmer_histogram_png,
    "strand_ratio_png": strand_ratio_png,
    "strand_ratio_category_png": strand_ratio_category_png,
}
missing_inputs = [name for name, value in required_inputs.items() if value is None]
if missing_inputs:
    raise ValueError(f"Missing required input files: {', '.join(missing_inputs)}")

In [None]:
# Check the requested adapter version with the helper imported from
# ugvc.mrd.balanced_strand_utils before any report content is produced.
# NOTE(review): presumably raises for an unsupported version - confirm against the helper.
_assert_adapter_version_supported(adapter_version)

Balanced strand sequencing yields a measurement of the ratio between amplicons originating from one strand of the original dsDNA molecule vs the other. The strands are denoted LIG and HYB, for the strand that undergoes ligation to the bead primer and the strand that hybridizes to the bead primer, respectively. The LIG/HYB strand ratio is measured via mismatched homopolymer sequences in the adapter, at the start and potentially at the end of the read. This ratio is an indication of the expected SNV error rate: reads that are mixed (LIG/HYB ~ 50%) are expected to have a lower error rate due to an innate correction of library artifacts appearing on one strand only, while LIG-only or HYB-only reads are exposed to such errors.

This report is generated from preprocessing of the balanced strand sequencing data, and is intended to be used as a QC report for the library prep and sequencing run. The report shows the distribution of the LIG/HYB ratio, the assignment of beads to categories (MIXED/LIG/HYB/UNDETERMINED), along with the raw calls.

In [None]:
# The illustration is optional: it is rendered only when a file was supplied.
if illustration_file is not None:
    illustration = Image(illustration_file, width=IMAGE_WIDTH)
    display(illustration)

# Adapter version

In [None]:
# The expected hmer counts and the category spec are identical for every adapter
# version, so they are written once here and embedded in each description below.
category_spec_text = f"""- 0A and 6T for LIG-only reads
- 6A and 0T for HYB-only reads
- 3A and 3T for 50% LIG - 50% HYB reads

In practice, homopolymer errors are allowed according to the spec below.

The LIG/HYB strand ratio is calculated as:
LIG/HYB = T_hmer / (A_hmer + T_hmer)

The sum of the lengths of the hmers is:
LIG+HYB = A_hmer + T_hmer

Values with {min_total_hmer_lengths_in_tags}<=LIG+HYB<={max_total_hmer_lengths_in_tags} are considered as valid, the rest are denoted as UNDETERMINED.
Following that filter, the LIG/HYB strand ratio is interpreted to read categories as follows:
- LIG-only: LIG/HYB = 1
- HYB-only: LIG/HYB = 0
- MIXED: {sr_lower} <= LIG/HYB <= {sr_upper}
- UNDETERMINED: (0 < LIG/HYB < {sr_lower}) or ({sr_upper} < LIG/HYB < 1)"""

# Per-version description: a version-specific opening sentence, the shared spec,
# and (for versions with an end tag) a note about END_UNREACHED.
adapter_version_desc = {
    BalancedStrandAdapterVersions.LA_v5.value: f"""
The LA_v5 (LA=Loop Adapter) adapter is used in this sample. It is composed of a 6A-6A loop in the start of the read, so that reads are expected to ideally yield:
{category_spec_text}
""",
    BalancedStrandAdapterVersions.LA_v6.value: f"""
The LA_v6 (LA=Loop Adapter) adapter is used in this sample. It is composed of a 6A-6A loop in the end of the read, so that reads are expected to ideally yield:
{category_spec_text}

Additionally, since the tag is at the end of the reads it is not necessarily reached, in which case the read is annotated as END_UNREACHED.
""",
    BalancedStrandAdapterVersions.LA_v5and6.value: f"""
The LA_v5+6 (LA=Loop Adapter) adapter is used in this sample. It is composed of a 6A-6A loop in the start and in the end of the read, so that reads are expected to ideally yield in each tag:
{category_spec_text}

Additionally, since the end tag is at the end of the reads it is not necessarily reached, in which case the tag is annotated as END_UNREACHED.""",
}
# Print the description matching the adapter version chosen in the parameters cell.
print(adapter_version_desc[adapter_version])
print("\n\n\n\n")

# Main statistics

In [None]:
# Load the shortlist statistics table and render it with two-decimal formatting.
df_sorter_stats_shortlist = pd.read_hdf(statistics_h5, key="stats_shortlist")
shortlist_styler = df_sorter_stats_shortlist.style.format("{:.2f}")
display(shortlist_styler)
print("\n\n\n\n")

# Strand ratio plots

In [None]:
# Caption text for the category barplot, keyed by adapter version value.
caption = {
    BalancedStrandAdapterVersions.LA_v5.value: """
This barplot shows the ratio of each category type in the data according to the spec in the top of the file.""",
    BalancedStrandAdapterVersions.LA_v6.value: """
This barplot shows the ratio of each category type in the data according to the spec in the top of the file.
The end tag breakdown is only for the reads that reached the end tag.""",
    BalancedStrandAdapterVersions.LA_v5and6.value: """
This barplot shows the ratio of each category type in the data according to the spec in the top of the file.
The categories are reported separately for the start- and end-tags.
The end tag breakdown is shown only for the reads that reached the end tag.""",
}
# Show the figure first, then the caption for the selected adapter version.
category_figure = Image(strand_ratio_category_png, width=IMAGE_WIDTH)
display(category_figure)
print(caption[adapter_version])
print("\n\n\n\n")

In [None]:
# Caption text for the LIG/HYB ratio distribution plot, keyed by adapter version value.
caption = {
    BalancedStrandAdapterVersions.LA_v5.value: """
This barplot shows the LIG/HYB ratio distribution for all the reads.""",
    BalancedStrandAdapterVersions.LA_v6.value: """
This barplot shows the LIG/HYB ratio distribution for all the reads.""",
    BalancedStrandAdapterVersions.LA_v5and6.value: """
This barplot shows the LIG/HYB ratio distribution for all the reads, for each tag separately.""",
}
# Show the figure first, then the caption for the selected adapter version.
ratio_figure = Image(strand_ratio_png, width=IMAGE_WIDTH)
display(ratio_figure)
print(caption[adapter_version])
print("\n\n\n\n")

In [None]:
# The concordance plot is optional: it is rendered only when a file was supplied.
if strand_ratio_category_concordance_png is not None:
    display(Image(strand_ratio_category_concordance_png, width=IMAGE_WIDTH))
    caption = {
        BalancedStrandAdapterVersions.LA_v5and6.value: """
These plots show the concordance between the strand ratio categories of the start-tag and end-tag. Each tag is assigned a category separately, and the concordance is plotted. The top plot includes all the reads, including those with END_UNREACHED, while the bottom includes reads where the end was reached only.""",
    }
    # A caption is defined only for LA_v5and6; use .get() so that supplying the
    # image with another adapter version shows it without raising KeyError.
    concordance_caption = caption.get(adapter_version)
    if concordance_caption is not None:
        print(concordance_caption)
print("\n\n\n\n")

In [None]:
# Caption text for the trimmer histogram plot, keyed by adapter version value.
caption = {
    BalancedStrandAdapterVersions.LA_v5.value: """
This plot shows the homopolymers called in the A and T hmers in the start tag.""",
    BalancedStrandAdapterVersions.LA_v6.value: """
This plot shows the homopolymers called in the A and T hmers in the end tag.""",
    BalancedStrandAdapterVersions.LA_v5and6.value: """
This plot shows the homopolymers called in the A and T hmers in the start tag (left) and end tag (right).""",
}
# Show the figure first, then the caption for the selected adapter version.
histogram_figure = Image(trimmer_histogram_png, width=IMAGE_WIDTH)
display(histogram_figure)
print(caption[adapter_version])
print("\n\n\n\n")

# Statistics

In [None]:
# Tables that must not be repeated here. "stats_shortlist" is the key read and
# displayed in the "Main statistics" section; "sorter_stats_shortlist" is kept
# because the original filter skipped it.
already_shown_tables = {"stats_shortlist", "sorter_stats_shortlist"}

# Open read-only: this report only lists keys and must never create or modify
# the statistics file (the HDFStore default mode is append).
with pd.HDFStore(statistics_h5, mode="r") as s:
    keys = s.keys()
for key in sorted(keys):
    # Keys may or may not carry a leading "/", so compare on the bare name.
    table_name = key.lstrip("/")
    if table_name in already_shown_tables:  # already shown
        continue
    print(f"\n\nStatistics table:{key.replace('/', ' ')}")
    df = pd.read_hdf(statistics_h5, key=key)
    if table_name == "trimmer_histogram":
        # Put the rows with the highest normalized count first.
        df = df.sort_values(
            HistogramColumnNames.COUNT_NORM.value, ascending=False
        ).reset_index(drop=True)
    # Cap the output at 50 rows per table to keep the report readable.
    display(df.head(50))
print("\n\n\n\n")