# Introduction

This report contains a preliminary automatic analysis of MRD results for one cfDNA sample with one matched signature and potentially a set of control signatures, either from unmatched patients or elsewhere. It characterizes the different signatures in the study in terms of mutation numbers, mutation types (ref & alt bases) and allele fractions, then calculates the tumor fraction for each of the signatures. A set of filters is applied both to the signatures and to the cfDNA reads (FeatureMap entries); results are shown both with and without those filters.

This notebook can also be used as a template for more refined analyses.

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

In [None]:
# Make the ugvc package importable: every candidate checkout location that
# exists as a directory (and is not already listed) is inserted into sys.path
# at position 1.
for path in ["/home/ubuntu/proj/VariantCalling", "/VariantCalling"]:
    if os.path.isdir(path) and path not in sys.path:
        sys.path.insert(1, path)
from ugvc.utils.misc_utils import set_pyplot_defaults

# Apply the project's matplotlib defaults and render figures inline in the notebook
set_pyplot_defaults()
%matplotlib inline

In [None]:
# input parameters
# The three *_file* inputs are mandatory (checked in the next cell); the query
# strings fall back to the *_default values defined here.
# Parquet file of cfDNA read features; read below into df_features
features_file_parquet = None
# Parquet file of signature loci; read below into df_signatures
signatures_file_parquet = None
# pandas query applied to df_signatures to obtain df_signatures_filt
signature_filter_query_default = "(norm_coverage <= 2.5) and (norm_coverage >= 0.6)"
signature_filter_query = signature_filter_query_default
# pandas query applied to df_features to obtain df_features_filt; it is also
# evaluated on the featuremap dataframe to compute the denominator ratio
read_filter_query_default = "qual>=60"
read_filter_query = read_filter_query_default
# Parquet file with a `label` column, used to compute the denominator ratio
featuremap_df_file = None
# Output location: the tables are written to <output_dir>/<basename>.tumor_fraction.h5
# NOTE(review): these two are not validated with the other required inputs,
# but the final write cell fails if either is left as None - confirm intent
output_dir = None
basename = None

In [None]:
# Fail fast when any of the mandatory input files was left unset.
# The inputs are checked in the same order as before, so the first missing
# one is the one reported.
required_inputs = {
    "features_file_parquet": features_file_parquet,
    "signatures_file_parquet": signatures_file_parquet,
    "featuremap_df_file": featuremap_df_file,
}
for input_name, input_value in required_inputs.items():
    if input_value is None:
        raise ValueError(f"Required input {input_name} not provided")

In [None]:
# read and filter df_features
# Load the read-level features, force `rq` to float and index by locus.
df_features = (
    pd.read_parquet(features_file_parquet)
    .astype({"rq": float})
    .set_index(["chrom", "pos"])
)

# Per-row ratio of X_FILTERED_COUNT to X_READ_COUNT
df_features = df_features.assign(
    filtering_ratio=df_features["X_FILTERED_COUNT"] / df_features["X_READ_COUNT"]
)

# Reads that pass the read-level filter query
df_features_filt = df_features.query(read_filter_query)

# One filtering_ratio value per locus (the first row encountered), taken from
# rows of the matched signature only; joined onto df_signatures further below
filtering_ratio = (
    df_features.query("signature_type=='matched'")
    .groupby(level=["chrom", "pos"])
    .agg({"filtering_ratio": "first"})
)

In [None]:
# read and filter df_signatures
# Load the signature loci, cast `ug_hcr` and `id` to bool and index by locus.
df_signatures = (
    pd.read_parquet(signatures_file_parquet)
    .astype({"ug_hcr": bool, "id": bool})
    .set_index(["chrom", "pos"])
)

# Number of distinct signatures that contain each locus
nunique = (
    df_signatures.groupby(level=["chrom", "pos"])
    .agg({"signature": "nunique"})
    .rename(columns={"signature": "nunique"})
)
# Table of absolute and normalized counts of the `nunique` values.
# NOTE(review): this expression is not the last statement of the cell and its
# result is not assigned, so it is computed and discarded - confirm whether it
# was meant to be displayed.
nunique.value_counts().rename("count").to_frame().join(
    nunique.value_counts(normalize=True).rename("norm")
)

# Per-row sum over every column whose name contains "coverage", normalized by
# the median of those sums
x = df_signatures.filter(regex="coverage").sum(axis=1)
norm_coverage = (x / x.median()).rename("norm_coverage")
# Attach the per-locus annotations; loci without a filtering_ratio value
# (no matching row in `filtering_ratio`) get a ratio of 1
df_signatures = (
    df_signatures.join(nunique)
    .join(
        filtering_ratio,
        how="left",
    )
    .join(norm_coverage, how="left")
    .fillna({"filtering_ratio": 1})
)

# Cast to bool so the column can be used directly in boolean query expressions
df_signatures['ug_mrd_blacklist'] = df_signatures['ug_mrd_blacklist'].astype(bool)

# Signature loci that pass the signature-level filter query
df_signatures_filt = df_signatures.query(signature_filter_query)

# Filters applied 

In [None]:
# Human-readable description of each filter, keyed by the left-hand side of a
# query clause (the text before any "<" or ">"), or by the whole clause when
# it contains neither.
filter_descriptions = {
    "ug_hcr": "In UG High Confidence Region",
    "giab_hcr": "In GIAB (HG001-007) High Confidence Region",
    "not ug_mrd_blacklist": "Not in UG MRD Blacklist",
    "not id": "Not in dbsnp",
    "af": "Allele fraction filter",
    "filtering_ratio": "Minimum ratio of read passing read filters in locus",
    "nunique == 1": "Locus only in one signature in this cohort",
    "norm_coverage": "Filtering by coverage, normalized to median",
    "X_SCORE": "Filtering by log likelihood score (effective BQ)",
    "X_EDIST": "Filtering by edit distance from the reference",
    "max_softclip_len": "Filtering by maximal softclip length",
    "X_LENGTH": "Filtering by fragment length",
    "rq": "Filtering by read quality",
    # the default read filter is "qual>=60", which previously had no description
    "qual": "Filtering by SNV quality score (QUAL)",
}


def _describe_filter_query(filter_query):
    """Yield a (description, clause) pair for every clause of a filter query.

    The query is stripped of parentheses and split into clauses on the
    standalone word "and". Each clause is looked up in `filter_descriptions`
    by the text preceding its first "<" or ">" (the whole clause if it has
    neither); unknown keys yield "<Description unavailable>".
    """
    without_parentheses = filter_query.replace("(", "").replace(")", "")
    # Split on the whole word only: a plain str.split("and") would also cut
    # through any name that merely contains the letters "and" (e.g. "strand").
    for clause in re.split(r"\band\b", without_parentheses):
        clause = clause.strip()
        key = clause.split("<")[0].split(">")[0].strip()
        yield filter_descriptions.get(key, "<Description unavailable>"), clause


print("Filters applied to signature:")
for desc, x in _describe_filter_query(signature_filter_query):
    print(f"  - {desc}, query='{x}'")
print("\n\n")
print("Filters applied to reads:")
for desc, x in _describe_filter_query(read_filter_query):
    print(f"  - {desc}, query='{x}'")

# Matched signature/s analysis

In [None]:
def plot_signature_mutation_types(df_signatures_in, signature_filter_query_in):
    """Plot the mutation-type distribution of a signature, unfiltered and filtered.

    Draws two bar charts side by side: one for all rows of `df_signatures_in`
    and one for the rows that pass `signature_filter_query_in`. Each bar shows
    the fraction of rows with the given `mutation_type`, annotated as a
    percentage; the panel title reports the total number of rows.

    Parameters
    ----------
    df_signatures_in : pandas.DataFrame
        Signature rows; must provide the `signature` and `mutation_type`
        columns plus whatever columns the filter query refers to.
    signature_filter_query_in : str
        pandas query string selecting the rows of the "Filtered" panel.
    """
    all_muts = [
        "C->A",
        "C->G",
        "C->T",
        "T->A",
        "T->C",
        "T->G",
    ]
    bar_positions = range(len(all_muts))
    fig, axs = plt.subplots(1, 2, figsize=(18, 4))
    fig.suptitle(",".join(df_signatures_in["signature"].unique()), y=1.13)
    for ax, column, df_plot in zip(
        axs.flatten(),
        [
            "Unfiltered",
            "Filtered",
        ],
        [
            df_signatures_in,
            df_signatures_in.query(signature_filter_query_in),
        ],
    ):
        # A mutation type that does not occur must be shown as 0%. Reindexing
        # with method="bfill" would instead copy the fraction of the next
        # mutation type that does occur, so only fill_value is used here.
        x = (
            df_plot["mutation_type"]
            .value_counts(normalize=True)
            .reindex(all_muts, fill_value=0)
        )
        tot_mutations = df_plot.shape[0]
        plt.sca(ax)
        plt.bar(bar_positions, x, color=["b", "g", "r", "y", "m", "c"])
        for px, py in zip(bar_positions, x):
            plt.text(px, py + 0.01, f"{py:.1%}", ha="center", fontsize=16)
        # leave headroom above the tallest bar for its percentage label
        plt.ylim(0, ax.get_ylim()[1] + 0.03)
        plt.yticks([])
        plt.xticks(bar_positions, x.index.values, fontsize=20, rotation=90)
        plt.title(f"{column}, total={tot_mutations:,}", fontsize=28)
    plt.show()


def plot_signature_allele_fractions(df_signatures_in, signature_filter_query_in):
    """Plot the allele-fraction histogram of a signature, unfiltered and filtered.

    Two panels sharing the y axis are drawn: all rows of `df_signatures_in`
    on the left and the rows passing `signature_filter_query_in` on the
    right. Each panel shows a filled histogram of the `af` column over [0, 1],
    with the median and mean allele fraction reported in the legend and the
    number of rows in the title.
    """
    af_bins = np.linspace(0, 1, 100)
    fig, axs = plt.subplots(1, 2, figsize=(18, 4), sharey=True)
    fig.suptitle(",".join(df_signatures_in["signature"].unique()), y=1.13)
    panels = (
        ("Unfiltered", df_signatures_in),
        ("Filtered", df_signatures_in.query(signature_filter_query_in)),
    )
    for ax, (panel_name, df_panel) in zip(axs.flatten(), panels):
        plt.sca(ax)
        af_values = df_panel["af"].values
        n_mutations = df_panel.shape[0]
        counts, edges = np.histogram(af_values, bins=af_bins)
        centers = (edges[1:] + edges[:-1]) / 2
        legend_label = (
            f"Median = {np.median(af_values):.1%}\nMean = {np.mean(af_values):.1%}"
        )
        # the fill starts below the visible range so its lower edge is hidden
        plt.fill_between(centers, -10, counts, label=legend_label)
        plt.legend()
        plt.xlim(0, 1)
        plt.ylim(-1, ax.get_ylim()[1])
        plt.xlabel("AF")
        plt.title(f"{panel_name}, total={n_mutations:,}", fontsize=28)

## Mutation types

In [None]:
# One mutation-type figure per matched signature
matched_signatures = df_signatures.query("signature_type=='matched'")["signature"].unique()
for plot_signature in matched_signatures:
    plot_signature_mutation_types(
        df_signatures.query(f"signature == '{plot_signature}'"),
        signature_filter_query,
    )

Mutation type distribution before and after applying signature filters. Generally a reduction in the overall number of mutations is expected, but the distribution should be mostly unchanged - significant changes could be an indication of artefacts and warrant looking into the signature data.

## Allele fractions 

In [None]:
# One allele-fraction figure per matched signature
matched_signatures = df_signatures.query("signature_type=='matched'")["signature"].unique()
for plot_signature in matched_signatures:
    plot_signature_allele_fractions(
        df_signatures.query(f"signature == '{plot_signature}'"),
        signature_filter_query,
    )

Allele fraction distribution before and after applying signature filters. Generally a reduction in the overall number of mutations is expected, and potentially a minimal allele fraction filter is applied. Allele fraction is an indication of the tumor sample purity: typical values in the range 40-50% are considered excellent, 30-40% good, 20-30% okay, and values below 20% are considered low and might affect the validity of the results.

# Tumor fractions 

In [None]:
# Value substituted for tumor fractions that are exactly zero in
# get_tf_from_filtered_data (plot_tf draws them on a log-scale y axis)
ZERO_TF_FILLIN = 1e-7

def _tf_for_signature_type(df_tf_in, signature_type):
    """Return the "tf" values of one signature type, or a single-NaN placeholder if it is absent."""
    try:
        return df_tf_in.loc[(signature_type, slice(None)), "tf"]
    except KeyError:
        return pd.DataFrame(
            {"signature_type": [np.nan], "signature": [np.nan], "tf": [np.nan]}
        ).set_index(["signature_type", "signature"])["tf"]


def _scatter_tf_points(x, tf_values, labels, color):
    """Scatter tumor-fraction points with a faint text label next to each; return the scatter handle."""
    handle = plt.scatter(x, tf_values, s=100, c=color)
    for x_i, y_i, label in zip(x, tf_values, labels):
        plt.text(x_i + 0.015, y_i, label, ha="left", va="center", fontsize=10, alpha=0.3)
    return handle


def _boxplot_tf(tf_series, color):
    """Draw a single-color, outlier-free boxplot of the values at x=0; return the artist dict."""
    return plt.boxplot(
        tf_series,
        positions=[0],
        showfliers=False,
        patch_artist=True,
        boxprops=dict(facecolor=color),
        whiskerprops=dict(color=color),
        capprops=dict(color=color),
    )


def _annotate_boxplot_medians(boxplot_artists, color):
    """Write the median value to the left of every finite median line of a boxplot."""
    for line in boxplot_artists["medians"]:
        x, y = line.get_xydata()[0]  # first point of the median line
        if not np.isnan(x) and not np.isnan(y):
            plt.text(x, y, f"{y:.1e}", ha="right", va="center", color=color, fontsize=16)


def plot_tf(df_tf_in, title=None):
    """Plot measured tumor fractions of matched, control and db_control signatures.

    Creates a new figure with a log-scale y axis showing the matched
    signature(s) as labelled red points, the "control" and "db_control"
    signatures as jittered labelled points (blue / green), and one boxplot
    per control group with its median value annotated.

    Parameters
    ----------
    df_tf_in : pandas.DataFrame
        Table indexed by (signature_type, signature) with a "tf" column.
        Signature types that are missing are replaced by a NaN placeholder.
    title : str, optional
        Figure title; omitted when empty or None.
    """
    df_tf_matched = _tf_for_signature_type(df_tf_in, "matched")
    df_tf_control = _tf_for_signature_type(df_tf_in, "control")
    df_tf_db_control = _tf_for_signature_type(df_tf_in, "db_control")

    plt.figure(figsize=(8, 12))
    if title:
        plt.title(title, y=1.02, fontsize=28)

    if df_tf_matched.notna().any():
        matched_labels = (
            df_tf_matched.index.get_level_values("signature")
            + df_tf_matched.apply(lambda tf: f" (TF={tf:.1e})").values
        )
        hscat1 = _scatter_tf_points(
            0.2 * np.ones(df_tf_matched.shape[0]),
            df_tf_matched.values,
            matched_labels,
            "#D03020",
        )
    else:
        hscat1 = None

    # NOTE(review): both boxplots are drawn at x=0 and therefore overlap when
    # both control groups are present - confirm whether this is intended.
    hbp1 = _boxplot_tf(df_tf_control, "b")
    hbp2 = _boxplot_tf(df_tf_db_control, "g")

    # Fixed-seed jitter from a local generator: same values as seeding the
    # global NumPy generator with 3456, without resetting its state for the
    # rest of the session.
    rng = np.random.RandomState(3456)
    hscat2 = _scatter_tf_points(
        0.2 + rng.uniform(-0.1, 0.1, size=df_tf_control.shape[0]),
        df_tf_control.values,
        df_tf_control.index.get_level_values("signature"),
        "#3390DD",
    )
    hscat3 = _scatter_tf_points(
        0.2 + rng.uniform(-0.1, 0.1, size=df_tf_db_control.shape[0]),
        df_tf_db_control.values,
        df_tf_db_control.index.get_level_values("signature"),
        "g",
    )

    plt.yscale("log")
    plt.xticks([])
    plt.xlim(-0.2, 0.5)
    plt.ylabel("Measured tumor fraction")

    # Only artists that were actually drawn go into the legend; hscat1 is None
    # when there is no matched signature, and None is not a valid legend handle.
    legend_entries = [
        (handle, label)
        for handle, label in (
            (hscat1, "Matched"),
            (hscat2, "Individual controls"),
            (hbp1["boxes"][0], "Control distribution"),
            (hscat3, "db_controls"),
            (hbp2["boxes"][0], "db_control distribution"),
        )
        if handle is not None
    ]
    plt.legend(
        [handle for handle, _ in legend_entries],
        [label for _, label in legend_entries],
        bbox_to_anchor=[1.01, 1],
    )

    _annotate_boxplot_medians(hbp1, "b")
    _annotate_boxplot_medians(hbp2, "g")

def get_tf_from_filtered_data(
    df_features_in,
    df_signatures_in,
    display_results=False,
    plot_results=False,
    title=None,
    denom_ratio=None,
):
    """Compute the tumor fraction of every signature from reads and signature loci.

    For each (signature_type, signature) pair the tumor fraction is
    ``supporting_reads / ceil(coverage * denom_ratio)``, where
    ``supporting_reads`` is the number of rows of `df_features_in` located on
    loci present in `df_signatures_in`, and ``coverage`` is the sum of the
    signature's `coverage` column. Tumor fractions that are exactly zero are
    replaced by ``ZERO_TF_FILLIN``.

    Parameters
    ----------
    df_features_in : pandas.DataFrame
        Read-level table indexed by (chrom, pos) with `signature` and
        `signature_type` columns.
    df_signatures_in : pandas.DataFrame
        Signature loci indexed by (chrom, pos) with `signature`,
        `signature_type` and `coverage` columns.
    display_results : bool
        If True, show the formatted result table with `display`
        (presumably the IPython display function available in notebooks).
    plot_results : bool
        If True, draw the result with `plot_tf` and show the figure.
    title : str, optional
        Title passed on to `plot_tf`.
    denom_ratio : float, optional
        Factor applied to the coverage before dividing. None means no
        correction (equivalent to 1).

    Returns
    -------
    tuple
        ``(df_tf, df_supporting_reads_per_locus)``: the per-signature table
        (supporting_reads, coverage, corrected_coverage, tf) sorted by index
        in descending order, and the per-locus supporting read counts.
    """
    signature_keys = ["signature_type", "signature"]
    # None used to fail on the multiplication below; treat it as "no correction"
    coverage_scale = 1 if denom_ratio is None else denom_ratio

    df_features_in_intersected = (
        df_features_in.join(
            df_signatures_in.groupby(level=["chrom", "pos"])
            .size()
            .astype(bool)
            .rename("locus_in_signature"),
            how="inner",
        )
        .dropna(subset=["locus_in_signature"])
        .query("locus_in_signature")
        .drop(columns=["locus_in_signature"])
    )  # retain only loci that are in the signature df - they might have been filtered out
    df_supporting_reads_per_locus = (
        df_features_in_intersected.reset_index()
        .groupby(["chrom", "pos", "signature", "signature_type"])
        .size()
        .rename("supporting_reads")
        .reset_index(level=["signature", "signature_type"])
    )
    df_supporting_reads = (
        df_supporting_reads_per_locus.groupby(signature_keys).sum().fillna(0).astype(int)
    )
    # Signatures without any supporting read are absent from the table above:
    # append an explicit zero row for every signature in df_signatures_in and
    # collapse the duplicates, so that each signature ends up with one row.
    zero_support = pd.DataFrame(
        {"supporting_reads": 0},
        index=df_signatures_in.groupby(signature_keys).size().index,
    )
    df_supporting_reads = (
        pd.concat((df_supporting_reads, zero_support))
        .groupby(signature_keys)
        .sum()
        .astype(int)
    )

    df_coverage = (
        (df_signatures_in.groupby("signature").agg({"coverage": "sum"}))
        .fillna(0)
        .astype(int)
    )
    df_tf = df_supporting_reads.join(df_coverage).fillna(0)
    df_tf["corrected_coverage"] = np.ceil(df_tf["coverage"] * coverage_scale)
    df_tf = df_tf.assign(tf=df_tf["supporting_reads"] / df_tf["corrected_coverage"]).sort_index(
        ascending=False
    )
    # fill zero tf with a fixed value
    df_tf = df_tf.assign(tf=df_tf["tf"].replace(0, ZERO_TF_FILLIN))

    if plot_results:
        plot_tf(df_tf, title=title)
        plt.show()

    if display_results:
        display(
            df_tf.sort_index(ascending=False).style.format(
                {"coverage": int, "corrected_coverage": int, "supporting_reads": int, "tf": "{:.1e}"}
            )
        )

    return (df_tf, df_supporting_reads_per_locus)

### Tumor fraction denominator
Tumor fraction = # reads intersect with signature / # all reads. <br>
To account for the query filtering, the denominator is multiplied by the fraction of reads that pass the read filtering in the SRSNV test set. <br>
In case of filtering by QUAL (interpolation of ML_QUAL), the denominator is the fraction of reads with SNVQ > qual_threshold.


In [None]:
# Fraction of the rows with a true `label` in the featuremap dataframe that
# satisfy the read filter query (mean of the boolean result of the query)
denom_ratio = pd.read_parquet(featuremap_df_file).query("label").eval(read_filter_query).mean()

print(f"Denominator ratio: {denom_ratio:.2f}")

## Filtered reads, filtered signatures

In [None]:
# Tumor fractions from filtered reads and filtered signatures, with the
# coverage scaled by denom_ratio
df_tf_filt, df_supporting_reads_per_locus_filt = get_tf_from_filtered_data(
    df_features_filt,
    df_signatures_filt,
    display_results=True,
    plot_results=True,
    title="Filtered reads and signatures",
    denom_ratio=denom_ratio,
)

Tumor fractions measured for the cfDNA sample against the matched signature [red], control signatures [blue], and database controls [green]. The boxplot shows the distribution of the background and the median value is annotated. Using the background information can aid in determining whether a detected result is statistically significant.

## Filtered reads, unfiltered signatures

In [None]:
# Tumor fractions from filtered reads and the unfiltered signatures, with the
# coverage scaled by denom_ratio
df_tf_unfilt, df_supporting_reads_per_locus_unfilt = get_tf_from_filtered_data(
    df_features_filt,
    df_signatures,
    display_results=True,
    plot_results=True,
    title="Filtered reads and unfiltered signatures",
    denom_ratio=denom_ratio,
)

## Unfiltered reads, filtered signatures

In [None]:
# Tumor fractions from unfiltered reads and filtered signatures; the reads are
# not filtered here, so the coverage is left uncorrected (denom_ratio=1)
df_tf_unfilt2, df_supporting_reads_per_locus_unfilt2 = get_tf_from_filtered_data(
    df_features,
    df_signatures_filt,
    display_results=True,
    plot_results=True,
    title="Unfiltered reads and filtered signatures",
    denom_ratio=1
)

In [None]:
## Write tf tables to a hdf file
# Every tumor-fraction table and per-locus supporting-read table is stored
# under its own key in <output_dir>/<basename>.tumor_fraction.h5; the file is
# opened in append mode for each table.
output_h5_file = os.path.join(output_dir, basename + ".tumor_fraction.h5")
h5_dict = dict(
    df_tf_filt_signature_filt_featuremap=df_tf_filt,
    df_tf_unfilt_signature_filt_featuremap=df_tf_unfilt,
    df_tf_filt_signature_unfilt_featuremap=df_tf_unfilt2,
    df_supporting_reads_per_locus_filt_signature_filt_featuremap=df_supporting_reads_per_locus_filt,
    df_supporting_reads_per_locus_unfilt_signature_filt_featuremap=df_supporting_reads_per_locus_unfilt,
    df_supporting_reads_per_locus_filt_signature_unfilt_featuremap=df_supporting_reads_per_locus_unfilt2,
)
for table_key, table in h5_dict.items():
    table.to_hdf(output_h5_file, key=table_key, mode="a")

# Control signature/s analysis

## Mutation types 

In [None]:
# Mutation-type figure for every non-matched signature, in sorted name order.
# A failure for one signature is reported and does not stop the others.
control_signatures = sorted(
    df_signatures.query("signature_type!='matched'")["signature"].unique()
)
for plot_signature in control_signatures:
    try:
        plot_signature_mutation_types(
            df_signatures.query(f"signature == '{plot_signature}'"),
            signature_filter_query,
        )
    except Exception as e:
        print(f"Exception when plotting for {plot_signature}:\n{e}")

## Allele fractions 

In [None]:
# Allele-fraction figure for every non-matched signature, in sorted name order.
# A failure for one signature is reported and does not stop the others.
control_signatures = sorted(
    df_signatures.query("signature_type!='matched'")["signature"].unique()
)
for plot_signature in control_signatures:
    try:
        plot_signature_allele_fractions(
            df_signatures.query(f"signature == '{plot_signature}'"),
            signature_filter_query,
        )
    except Exception as e:
        print(f"Exception when plotting for {plot_signature}:\n{e}")

# cfDNA read length distributions 

In [None]:
# Read-length histograms on a 2x2 grid: matched reads on the top row,
# unmatched reads on the bottom row; unfiltered on the left, filtered by the
# read filter query on the right.
fig, axs = plt.subplots(2, 2, figsize=(20, 10), sharex=True)
fig.subplots_adjust(hspace=0.4, wspace=0.3)

max_value = 0
for ax, title, x in zip(
    axs.flatten(),
    [
        "Matched reads (from tumor)\nunfiltered",
        "Matched reads (from tumor)\nfiltered",
        "Unmatched reads (not tumor)\nunfiltered",
        "Unmatched reads (not tumor)\nfiltered",
    ],
    [
        # The matched panels must select signature_type=='matched'; they used
        # to repeat the "!=" condition of the unmatched panels.
        # The read filter is parenthesized so that an "or" inside it cannot
        # regroup with the signature_type condition.
        df_features.query("signature_type=='matched'")["X_LENGTH"],
        df_features.query(f"signature_type=='matched' and ({read_filter_query})")["X_LENGTH"],
        df_features.query("signature_type!='matched'")["X_LENGTH"],
        df_features.query(f"signature_type!='matched' and ({read_filter_query})")["X_LENGTH"],
    ],
):
    plt.sca(ax)
    plt.title(title, y=1.05, fontsize=28)
    # running maximum over the panels drawn so far; bins span at least up to 250
    max_value = max(max_value, x.max())
    x.plot.hist(bins=np.arange(0.5, max(250, max_value)))
for ax in axs[-1, :]:
    ax.set_xlabel("Read length", fontsize=32)

Distribution of read lengths for cfDNA reads, both matched and unmatched. Not all of the reads are sequenced through, so the longer reads might be limited by read rather than insert length. Differences in the distributions between matched and unmatched reads could be used for more refined filtering of reads.