# Compare filtered counts and normalized counts distribution
**Note**: HiCONA's own functions for filtering counts and computing normalization factors are deliberately not called directly here. They are not meant to be part of the final public API and are therefore subject to change, which could break this script.

In [1]:
from cooler.util import open_hdf5
import cooltools
import matplotlib.pyplot as plt
from numpy import log2, nan
import pandas as pd
import seaborn as sns

import hicona

In [2]:
def _drop_and_report(pix_df, mask, prefix=""):
    """Drop in place the rows selected by ``mask`` and print what was lost.

    Returns the number of rows left in ``pix_df`` after the drop.
    """

    start_size = pix_df.shape[0]
    pix_df.drop(pix_df[mask].index, inplace=True)
    end_size = pix_df.shape[0]
    # A chromosome without pixels must not crash the report with a 0 / 0.
    lost = (start_size - end_size) / start_size if start_size else 0.0
    print(f"{prefix}Start {start_size}, End {end_size}, Lost {lost}")

    return end_size


def get_filt_counts(cool_obj, chrom_name, max_bin_diff):
    """Get chromosome filtered counts according to HiCONA.

    The pixels whose first bin lies on ``chrom_name`` are loaded, then the
    following ones are removed:

    * pixels whose bins are more than ``max_bin_diff`` bins apart;
    * pixels on the diagonal (``bin1_id == bin2_id``);
    * pixels whose second bin is at or past the end of the chromosome;
    * pixels with a count of exactly 1;
    * rows duplicated on (``bin1_id``, ``bin2_id``).

    A summary of each filtering step is printed.

    Parameters
    ----------
    cool_obj
        Cooler-like object; ``extent``, ``store``, ``root`` and ``pixels``
        are used.
    chrom_name
        Name of the chromosome to process.
    max_bin_diff
        Largest allowed ``bin2_id - bin1_id`` (inclusive).

    Returns
    -------
    The filtered pixel dataframe with an extra ``bin_difference`` column
    (``bin2_id - bin1_id``).
    """

    lower, upper = cool_obj.extent(chrom_name)
    # The bin1 offset index gives the pixel-table rows where a bin starts,
    # so only the rows of this chromosome need to be loaded.
    with open_hdf5(cool_obj.store, mode="r") as h5_handle:
        h5_grp = h5_handle[cool_obj.root]
        lo_pix = h5_grp["indexes/bin1_offset"][lower]
        hi_pix = h5_grp["indexes/bin1_offset"][upper]

    pix_df = cool_obj.pixels()[lo_pix:hi_pix]

    bin_diff = pix_df["bin2_id"] - pix_df["bin1_id"]
    unwanted = (
        (bin_diff > max_bin_diff)
        | (bin_diff == 0)
        | (pix_df["bin2_id"] >= upper)
    )
    _drop_and_report(pix_df, unwanted)
    del bin_diff, unwanted

    # Add 1 filtering
    end_size = _drop_and_report(pix_df, pix_df["count"] == 1, prefix="(Ones) ")

    pix_df.drop_duplicates(subset=["bin1_id", "bin2_id"], inplace=True)
    size_diff = end_size - pix_df.shape[0]
    # Warn only when duplicates were actually removed.
    if size_diff != 0:
        print(f"Warning, {size_diff} duplicate rows were dropped.")

    pix_df["bin_difference"] = pix_df["bin2_id"] - pix_df["bin1_id"]

    return pix_df


In [3]:
def get_norm_counts(pix_df):
    """Add normalized counts to ``pix_df`` in place, according to HiCONA.

    A new ``exp_ratio`` column is written, holding
    ``log2(count / median + 1)``, where ``median`` is the median count of
    all the pixels sharing the same ``bin_difference``. Nothing is returned.
    """

    # Median count of each distance group, broadcast back to every pixel.
    distance_median = pix_df.groupby("bin_difference")["count"].transform("median")
    observed_over_median = pix_df["count"] / distance_median
    pix_df["exp_ratio"] = log2(observed_over_median + 1)
    

In [4]:
def count_pixels(pix_df, resolution=None):
    """Compute the number of pixels as a function of distance.

    Parameters
    ----------
    pix_df
        Pixel dataframe with at least ``count`` and ``bin_difference``
        columns.
    resolution
        Factor converting ``bin_difference`` into a genomic distance. When
        omitted, the notebook-level ``res`` is used.

    Returns
    -------
    Long-format dataframe with columns ``bin_difference`` (multiplied by
    the resolution), ``pixels`` (``"all"`` or ``"ones"``) and ``value``
    (number of pixels). Distances without any pixel having a count of 1
    get a missing ``"ones"`` value.
    """

    if resolution is None:
        # Keep working for callers relying on the notebook global.
        resolution = res

    # Only the "count" column is tallied: counting the whole frame would
    # also tally every other column and leave suffixed leftovers in the
    # merge below.
    num_count = pix_df.groupby("bin_difference")[["count"]].count()
    num_count = num_count.rename(columns={"count": "all"})
    one_count = pix_df[pix_df["count"] == 1].groupby("bin_difference")[["count"]].count()
    one_count = one_count.rename(columns={"count": "ones"})

    # Left merge so that every distance present in the data is retained.
    num_count = num_count.merge(one_count, how="left", left_index=True, right_index=True)
    num_count = num_count.reset_index()
    num_count = pd.melt(
        num_count,
        id_vars=["bin_difference"],
        value_vars=["all", "ones"],
        var_name="pixels",
    )
    num_count["bin_difference"] *= resolution

    return num_count


In [5]:
def compute_quantiles(pix_df, normalized=False, *, quants=None, max_diff=None, resolution=None):
    """Create dataframe with quantiles values.

    Parameters
    ----------
    pix_df
        Pixel dataframe with a ``bin_difference`` column and the column to
        summarize.
    normalized
        Summarize ``exp_ratio`` when true, ``count`` otherwise.
    quants
        Quantiles to compute. When omitted, the notebook-level ``QUANTS``
        is used.
    max_diff
        Largest bin difference to report (inclusive). When omitted, the
        notebook-level ``max_bin_diff`` is used.
    resolution
        Factor converting bin differences into genomic distances. When
        omitted, the notebook-level ``res`` is used.

    Returns
    -------
    Long-format dataframe with columns ``distance`` (bin difference
    multiplied by the resolution), ``quantile`` and ``count`` or
    ``exp_ratio``. There is one row per (distance, quantile) pair for every
    bin difference from 1 to ``max_diff``; distances absent from ``pix_df``
    get a missing value.
    """

    # Fall back to the notebook globals for existing callers.
    quants = list(QUANTS) if quants is None else list(quants)
    if max_diff is None:
        max_diff = max_bin_diff
    if resolution is None:
        resolution = res

    var_to_grp = "exp_ratio" if normalized else "count"
    # The upper bound is inclusive: pixels exactly max_diff bins apart are
    # kept by the filtering step, so they must be reported here as well.
    synth = pd.DataFrame({"distance": range(1, max_diff + 1)})
    group_filt = pix_df.groupby("bin_difference")[var_to_grp]
    for quant in quants:
        # Lookup by bin difference; distances without pixels stay missing.
        synth[quant] = synth["distance"].map(group_filt.quantile(quant))

    synth = pd.melt(
        synth,
        id_vars=["distance"],
        value_vars=quants,
        var_name="quantile",
        value_name=var_to_grp,
    )
    synth["distance"] *= resolution

    return synth


In [6]:
def create_plot(filt_qts, norm_qts, num_pix):
    """Create and save the three-panel plot from the three dataframes.

    The panels share the x axis and show, from top to bottom:

    * ``filt_qts``: ``count`` against ``distance``, one line per
      ``quantile`` (log-scaled x axis);
    * ``norm_qts``: ``exp_ratio`` against ``distance``, one line per
      ``quantile``;
    * ``num_pix``: ``value`` against ``bin_difference``, one line per
      ``pixels`` category (log-scaled y axis).

    The title and the output file name are built from the notebook-level
    ``chrom``, ``res`` and ``F_NAME``; the figure is written as a PNG inside
    ``OUT_FOLDER`` and then closed.
    """

    label_size = 16
    tick_size = 12

    fig, axes = plt.subplots(
        3,
        1,
        sharex=True,
        figsize=(10, 10),
        gridspec_kw={'height_ratios': [1, 0.8, 0.4]},
    )
    counts_ax, ratio_ax, pixels_ax = axes
    fig.suptitle(f"{chrom}, res: {res:,} bp, file: {F_NAME}", fontsize=24, fontweight=10)
    fig.tight_layout()
    plt.xlim([res, MAX_GEN_DIST])

    # Top panel: quantiles of the filtered counts.
    sns.lineplot(
        data=filt_qts, x="distance", y="count", hue="quantile",
        palette="tab10", ax=counts_ax,
    )
    counts_ax.set_ylabel("filtered counts", fontsize=label_size)
    counts_ax.set_xscale("log")
    counts_ax.tick_params(axis='both', labelsize=tick_size)
    counts_ax.legend(
        loc="upper center", title="Quantiles",
        fontsize=label_size, title_fontsize=label_size,
    )

    # Middle panel: quantiles of the normalized counts, legend removed.
    sns.lineplot(
        data=norm_qts, x="distance", y="exp_ratio", hue="quantile",
        palette="tab10", ax=ratio_ax,
    )
    ratio_ax.set_ylabel("normalized counts", fontsize=label_size)
    ratio_ax.tick_params(axis='both', labelsize=tick_size)
    ratio_ax.get_legend().remove()

    # Bottom panel: number of pixels, with a dashed reference line at 10.
    sns.lineplot(
        data=num_pix, x="bin_difference", y="value", hue="pixels",
        ax=pixels_ax, palette=["purple", "red"],
    )
    pixels_ax.set_ylabel("num pixels", fontsize=label_size)
    pixels_ax.set_yscale("log")
    pixels_ax.set_yticks([1, 10, 100, 1000, 10000])
    pixels_ax.axhline(10, color="grey", linestyle="--", linewidth=2)
    pixels_ax.get_legend().set_title(None)
    pixels_ax.tick_params(axis='both', labelsize=tick_size)
    pixels_ax.legend(fontsize=label_size, ncol=2)

    pixels_ax.set_xlabel("genomic distance (bp)", fontsize=label_size)
    out_path = OUT_FOLDER + f"/{chrom}_{res}_{F_NAME}.png"
    plt.savefig(out_path, dpi=600, bbox_inches='tight')
    plt.close()


In [7]:
# Run configuration. These names are read as globals by the functions above.

# Input passed to hicona.HiconaCooler: path to the .mcool file followed by
# "::resolutions/10000" (presumably selecting the 10 kb group — confirm).
FILE_PATH = "test_files/4DNFI8ZYY7VT_dekker.mcool::resolutions/10000" # 4DNucleome id
# Folder where create_plot saves the PNG files.
OUT_FOLDER = "plots/normalized_counts_ones"
# Sample label used in the plot title and in the output file name.
F_NAME = "HUVEC 3.8 gb" #HFFc6 (presumably an alternative sample label — confirm)
# Largest genomic distance considered; also the right limit of the x axis.
MAX_GEN_DIST = 200_000_000
# Quantiles computed for each distance by compute_quantiles.
QUANTS = [0.25, 0.50, 0.75]

cool_handle = hicona.HiconaCooler(FILE_PATH)
# Bin size as reported by the file metadata.
res = cool_handle.info["bin-size"]
# Ceiling division: number of bins needed to span MAX_GEN_DIST.
max_bin_diff = -(-MAX_GEN_DIST // res)

In [8]:
# Process every chromosome: filter, summarize, normalize, summarize again, plot.

# Chromosomes to skip. Nothing in this cell adds to the list, so it is
# presumably filled by hand when resuming an interrupted run — confirm.
DONE = []
# NOTE: the loop variable must stay named `chrom`, since create_plot reads it
# as a global for the title and the output file name.
for chrom in cool_handle.chromnames:
    print(f"Starting to work on {chrom}")
    if chrom in DONE:
        print(f"{chrom} was already processed, skipping")
        continue
    pix_df = get_filt_counts(cool_handle, chrom, max_bin_diff)
    # NOTE(review): get_filt_counts already drops the pixels with a count of 1,
    # so the "ones" series computed here looks like it will always be empty —
    # confirm this is intended.
    num_pix = count_pixels(pix_df)
    filt_qts = compute_quantiles(pix_df)
    # Adds the "exp_ratio" column in place; needed by the normalized quantiles.
    get_norm_counts(pix_df)
    norm_qts = compute_quantiles(pix_df, True)
    create_plot(filt_qts, norm_qts, num_pix)
    # Release the large per-chromosome frames before the next iteration.
    del pix_df, filt_qts, norm_qts

Starting to work on chr1
Start 74469506, End 23207710, Lost 0.6883595548492023
(Ones) Start 23207710, End 6394267, Lost 0.7244766071275451
Starting to work on chr2
Start 71225594, End 25930551, Lost 0.6359377360896422
(Ones) Start 25930551, End 7044656, Lost 0.7283260197594721
Starting to work on chr3
Start 55561126, End 20907953, Lost 0.6236945773921141
(Ones) Start 20907953, End 5975153, Lost 0.7142162601953429
Starting to work on chr4
Start 47927214, End 19462252, Lost 0.5939206480894133
(Ones) Start 19462252, End 5910031, Lost 0.6963336514191677
Starting to work on chr5
Start 44101349, End 17676054, Lost 0.5991947094407475
(Ones) Start 17676054, End 5402902, Lost 0.6943377747092196
Starting to work on chr6
Start 41133958, End 17374674, Lost 0.5776075329293622
(Ones) Start 17374674, End 5158851, Lost 0.7030821412821904
Starting to work on chr7
Start 36252003, End 15377125, Lost 0.5758268860344076
(Ones) Start 15377125, End 4748108, Lost 0.6912226440248096
Starting to work on chr8
St