# Computing filtering statistics on different files
**Note**: Not using directly hicona functions to filter counts and compute normalization factors since they are not meant to be in the final public API and therefore are susceptible to changes, which could thus break the script functionality.

In [1]:
from cooler.util import open_hdf5
import cooltools
import matplotlib.pyplot as plt
from numpy import log2, nan
import pandas as pd
import seaborn as sns

import hicona

In [2]:
def get_filt_stats(cool_obj, chrom_name, max_bin_diff):
    """Get chromosome filtered counts according to HiCONA."""
    
    lower, upper = cool_obj.extent(chrom_name)
    with open_hdf5(cool_obj.store, mode="r") as h5_handle:
        h5_grp = h5_handle[cool_obj.root]
        lo_pix = h5_grp["indexes/bin1_offset"][lower]
        hi_pix = h5_grp["indexes/bin1_offset"][upper]
    
    pix_df = cool_obj.pixels()[lo_pix:hi_pix]
    pix_df.drop_duplicates(subset=["bin1_id", "bin2_id"], inplace=True)

    start_size = pix_df.shape[0]
    indexer = pix_df[
        (pix_df["bin2_id"] - pix_df["bin1_id"] > max_bin_diff)
        | (pix_df["bin1_id"] == pix_df["bin2_id"])
        | (pix_df["bin2_id"] >= upper)
    ].index

    return [start_size, start_size - len(indexer), len(indexer)/start_size]


In [3]:
FILES = {
    "HUVEC 3.8 gb": "4DNFIRMZ7QTE_lieberman.mcool",
    "B cells 3.4 gb": "4DNFISA93XFU_casellas.mcool",
    "HFF6c 7.2 gb": "4DNFI8ZYY7VT_dekker.mcool",
}

PATH_TEMPLATE = "test_files/{}::resolutions/10000" # 4DNucleome id
OUT_FOLDER = "plots/filtering_stats"
MAX_GEN_DIST = 200_000_000

In [4]:
for f_name, f_path in FILES.items():

    print(f"Starting to work on file {f_name}")
    cool_handle = hicona.HiconaCooler(PATH_TEMPLATE.format(f_path))
    res = cool_handle.info["bin-size"]
    max_bin_diff = -(-MAX_GEN_DIST // res)
    
    stats_list = [get_filt_stats(cool_handle, chrom, max_bin_diff) for chrom in cool_handle.chromnames]
    stats_list = [stats + [chrom] for stats, chrom in zip(stats_list, cool_handle.chromnames)]
    stats_list = [el + [f_name] for el in stats_list]
    
    file_df = pd.DataFrame(stats_list, columns = ["start_size", "end_size", "loss", "chrom", "file_name"])
    file_df.to_csv(f"{f_name}_filt_stats.csv", index=False)

Starting to work on file HFF6c 7.2 gb
0


KeyboardInterrupt: 

In [4]:
stats_df = pd.concat([pd.read_csv(f"{f_name}_filt_stats.csv") for f_name in FILES])
stats_df

Unnamed: 0,start_size,end_size,loss,chrom,file_name
0,22153020,7338782,0.668723,chr1,HUVEC 3.8 gb
1,20200462,7783523,0.614686,chr2,HUVEC 3.8 gb
2,15935624,6274855,0.606237,chr3,HUVEC 3.8 gb
3,13352614,5675473,0.574954,chr4,HUVEC 3.8 gb
4,12753217,5413440,0.575524,chr5,HUVEC 3.8 gb
...,...,...,...,...,...
19,5812897,4516342,0.223048,chr20,HFF6c 7.2 gb
20,2381729,1850825,0.222907,chr21,HFF6c 7.2 gb
21,2143447,1819168,0.151289,chr22,HFF6c 7.2 gb
22,8876208,8750532,0.014159,chrX,HFF6c 7.2 gb


In [None]:
plt.figure(figsize=(10,4))
ax = sns.pointplot(data=stats_df, x="chrom", y="loss", hue="file_name")
ax.set_xlabel(None)
ax.set_ylabel("Removed fraction", fontsize=12)
ax.legend(title="File", title_fontsize=14)
plt.title("Filtered out pixels", fontsize=16)
plt.xticks(rotation=45)
plt.savefig(OUT_FOLDER + f"/removed_fraction_comparison.png", dpi=600, bbox_inches='tight')

In [12]:
sns.set(style="ticks", rc={"lines.linewidth": 1})
fig, axes = plt.subplots(2,1, sharex=True, figsize=(10,6), gridspec_kw={'height_ratios': [1, 1]})
fig.suptitle("Filtering statistics comparison", fontsize=24, fontweight=10)
fig.tight_layout()

sns.pointplot(data=stats_df, x="chrom", y="end_size", hue="file_name", palette="tab10", ax=axes[0])
axes[0].set_xlabel(None)
axes[0].set_ylabel("Number of pixels remaining", fontsize=12)
axes[0].legend(title="File", title_fontsize=14)

sns.pointplot(data=stats_df, x="chrom", y="loss", hue="file_name", palette="tab10", ax=axes[1])
axes[1].set_xlabel(None)
axes[1].set_ylabel("Removed fraction", fontsize=12)
axes[1].get_legend().remove()
plt.xticks(rotation=45)

plt.savefig(OUT_FOLDER + f"/full_comparison.png", dpi=600, bbox_inches='tight')
plt.close()


Just to quickly display duplicate inconsistency

In [6]:
dummy_df = {
    "HFFc6 7.3 gb": {
        "chr1": 0, "chr2": 0, "chr3": 1454, "chr4": 1294, "chr5": 0, 
        "chr6": 0, "chr7": 0, "chr8": 0, "chr9": 0, "chr10":758,
        "chr11": 564, "chr12": 587, "chr13": 343, "chr14": 0, "chr15": 397, 
        "chr16": 0, "chr17": 0, "chr18": 0, "chr19": 0, "chr20": 213,
        "chr21": 0, "chr22": 0, "chrX": 0, "chrY": 0, 
    },
    "HUVEC 3.8 gb": {
        "chr1": 39, "chr2": 46, "chr3": 21, "chr4": 8, "chr5": 34, 
        "chr6": 45, "chr7": 40, "chr8": 8, "chr9": 0, "chr10":0,
        "chr11": 0, "chr12": 0, "chr13": 0, "chr14": 0, "chr15": 0, 
        "chr16": 0, "chr17": 0, "chr18": 0, "chr19": 0, "chr20": 0,
        "chr21": 0, "chr22": 0, "chrX": 0, "chrY": 0, 
    },
    "B cells 3.4 gb": {
        "chr1": 33, "chr2": 58, "chr3": 3, "chr4": 13, "chr5": 22, 
        "chr6": 42, "chr7": 3, "chr8": 52, "chr9": 0, "chr10":0,
        "chr11": 112, "chr12": 0, "chr13": 0, "chr14": 0, "chr15": 0, 
        "chr16": 0, "chr17": 0, "chr18": 0, "chr19": 0, "chr20": 0,
        "chr21": 0, "chr22": 0, "chrX": 0, "chrY": 0,     
    }
}

In [14]:
dups_df = pd.DataFrame(dummy_df)
dups_df

Unnamed: 0,HFFc6 7.3 gb,HUVEC 3.8 gb,B cells 3.4 gb
chr1,0,39,33
chr2,0,46,58
chr3,1454,21,3
chr4,1294,8,13
chr5,0,34,22
chr6,0,45,42
chr7,0,40,3
chr8,0,8,52
chr9,0,0,0
chr10,758,0,0
