# Preparing Signal Data for an HMM
Using [__pyBedGraph__](https://github.com/TheJacksonLaboratory/pyBedGraph) to:
- convert the raw signal data in a bedGraph file into a mean signal value for each bin (the bin size is specified by the user)

In [2]:
import sys
from pathlib import Path

import numpy as np
import pyBigWig
from pyBedGraph import BedGraph

In [3]:
# Data lives in a "data" folder two levels above the current working directory.
DATA_DIR = Path().resolve().parent.parent.joinpath("data")
# Chromosome-sizes table that sits directly inside DATA_DIR.
SIZES_FILE_PATH = DATA_DIR.joinpath("hg38.chrom.sizes")

In [None]:
# Sorted bedGraph for the CD14-positive monocyte / H3K79me2 sample.
BG_FILE_PATH = DATA_DIR.joinpath(
    "CD14-positive monocyte",
    "H3K79me2",
    "ENCFF676JWS.sorted.bedGraph",
)
# Build the BedGraph object from the sizes file and the bedGraph, restricted to chr14.
monocyte_bg_k79 = BedGraph(SIZES_FILE_PATH, BG_FILE_PATH, ["chr14"])

In [5]:
# Open the bigWig file for the CD14-positive monocyte / H3K79me2 sample.
# (`sys` is imported in the notebook's import cell rather than here.)
BW_FILE_PATH = DATA_DIR / "CD14-positive monocyte" / "H3K79me2" / "ENCFF676JWS.bigWig"
# Quick existence check before handing the path to pyBigWig.
print(BW_FILE_PATH.is_file())
# pyBigWig is given the absolute path as a plain string.
monocyte_bw_k79 = pyBigWig.open(str(BW_FILE_PATH.absolute()))
print(type(monocyte_bw_k79))
# sys.getsizeof only accounts for the Python object itself, not for anything
# it references, so this number says nothing about the size of the file.
print(sys.getsizeof(monocyte_bw_k79))

True
<class 'pyBigWig.bigWigFile'>
48


In [None]:
# Width of one bin in base pairs; used for both the bedGraph and the bigWig binning.
BIN_SIZE = 1_000
# Debug aid (kept disabled): inspect the entry loaded for chr14 with
#   monocyte_bg_k79.chromosome_map["chr14"]

In [None]:
# NOTE(review): left disabled. The call targets "chr1", but "chr14" is the only
# chromosome passed to the BedGraph constructor in this notebook — confirm
# whether load_chrom_data("chr14") is required before stats() is called.
# monocyte_bg_k79.load_chrom_data("chr1")

In [None]:
# Bin chr14 with the shared BIN_SIZE constant instead of repeating the literal
# 1000, so the bedGraph and bigWig sections cannot drift apart.
monocyte_bg_k79.load_chrom_bins("chr14", BIN_SIZE)

Just checking numpy element-wise addition

In [None]:
# Sanity check: adding a scalar to an ndarray adds it to every element.
v = np.arange(10)
print(v, v + 1, sep="\n")

In [None]:
# Query windows: one window every 100 bins between 300 kb and 2 Mb (end
# excluded), each window spanning 10 bins.
# NOTE(review): int32 is presumably what pyBedGraph expects for
# start_list / end_list — confirm against its documentation.
starts = np.arange(300_000, 2_000_000, 100 * BIN_SIZE, dtype=np.int32)
ends = np.add(starts, 10 * BIN_SIZE)
print("starts:", starts)
print("ends:", ends)

In [None]:
# Statistic for every (start, end) window on chr14.
# The BedGraph object in this notebook is constructed with ["chr14"] only and
# bins are loaded for "chr14" only, so the query must name chr14; the previous
# chrom_name="chr1" pointed at a chromosome that was never requested.
# NOTE(review): pyBedGraph may also need load_chrom_data("chr14") to have been
# called before stats() — confirm against the pyBedGraph README.
bin_means = monocyte_bg_k79.stats(start_list=starts, end_list=ends, chrom_name="chr14")
bin_means

# bigWig

In [None]:
# Mapping of chromosome name -> length as reported by the bigWig file; later
# cells index it by name (e.g. chrom_sizes["chr1"]).
chrom_sizes = monocyte_bw_k79.chroms()
# Show every (name, length) pair.
print(chrom_sizes.items())

In [None]:
# Size of the trailing, partial bin of chr1 when it is cut into BIN_SIZE pieces.
chr1_size = chrom_sizes["chr1"]
last_bin_rem = chr1_size % BIN_SIZE
print(chr1_size, " divided into bins of size ", BIN_SIZE, " is ", chr1_size / BIN_SIZE, " and leaves the last one with actually only ", last_bin_rem, " bp")
# Sum of the signal over that last partial bin.
# pyBigWig coordinates are 0-based half-open, so the end must be the chromosome
# length itself; the previous `chr1_size - 1` left out the final base.
monocyte_bw_k79.stats("chr1", chr1_size - last_bin_rem, chr1_size, type="sum", exact=True)

Check that, when the `nBins` argument is passed, bins in which no data lands are returned as 0-valued bins rather than being left out of the list altogether

In [None]:
# BIN_SIZE comes from the configuration cell earlier in the notebook; it is not
# redefined here so that there is a single source of truth.
chr1_size = chrom_sizes["chr1"]
# Ceiling division: an extra bin is needed only when a partial bin remains
# (the former `// BIN_SIZE + 1` added one bin too many for exact multiples).
n_bins = -(-chr1_size // BIN_SIZE)
# Mean signal per bin over the whole of chr1.
# pyBigWig ranges are 0-based half-open, so the end is the chromosome length
# itself; the previous `chr1_size - 1` dropped the final base.
# NOTE(review): nBins splits the range into n_bins evenly sized bins, so a bin
# is exactly BIN_SIZE wide only when chr1_size is a multiple of BIN_SIZE —
# confirm this is acceptable for the downstream comparison with the last bin.
all_means = monocyte_bw_k79.stats("chr1", 0, chr1_size, nBins=n_bins, type="mean", exact=True)

In [None]:
# Count the bins that came back with an actual mean (anything other than None).
# NOTE(review): an earlier remark here said the result "should be 0", which
# would mean every bin is None — presumably the None count was meant; confirm.
sum(mean is not None for mean in all_means)

In [None]:
# Show everything from the bin just before the first None mean up to the end.
first_none_idx = all_means.index(None)
all_means[first_none_idx - 1:]

In [None]:
# Raw intervals from the bin preceding the first None mean to the end of chr1.
# The start is clamped at 0 in case the very first bin is None, and the end is
# the chromosome length itself because pyBigWig ranges are 0-based half-open
# (the previous `- 1` excluded the final base).
monocyte_bw_k79.intervals(
    "chr1",
    max(BIN_SIZE * (all_means.index(None) - 1), 0),
    chrom_sizes["chr1"],
)

The first `None` mean is at the 284 947th bin $\implies$ are there really no gaps before the 284 947th bin?

In [None]:
# Position of the first bin whose mean equals 0; .index raises ValueError when
# no bin matches.
all_means.index(0)

In [None]:
# Look for coverage gaps between consecutive intervals located before the
# first None bin.
# `or ()` keeps the cell from crashing on an empty result.
# NOTE(review): pyBigWig presumably returns None when nothing overlaps — confirm.
beforeNones_intervs = monocyte_bw_k79.intervals("chr1", 0, BIN_SIZE * all_means.index(None)) or ()
if beforeNones_intervs:
    last_end = beforeNones_intervs[0][1]
    for start, end, _val in beforeNones_intervs[1:]:
        # Intervals are 0-based half-open, so two intervals are contiguous when
        # the next start EQUALS the previous end. The former test
        # (`last_end + 1 - start`) reported every contiguous pair as a gap.
        if start != last_end:
            print("start:", start, "last_end:", last_end)
        last_end = end

In [None]:
# Compare the number of bins requested with the number of values stats() returned.
print(f"length should be {n_bins}, is {len(all_means)}")

In [None]:
# Cross-check the mean of the trailing partial bin of chr1 in two ways:
# (sum over the bin) / (bin width) versus the mean reported by pyBigWig.
last_bin_start = chrom_sizes["chr1"] - last_bin_rem
last_bin_sum = monocyte_bw_k79.stats("chr1", last_bin_start, chrom_sizes["chr1"], type="sum", exact=True)[0]
# `exact=True` belongs to stats(); it used to be passed to print(), which
# raises TypeError because print() has no such keyword argument.
last_bin_mean = monocyte_bw_k79.stats("chr1", last_bin_start, chrom_sizes["chr1"], type="mean", exact=True)[0]
print("so last bin mean should be ", last_bin_sum / last_bin_rem)
print("last bin mean is actually ", last_bin_mean)

In [None]:
# Start offset of each chromosome when all chromosomes are laid end to end in
# chroms() order: running total of the lengths, with a leading 0 and without
# the grand total.
# The accumulator is pinned to int64 because a whole-genome total can exceed
# the int32 range on platforms whose default NumPy integer is 32-bit.
np.cumsum([0] + list(chrom_sizes.values()), dtype=np.int64)[:-1]

In [None]:
# Same per-bin query over chr1, this time without explicit start/end and with
# the result requested as a NumPy array.
monocyte_bw_k79.stats(
    "chr1",
    nBins=int(n_bins),
    numpy=True,
)