# Preparing Signal Data for an HMM
This notebook uses [__pyBedGraph__](https://github.com/TheJacksonLaboratory/pyBedGraph) to:
- convert the raw signal data in a bedGraph file into a mean signal value for each bin (the bin size is specified by the user)

In [1]:
import numpy as np
from pyBedGraph import BedGraph
from pathlib import Path

In [2]:
# The data directory sits two levels above the current working directory.
DATA_DIR = Path().resolve().parent.parent / "data"

# Inputs: the chromosome sizes table and the sorted H3K79me2 bedGraph for
# CD14-positive monocytes.
SIZES_FILE_PATH = DATA_DIR.joinpath("hg38.chrom.sizes")
BG_FILE_PATH = DATA_DIR.joinpath(
    "CD14-positive monocyte",
    "H3K79me2",
    "ENCFF676JWS.sorted.bedGraph",
)

# Build the BedGraph handle, restricted to chr1.
monocyte_bg_k79 = BedGraph(SIZES_FILE_PATH, BG_FILE_PATH, ["chr1"])

In [3]:
# Load the chr1 data into the BedGraph object so later cells can query it.
# NOTE(review): presumably required before calling stats() — confirm against the pyBedGraph README.
monocyte_bg_k79.load_chrom_data("chr1")

In [9]:
# Bin width used by the interval-building cell below; presumably in base pairs — TODO confirm.
BIN_SIZE = 1000
# Display the chr1 entry of chromosome_map to confirm the chromosome is registered
# (the cell output is a Chrom_Data object).
monocyte_bg_k79.chromosome_map["chr1"]

<pyBedGraph.Chrom_Data.Chrom_Data at 0x7f849c6c44f0>

In [5]:
# Disabled: bin loading is not executed in this notebook; the stats() call further
# down is given explicit start/end interval arrays instead.
# NOTE(review): presumably only needed for bin-based (approximate) statistics —
# confirm against the pyBedGraph docs, then re-enable or delete this cell.
# monocyte_bg_k79.load_chrom_bins("chr1", BIN_SIZE)

Sanity check: adding a scalar to a NumPy array is applied element-wise.

In [6]:
# Integers 0..9, followed by the same array with 1 added to every element.
v = np.arange(10)
print(v, v + 1, sep="\n")

[0 1 2 3 4 5 6 7 8 9]
[ 1  2  3  4  5  6  7  8  9 10]


In [15]:
# Window layout: a window opens every 100 bins from coordinate 300,000 up to
# (but not including) 2,000,000, and each window spans 10 bins.
# `starts` is built as int32; `ends` is derived from it by a constant offset.
starts = np.arange(300_000, 2_000_000, BIN_SIZE * 100, dtype=np.int32)
ends = BIN_SIZE * 10 + starts

print("starts:", starts)
print("ends:", ends)

starts: [ 300000  400000  500000  600000  700000  800000  900000 1000000 1100000
 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000]
ends: [ 310000  410000  510000  610000  710000  810000  910000 1010000 1110000
 1210000 1310000 1410000 1510000 1610000 1710000 1810000 1910000]


In [16]:
# Query one statistic per [start, end) window on chr1.
# NOTE(review): no statistic is named in the call, so this relies on the library
# default — presumably the mean, as the variable name suggests; confirm.
bin_means = monocyte_bg_k79.stats(
    chrom_name="chr1",
    start_list=starts,
    end_list=ends,
)
bin_means

array([0.22082   , 0.22082   , 0.22082   , 0.22082   , 0.22082   ,
       0.22082   , 0.19897   , 0.22082   , 0.16183   , 0.22082   ,
       0.26182728, 8.38443125, 0.22082   , 0.62931   , 0.22082   ,
       6.25424   , 0.62931   ])