# Preparing Signal Data for an HMM
Using [__pyBedGraph__](https://github.com/TheJacksonLaboratory/pyBedGraph) to ...
- convert raw signal data in a bedGraph file into a mean signal value for each bin (the bin size is specified by the user)

In [8]:
from pathlib import Path

import numpy as np
import pyBigWig
import torch
from pyBedGraph import BedGraph

In [9]:
# The data folder sits two directory levels above the notebook's working directory.
DATA_DIR = Path().resolve().parent.parent.joinpath("data")
# Chromosome sizes file handed to the BedGraph constructor below.
SIZES_FILE_PATH = DATA_DIR.joinpath("hg38.chrom.sizes")

In [12]:
# Sorted bedGraph with the H3K79me2 signal for the CD14-positive monocyte sample.
BG_FILE_PATH = DATA_DIR.joinpath(
    "CD14-positive monocyte", "H3K79me2", "ENCFF676JWS.sorted.bedGraph"
)
# Build the pyBedGraph object, asking for chr14 only.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# object used by later cells may come from an earlier execution — confirm.
monocyte_bg_k79 = BedGraph(SIZES_FILE_PATH, BG_FILE_PATH, ["chr14"])

KeyboardInterrupt: 

In [13]:
# bigWig version of the same H3K79me2 track.
BW_FILE_PATH = DATA_DIR.joinpath(
    "CD14-positive monocyte", "H3K79me2", "ENCFF676JWS.bigWig"
)
# Show whether the file exists before trying to open it.
print(BW_FILE_PATH.is_file())
# pyBigWig is given a plain string path here rather than a Path object.
monocyte_bw_k79 = pyBigWig.open(str(BW_FILE_PATH.absolute()))
# Show what kind of handle came back.
print(type(monocyte_bw_k79))

True
<class 'pyBigWig.bigWigFile'>


In [15]:
# Bin width used when averaging the signal (presumably in base pairs — TODO confirm).
BIN_SIZE = 1_000
#monocyte_bg_k79.chromosome_map["chr14"]

In [None]:
# monocyte_bg_k79.load_chrom_data("chr1")

In [None]:
# Load pyBedGraph's bins for chr14, reusing the notebook-wide BIN_SIZE constant
# instead of repeating the literal 1000.
# NOTE(review): the recorded run raised KeyError: 'chr14'. Presumably chr14 was
# never loaded into this object because the constructor cell was interrupted —
# confirm by re-running that cell to completion first.
monocyte_bg_k79.load_chrom_bins("chr14", BIN_SIZE)

KeyError: 'chr14'

Sanity check: adding a scalar to a NumPy array is applied element-wise.

In [16]:
# The integers 0..9, printed as-is and then with 1 added to every element.
v = np.arange(10)
print(v)
print(1 + v)

[0 1 2 3 4 5 6 7 8 9]
[ 1  2  3  4  5  6  7  8  9 10]


In [None]:
# Window starts run from 300000 up to (not including) 2000000 in steps of 100 bins.
starts = np.arange(300000, 2000000, BIN_SIZE * 100, dtype=np.int32)
# Each window ends 10 bins after its start.
ends = starts + BIN_SIZE * 10
print("starts:", starts)
print("ends:", ends)

starts: [ 300000  400000  500000  600000  700000  800000  900000 1000000 1100000
 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000]
ends: [ 310000  410000  510000  610000  710000  810000  910000 1010000 1110000
 1210000 1310000 1410000 1510000 1610000 1710000 1810000 1910000]


In [None]:
# Per-window statistic from pyBedGraph for the start/end pairs built above
# (presumably the mean, going by the variable name — TODO confirm the default).
# NOTE(review): this queries chr1, while the constructor cell requests chr14 —
# confirm which chromosome is intended.
bin_means = monocyte_bg_k79.stats(
    chrom_name="chr1",
    start_list=starts,
    end_list=ends,
)
bin_means

array([0.22082   , 0.22082   , 0.22082   , 0.22082   , 0.22082   ,
       0.22082   , 0.19897   , 0.22082   , 0.16183   , 0.22082   ,
       0.26182728, 8.38443125, 0.22082   , 0.62931   , 0.22082   ,
       6.25424   , 0.62931   ])

In [17]:
# Chromosome name -> length mapping reported by the bigWig file.
chrom_sizes = monocyte_bw_k79.chroms()
# Number of whole BIN_SIZE-wide bins that fit on chr1. Floor division keeps this
# an int (the same value the later int(n_bins) call produced from the float) and
# ties the bin width to the BIN_SIZE constant instead of a repeated literal.
n_bins = chrom_sizes["chr1"] // BIN_SIZE

In [18]:
# Running total of chromosome lengths with a leading 0 and the grand total
# dropped, i.e. the cumulative offset at which each chromosome begins.
np.cumsum([0, *chrom_sizes.values()])[:-1]

array([         0,  248956422,  382753844,  517840466,  517940782,
        651216091,  765580419,  872624137,  872825846,  873017315,
        873228488,  873422538,  873460653,  873500208,  873673018,
        873716757,  975707946,  976156194, 1066494539, 1068367298,
       1151624739, 1151810330, 1152091169, 1152203720, 1232577005,
       1291194621, 1291369676, 1291401708, 1291529390, 1291596250,
       1291636426, 1291678636, 1291854679, 1291895424, 1291937141,
       1534130670, 1598574837, 1645284820, 1696103288, 1696254042,
       1696295585, 1696475357, 1696640407, 1696683218, 1696865138,
       1696968976, 1697068351, 1697142336, 1697303807, 1697457606,
       1895753165, 1895908562, 2086123117, 2086332826, 2267871085,
       2267963774, 2438769753, 2598115726, 2743254362, 2881649079,
       2881689141, 2881727195, 2881904040, 2881943090, 2881959659,
       2882142555, 2882306794, 2882444512, 2882621120, 2882782267,
       2882961465, 2883123267, 2883302960, 2883304432, 2883305

In [None]:
# Summarise chr1 from the bigWig into n_bins values, returned as a NumPy array.
monocyte_bw_k79.stats(
    "chr1",
    nBins=int(n_bins),
    numpy=True,
)

array([0.22081999, 0.22081999, 0.22081999, ...,        nan,        nan,
              nan])

# Filling in a Tensor

In [None]:
# Allocate a 10 x 3 tensor without initialising it. The values shown are
# whatever happened to be in memory, so this output differs between runs.
# (torch is imported with the other imports in the notebook's first code cell.)
big_tab = torch.empty((10, 3))
big_tab

  from .autonotebook import tqdm as notebook_tqdm


tensor([[ 2.8885e+23,  4.5804e-41, -3.1665e+19],
        [ 1.5085e+14,  4.2978e+23,  4.5804e-41],
        [ 2.8885e+23,  4.5804e-41, -2.6754e-17],
        [ 2.8527e-39,  4.2978e+23,  4.5804e-41],
        [ 2.8886e+23,  4.5804e-41,  4.2604e+23],
        [ 1.1916e+27,  4.4829e+23,  4.5804e-41],
        [ 2.8886e+23,  4.5804e-41,  1.3579e+16],
        [-1.8933e-13,  4.2979e+23,  4.5804e-41],
        [ 2.8886e+23,  4.5804e-41,  5.7234e-12],
        [ 2.1678e-33,  4.2979e+23,  4.5804e-41]])

In [None]:
# Full shape of the tensor as a torch.Size.
big_tab.size()

torch.Size([10, 3])

In [None]:
# Length of the first dimension (number of rows).
big_tab.size(0)

10

In [None]:
# Overwrite the tensor one row at a time: row i becomes (i + 1, i, i - 1).
for i in range(len(big_tab)):
    big_tab[i, :] = torch.tensor((i + 1, i, i - 1))
print(big_tab)

tensor([[ 1.,  0., -1.],
        [ 2.,  1.,  0.],
        [ 3.,  2.,  1.],
        [ 4.,  3.,  2.],
        [ 5.,  4.,  3.],
        [ 6.,  5.,  4.],
        [ 7.,  6.,  5.],
        [ 8.,  7.,  6.],
        [ 9.,  8.,  7.],
        [10.,  9.,  8.]])
