**Set environment**

In [1]:
import numpy as np
import itertools as it
import os, sys
import gzip
import hicstraw
import re

sys.path.append('..')
from config.config_sing import *
show_env()

You are in Singularity: singularity_proj_encode_fcc
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



In [2]:
import functools
print = functools.partial(print, flush=True)

**Check data**

In [3]:
fdiry = os.path.join(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input", "region_pair")
os.listdir(fdiry)

['region_pair_chr12',
 'region_pair_chr3',
 'region_pair.chr3.tsv.gz',
 'region_pair.chr2.tsv.gz',
 'region_pair.chr7.tsv.gz',
 'region_pair.chr22.tsv.gz',
 'region_pair.SUBSET.bedpe.gz',
 'region_pair_chr11',
 'region_pair.chrX.tsv.gz',
 'region_pair_chr5',
 '.ipynb_checkpoints',
 'region_pair.chr12.tsv.gz',
 'region_pair.chr18.tsv.gz',
 'region_pair_chr2',
 'region_pair.chr21.tsv.gz',
 'region_pair.chr4.tsv.gz',
 'region_pair.chr8.tsv.gz',
 'region_pair.chr9.tsv.gz',
 'region_pair.chr20.tsv.gz',
 'region_pair.chr6.tsv.gz',
 'region_pair.chr5.tsv.gz',
 'region_pair_chr4',
 'region_pair.chr13.tsv.gz',
 'region_pair_chr20',
 'region_pair_chr16',
 'region_pair.chr19.tsv.gz',
 'region_pair.chr1.tsv.gz',
 'region_pair_chrX',
 'region_pair.chr14.tsv.gz',
 'region_pair_chr19',
 'region_pair.chr16.tsv.gz',
 'region_pair.chr11.tsv.gz',
 'region_pair.chr10.tsv.gz',
 'region_pair_selected',
 'region_pair.chr15.tsv.gz',
 'region_pair_chr8',
 'region_pair_chr6',
 'region_pair.chr17.tsv.gz']

## Import data

**Read HiC data**

In [4]:
fdiry = os.path.join(FD_RES, "source", "hic_intact_K562_deep")
fname = "inter.hic"
fpath = os.path.join(fdiry, fname)
hic   = hicstraw.HiCFile(fpath)

print(hic)
print(hic.getGenomeID())
print(hic.getResolutions())

<hicstraw.HiCFile object at 0x7f75ec4d9df0>
hg38
[2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100, 50, 20, 10, 1]


**Read region pairs**

In [5]:
fdiry = os.path.join(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "region_pair")
fname = 'region_pair.chr11.tsv.gz'
fpath = os.path.join(fdiry, fname)

with gzip.open(fpath,'rt') as finp:
    lines = it.islice(finp, 5)
    for line in lines:
        line = line.strip()
        print(line)

chr11:1002205-1002620	chr11:1002205-1002620
chr11:1002205-1002620	chr11:100285883-100286264
chr11:1002205-1002620	chr11:100451963-100452380
chr11:1002205-1002620	chr11:100547023-100547250
chr11:1002205-1002620	chr11:100563051-100563718


In [6]:
line = 'chr11:1002205-1002620\tchr11:100563051-100563718'
re.split(":|-|\t", line)

['chr11', '1002205', '1002620', 'chr11', '100563051', '100563718']

## Test run

In [7]:
fdiry = os.path.join(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "region_pair")
fname = 'region_pair.chr11.tsv.gz'
fpath = os.path.join(fdiry, fname)

with gzip.open(fpath,'rt') as finp:
    lines = it.islice(finp, 5)
    for line in lines:
        line = line.strip()
        txt_chrom1, val_start1, val_end1, txt_chrom2, val_start2, val_end2 = re.split(":|-|\t", line)
        print(line)
        print(f"Chrom 1:  {txt_chrom1}")
        print(f"Chrom 2:  {txt_chrom2}")
        print(f"Length 1: {int(val_end1) - int(val_start1)}")
        print(f"Length 2: {int(val_end2) - int(val_start2)}")
        print()

chr11:1002205-1002620	chr11:1002205-1002620
Chrom 1:  chr11
Chrom 2:  chr11
Length 1: 415
Length 2: 415

chr11:1002205-1002620	chr11:100285883-100286264
Chrom 1:  chr11
Chrom 2:  chr11
Length 1: 415
Length 2: 381

chr11:1002205-1002620	chr11:100451963-100452380
Chrom 1:  chr11
Chrom 2:  chr11
Length 1: 415
Length 2: 417

chr11:1002205-1002620	chr11:100547023-100547250
Chrom 1:  chr11
Chrom 2:  chr11
Length 1: 415
Length 2: 227

chr11:1002205-1002620	chr11:100563051-100563718
Chrom 1:  chr11
Chrom 2:  chr11
Length 1: 415
Length 2: 667



In [8]:
fdiry = os.path.join(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "region_pair")
fname = 'region_pair.chr11.tsv.gz'
fpath = os.path.join(fdiry, fname)

txt_normalization = "RU"
val_resolution    = 100
txt_chrom         = "chr11"

matrix_object = hic.getMatrixZoomData(
    txt_chrom, 
    txt_chrom, 
    "observed", 
    txt_normalization, 
    "BP", 
    val_resolution)

with gzip.open(fpath,'rt') as finp:
    lines = it.islice(finp, 5)
    for line in lines:
        line = line.strip()
        txt_chrom1, val_start1, val_end1, txt_chrom2, val_start2, val_end2 = re.split(":|-|\t", line)
        print(line)
        print(f"Length 1: {int(val_end1) - int(val_start1)}")
        print(f"Length 2: {int(val_end2) - int(val_start2)}")
        print()
        
        matrix_numpy = matrix_object.getRecordsAsMatrix(
            int(val_start1),
            int(val_end1),
            int(val_start2),
            int(val_end2)
        )
        idx = np.nonzero(matrix_numpy)
        mat = matrix_numpy[idx]
        is_not_empty = np.any(mat)
        
        print("Matrix shape:", matrix_numpy.shape)
        print("Matrix:")
        print(matrix_numpy)
        print()
        print("Index of Nonzeros:")
        print(idx)
        print("Is not empty: ", is_not_empty)
        print()
        if(is_not_empty):
            lst = [
                np.mean(mat),
                np.std(mat),
                np.quantile(mat, 0.25),
                np.quantile(mat, 0.5),
                np.quantile(mat, 0.75),
                np.min(mat),
                np.max(mat),
                mat.flatten().shape[0]/matrix_numpy.flatten().shape[0]
            ]
            fun = lambda x: np.round(x, decimals = 3)
            lst = list(map(fun, lst))
            print(lst)
            print()

chr11:1002205-1002620	chr11:1002205-1002620
Length 1: 415
Length 2: 415

Matrix shape: (5, 5)
Matrix:
[[ 0.          0.          0.          0.          0.        ]
 [ 0.         26.44706535 29.87776566 20.80835724  9.71024513]
 [ 0.         29.87776566 27.55387497 54.10480881 20.8949604 ]
 [ 0.         20.80835724 54.10480881 72.76383209 61.84721375]
 [ 0.          9.71024513 20.8949604  61.84721375 46.56577682]]

Index of Nonzeros:
(array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4]), array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]))
Is not empty:  True

[35.489, 19.346, 20.873, 28.716, 54.105, 9.71, 72.764, 0.64]

chr11:1002205-1002620	chr11:100285883-100286264
Length 1: 415
Length 2: 381

Matrix shape: (1, 1)
Matrix:
[[0.]]

Index of Nonzeros:
(array([], dtype=int64), array([], dtype=int64))
Is not empty:  False

chr11:1002205-1002620	chr11:100451963-100452380
Length 1: 415
Length 2: 417

Matrix shape: (1, 1)
Matrix:
[[0.]]

Index of Nonzeros:
(array([], dtype=int64), ar