**Set environment**

In [1]:
# Load the project configuration into this session, then print the
# directories it resolved (show_env is presumably defined by that config).
. ../config/config_duke.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



## Set script

In [2]:
# Write the Slurm job wrapper. The heredoc delimiter is quoted so that
# nothing inside is expanded while the file is being written.
cat > run_converage_region_pair_score_subset.sh << 'EOF'
#!/bin/bash
#
# Summarize Hi-C contact values for one file of region pairs.
#
# Usage:
#   run_converage_region_pair_score_subset.sh \
#       FP_HIC FP_INP FP_OUT \
#       TXT_CHROMOSOME TXT_NORMALIZATION VAL_RESOLUTION VAL_NUM_LINE_SHOW
#
# Arguments:
#   FP_HIC             path of the .hic file
#   FP_INP             gzipped input file of region pairs
#   FP_OUT             gzipped output file to write
#   TXT_CHROMOSOME     chromosome name, e.g. chr11
#   TXT_NORMALIZATION  normalization name passed to the Python script, e.g. RU
#   VAL_RESOLUTION     resolution passed to the Python script, e.g. 100
#   VAL_NUM_LINE_SHOW  print a progress message every N input lines
#
# Must be started from the directory that holds this script, because the
# config file and the Python script are addressed by relative paths.
#
# Exit status:
#   0  success
#   1  config file not readable
#   2  wrong number of arguments
#   otherwise the exit status of the Python step

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID:-}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### check arguments before doing any work
if (( $# != 7 )); then
    echo "ERROR: expected 7 arguments, got $#" >&2
    echo "Usage: $0 FP_HIC FP_INP FP_OUT TXT_CHROMOSOME" \
         "TXT_NORMALIZATION VAL_RESOLUTION VAL_NUM_LINE_SHOW" >&2
    exit 2
fi

### set I/O
FP_CONFIG=../config/config_duke.sh
if [[ ! -r "${FP_CONFIG}" ]]; then
    echo "ERROR: cannot read config file: ${FP_CONFIG}" >&2
    exit 1
fi
source "${FP_CONFIG}"
FP_HIC=$1
FP_INP=$2
FP_OUT=$3

TXT_CHROMOSOME=$4      # chr11
TXT_NORMALIZATION=$5   # 'RU'
VAL_RESOLUTION=$6      # 100
VAL_NUM_LINE_SHOW=$7   # 10_000

### show I/O file
echo "Input: " "${FP_INP}"
echo
echo "show first few lines of input"
zcat "${FP_INP}" | head -5
echo

### execute
RUN_SING=${FD_PRJ}/notebooks/sing_proj_encode_fcc.sh
"${RUN_SING}" python run_coverage_region_pair_score_subset.py \
    --fpath_hic "${FP_HIC}" \
    --fpath_inp "${FP_INP}" \
    --fpath_out "${FP_OUT}" \
    --chromosome    "${TXT_CHROMOSOME}"    \
    --normalization "${TXT_NORMALIZATION}" \
    --resolution    "${VAL_RESOLUTION}" \
    --num_line_show "${VAL_NUM_LINE_SHOW}"
status=$?

### stop here on failure so that the job is not reported as successful
if (( status != 0 )); then
    echo "ERROR: run_coverage_region_pair_score_subset.py" \
         "failed with exit status ${status}" >&2
    exit "${status}"
fi

### show output file
echo
echo "Output: " "${FP_OUT}"
echo
echo "show first few lines of output:"
zcat "${FP_OUT}" | head -5
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
# displaytime is not defined in this script; presumably it comes from the
# sourced config file.
echo "Run Time: $(displaytime "${runtime}")"
EOF

chmod +x run_converage_region_pair_score_subset.sh

In [3]:
# Write the Python analysis script. The heredoc delimiter is quoted so that
# nothing inside is expanded while the file is being written.
cat > run_coverage_region_pair_score_subset.py << 'EOF'
"""Summarize Hi-C contact values for pairs of genomic regions.

Each line of the gzipped input file describes one region pair as

    chrom:start-end<TAB>chrom:start-end

For every pair the Hi-C matrix is queried with the two coordinate ranges
and descriptive statistics of its nonzero entries are written as one
tab-separated line to the gzipped output file. Pairs whose matrix has no
nonzero entry are not written. Blank input lines are skipped.

Output columns:
    chrom1, start1, end1, chrom2, start2, end2, name,
    mean, std, 25% quantile, median, 75% quantile, min, max,
    fraction of nonzero entries
where name is the two input regions joined by "|" and all statistics are
rounded to 5 decimals.
"""
### Set environment
import numpy as np
import itertools as it
import functools
import os, sys, gzip
import argparse
import hicstraw
import re

print = functools.partial(print, flush=True)

### Separators inside a "chrom:start-end<TAB>chrom:start-end" line;
### compiled once instead of once per input line
PATTERN_FIELD_SEP = re.compile(":|-|\t")
NUM_FIELDS = 6


def parse_arguments():
    """Parse the command line and return the argparse namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--fpath_hic',     type=str, required=True)
    parser.add_argument('--fpath_inp',     type=str, required=True)
    parser.add_argument('--fpath_out',     type=str, required=True)
    parser.add_argument('--normalization', type=str, required=True)
    parser.add_argument('--resolution',    type=int, required=True)
    parser.add_argument('--chromosome',    type=str, required=True)
    parser.add_argument('--num_line_show', type=int, required=True)
    args = parser.parse_args()

    ### the value is used as a modulus below, so zero cannot be accepted
    if args.num_line_show == 0:
        parser.error("--num_line_show must not be 0")
    return args


def parse_region_pair(line_inp, line_idx):
    """Split one stripped input line into its six text fields.

    Returns [chrom1, start1, end1, chrom2, start2, end2] as strings.
    Raises ValueError naming the line when it does not have six fields.
    """
    fields = PATTERN_FIELD_SEP.split(line_inp)
    if len(fields) != NUM_FIELDS:
        raise ValueError(
            "Line {}: expected 'chrom:start-end<TAB>chrom:start-end', "
            "got {!r}".format(line_idx, line_inp))
    return fields


def summarize_nonzero(mat_numpy):
    """Return rounded statistics of the nonzero entries of a matrix.

    Returns None when the matrix has no nonzero entry; otherwise the list
    [mean, std, q25, median, q75, min, max, fraction_nonzero], each value
    rounded to 5 decimals.
    """
    mat_nonzero = mat_numpy[np.nonzero(mat_numpy)]
    if mat_nonzero.size == 0:
        return None

    ### the quantiles are computed one by one on purpose so that every
    ### value keeps the dtype (and text form) of a scalar reduction
    lst = [
        np.mean(mat_nonzero),
        np.std(mat_nonzero),
        np.quantile(mat_nonzero, 0.25),
        np.quantile(mat_nonzero, 0.5),
        np.quantile(mat_nonzero, 0.75),
        np.min(mat_nonzero),
        np.max(mat_nonzero),
        mat_nonzero.size / mat_numpy.size
    ]
    return [np.round(x, decimals = 5) for x in lst]


def main():
    """Read the region pairs, query the Hi-C matrix, write the statistics."""
    ### Parse arguments
    args = parse_arguments()
    FP_HIC = args.fpath_hic
    FP_INP = args.fpath_inp
    FP_OUT = args.fpath_out

    TXT_CHROMOSOME    = args.chromosome
    TXT_NORMALIZATION = args.normalization
    VAL_RESOLUTION    = args.resolution
    VAL_NUM_LINE_SHOW = args.num_line_show

    ### Import data: Hi-C
    ### NOTE(review): the matrix is always taken from TXT_CHROMOSOME; the
    ### chromosome names found in the input lines are not compared to it.
    hic = hicstraw.HiCFile(FP_HIC)
    mat_object = hic.getMatrixZoomData(
        TXT_CHROMOSOME,
        TXT_CHROMOSOME,
        "observed",
        TXT_NORMALIZATION,
        "BP",
        VAL_RESOLUTION)

    ### Read data and write results to output file
    with gzip.open(FP_INP,'rt') as finp, gzip.open(FP_OUT,'wt') as fout:

        ### for a quick test, replace finp by it.islice(finp, 20)
        for line_idx, line_inp in enumerate(finp):

            ### show progress
            if (line_idx % VAL_NUM_LINE_SHOW) == 0:
                print("Process Line:", line_idx)

            ### skip blank lines (e.g. a trailing empty line)
            line_inp = line_inp.strip()
            if not line_inp:
                continue

            ### parse info
            (txt_chrom1, val_start1, val_end1,
             txt_chrom2, val_start2, val_end2) = parse_region_pair(
                line_inp, line_idx)

            ### get hic matrix
            mat_numpy = mat_object.getRecordsAsMatrix(
                int(val_start1),
                int(val_end1),
                int(val_start2),
                int(val_end2)
            )

            ### calculate descriptive stats on nonzero values
            lst = summarize_nonzero(mat_numpy)
            if lst is None:
                continue

            ### concatenate results into line for output file
            txt_scores = "\t".join(map(str, lst))
            txt_name   = "|".join(line_inp.split())
            txt_loc    = "\t".join([
                txt_chrom1, val_start1, val_end1,
                txt_chrom2, val_start2, val_end2])

            line_out = "\t".join([txt_loc, txt_name, txt_scores])
            fout.write(line_out + "\n")


if __name__ == "__main__":
    main()
EOF

## Execute

**Test loops**

In [4]:
### Dry run: print the paths that would be used for every input file of one
### chromosome, without submitting anything.
TXT_CHROMOSOME=chrX

FOLDER_LOOP=hic_intact_K562_deep
FOLDER_REG=region_pair_${TXT_CHROMOSOME}

FP_HIC=${FD_RES}/source/${FOLDER_LOOP}/inter.hic
FD_INP=${FD_RES}/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/${FOLDER_REG}
FD_OUT=${FD_RES}/results/${FOLDER_LOOP}/coverage_astarrseq_peak_macs_input/${FOLDER_REG}

### collect the input files with a glob instead of parsing `ls` output;
### an unmatched glob stays literal, so keep only paths that exist
FP_INPS=()
for FP_INP in "${FD_INP}"/region_pair*chr*.SUBSET*.tsv.gz; do
    if [[ -e "${FP_INP}" ]]; then
        FP_INPS+=("${FP_INP}")
    fi
done
if (( ${#FP_INPS[@]} == 0 )); then
    echo "WARNING: no input file matches ${FD_INP}/region_pair*chr*.SUBSET*.tsv.gz" >&2
fi

for FP_INP in "${FP_INPS[@]}"; do

    ### set I/O file
    ### ARRAY holds the dot-separated parts of the file name,
    ### e.g. region_pair.chrX.SUBSET1.tsv.gz -> ARRAY[1]=chrX ARRAY[2]=SUBSET1
    FN_INP=$(basename "${FP_INP}")
    IFS='.' read -r -a ARRAY <<< "${FN_INP}"
    FN_OUT=${FN_INP}
    FN_LOG=coverage.${FOLDER_LOOP}.region_pair.${ARRAY[1]}.${ARRAY[2]}.%a.txt
    ### log name as displayed, with the literal %a replaced by 0
    FN_LOG_SHOW=${FN_LOG/\%a/0}

    FP_OUT=${FD_OUT}/${FN_OUT}

    echo "Chromosome:" "${TXT_CHROMOSOME}"
    echo
    echo "FP_HIC:"  "${FP_HIC}"
    echo "FP_INP:"  "${FP_INP}"
    echo "FP_OUT:"  "${FP_OUT}"
    echo
    echo "FN_INP:"  "${FN_INP}"
    echo "FN_OUT:"  "${FN_OUT}"
    ### the log directory is printed as the literal text ${FD_LOG} on purpose
    echo "FP_LOG:" '${FD_LOG}'"/${FN_LOG_SHOW}"
    echo
done

Chromosome: chrX

FP_HIC: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/source/hic_intact_K562_deep/inter.hic
FP_INP: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/region_pair_chrX/region_pair.chrX.SUBSET1.tsv.gz
FP_OUT: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/hic_intact_K562_deep/coverage_astarrseq_peak_macs_input/region_pair_chrX/region_pair.chrX.SUBSET1.tsv.gz

FN_INP: region_pair.chrX.SUBSET1.tsv.gz
FN_OUT: region_pair.chrX.SUBSET1.tsv.gz
FP_LOG: ${FD_LOG}/coverage.hic_intact_K562_deep.region_pair.chrX.SUBSET1.0.txt

Chromosome: chrX

FP_HIC: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/source/hic_intact_K562_deep/inter.hic
FP_INP: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/region_pair_chrX/region_pair.chrX.SUBSET2.tsv.gz
FP_OUT: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/hic_intact_K562_deep/coverag

**Execute**

In [5]:
### Submit one Slurm job per input file of one chromosome.

### init: choosing regions
TXT_CHROMOSOME=chrX
FOLDER_LOOP=hic_intact_K562_deep
FOLDER_REG=region_pair_${TXT_CHROMOSOME}

### init: set I/O file directory
FP_HIC=${FD_RES}/source/${FOLDER_LOOP}/inter.hic
FD_INP=${FD_RES}/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/${FOLDER_REG}
FD_OUT=${FD_RES}/results/${FOLDER_LOOP}/coverage_astarrseq_peak_macs_input/${FOLDER_REG}

### init: arguments
TXT_NORMALIZATION='RU'
VAL_RESOLUTION=100
VAL_NUM_LINE_SHOW=10_000

### guard: with an empty FD_RES the directories above would be created at
### the filesystem root, and an empty NODE would garble the sbatch command.
### NODE is not set in this cell; presumably it comes from the config file.
if [[ -z "${FD_RES:-}" || -z "${FD_LOG:-}" || -z "${NODE:-}" ]]; then
    echo "ERROR: FD_RES, FD_LOG and NODE must be set; no job submitted" >&2
else

    ### execute: set output directories; the log directory is created as
    ### well because Slurm does not create the directory of --output
    mkdir -p "${FD_OUT}" "${FD_LOG}"

    ### execute: collect the input files with a glob instead of parsing
    ### `ls` output; an unmatched glob stays literal, so keep only paths
    ### that exist
    FP_INPS=()
    for FP_INP in "${FD_INP}"/region_pair*chr*.SUBSET*.tsv.gz; do
        if [[ -e "${FP_INP}" ]]; then
            FP_INPS+=("${FP_INP}")
        fi
    done
    if (( ${#FP_INPS[@]} == 0 )); then
        echo "WARNING: no input file matches ${FD_INP}/region_pair*chr*.SUBSET*.tsv.gz" >&2
    fi

    ### execute: loop and process each input file
    for FP_INP in "${FP_INPS[@]}"; do

        ### set I/O file
        ### ARRAY holds the dot-separated parts of the file name, e.g.
        ### region_pair.chrX.SUBSET1.tsv.gz -> ARRAY[1]=chrX ARRAY[2]=SUBSET1
        FN_INP=$(basename "${FP_INP}")
        IFS='.' read -r -a ARRAY <<< "${FN_INP}"
        FN_OUT=${FN_INP}
        ### %a is replaced by Slurm with the array task index
        FN_LOG=coverage.${FOLDER_LOOP}.region_pair.${ARRAY[1]}.${ARRAY[2]}.%a.txt
        ### log name as displayed, with the literal %a replaced by 0
        FN_LOG_SHOW=${FN_LOG/\%a/0}

        FP_OUT=${FD_OUT}/${FN_OUT}

        echo "Chromosome:" "${TXT_CHROMOSOME}"
        echo
        echo "FP_HIC:"  "${FP_HIC}"
        echo "FP_INP:"  "${FP_INP}"
        echo "FP_OUT:"  "${FP_OUT}"
        echo
        echo "FN_INP:"  "${FN_INP}"
        echo "FN_OUT:"  "${FN_OUT}"
        ### the log directory is printed as the literal text ${FD_LOG}
        echo "FP_LOG:" '${FD_LOG}'"/${FN_LOG_SHOW}"
        echo
        sbatch -p "${NODE}" \
            --exclude dl-01 \
            --cpus-per-task 8 \
            --mem 30G \
            --output "${FD_LOG}/${FN_LOG}" \
            --array 0 \
            run_converage_region_pair_score_subset.sh \
                "${FP_HIC}" \
                "${FP_INP}" \
                "${FP_OUT}" \
                "${TXT_CHROMOSOME}" \
                "${TXT_NORMALIZATION}" \
                "${VAL_RESOLUTION}" \
                "${VAL_NUM_LINE_SHOW}" \
            || echo "ERROR: sbatch failed for ${FN_INP}" >&2
        echo
    done
fi

Chromosome: chrX

FP_HIC: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/source/hic_intact_K562_deep/inter.hic
FP_INP: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/region_pair_chrX/region_pair.chrX.SUBSET1.tsv.gz
FP_OUT: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/hic_intact_K562_deep/coverage_astarrseq_peak_macs_input/region_pair_chrX/region_pair.chrX.SUBSET1.tsv.gz

FN_INP: region_pair.chrX.SUBSET1.tsv.gz
FN_OUT: region_pair.chrX.SUBSET1.tsv.gz
FP_LOG: ${FD_LOG}/coverage.hic_intact_K562_deep.region_pair.chrX.SUBSET1.0.txt

Submitted batch job 30033397

Chromosome: chrX

FP_HIC: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/source/hic_intact_K562_deep/inter.hic
FP_INP: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/region_pair_chrX/region_pair.chrX.SUBSET2.tsv.gz
FP_OUT: /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/result

**Check results**

In [6]:
# Show the job log of the first chrX subset (array task 0).
cat "${FD_LOG}/coverage.hic_intact_K562_deep.region_pair.chrX.SUBSET1.0.txt"

Hostname:           x1-03-2.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         07-03-23+11:43:24

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/region_pair_chrX/region_pair.chrX.SUBSET1.tsv.gz

show first few lines of input
chrX:10014456-10015536	chrX:10014456-10015536
chrX:10014456-10015536	chrX:10047006-10047716
chrX:10014456-10015536	chrX:100649726-100650005
chrX:10014456-10015536	chrX:100670968-100671799
chrX:10014456-10015536	chrX:100674743-100675253

Process Line: 0


In [55]:
# Show the job log of a chr2 run (array task 0).
# NOTE(review): this log name has no SUBSET component, unlike the log names
# built in the submit loop, so it appears to come from an earlier run - confirm.
cat "${FD_LOG}/coverage.hic_intact_K562_deep.region_pair.chr2.0.txt"

Hostname:           x1-03-1.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         06-30-23+12:58:21

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/region_pair/region_pair.chr2.tsv.gz

show first few lines of input
chr2:100047054-100047450	chr2:100047054-100047450
chr2:100047054-100047450	chr2:100104939-100105313
chr2:100047054-100047450	chr2:100177831-100178695
chr2:100047054-100047450	chr2:100179842-100180130
chr2:100047054-100047450	chr2:100197644-100198321

Process Line: 0
Process Line: 10
Process Line: 20

Output:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/hic_intact_K562_deep/coverage_astarrseq_peak_macs_input/region_pair.chr2.tsv.gz

show first few lines of output:
chr2	100047054	100047450	chr2	100047054	100047450	chr2:100047054-100047450|chr2:100047054-100047450	13.416	5.405	8.732	13.341	15.215	5.83	23.01	0.64
chr2	100047054	100047450	chr2	100177831	100178695	chr2:100047054-100047450|chr2:100177