**Set environment**

In [1]:
# Load the project configuration from the path relative to this notebook,
# then print the resolved directory settings.
# show_env is presumably defined by the sourced config file -- confirm there.
source ../config/config_duke.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



In [2]:
# List the contents of the shared scripts directory, one entry per line.
ls -1 ../scripts

[0m[38;5;34mrun_bedtools_annot_A.sh[0m
[38;5;34mrun_bedtools_annot.sh[0m
[38;5;34mrun_bedtools_closest.sh[0m
[38;5;34mrun_bedtools_intersect_inv.sh[0m
[38;5;34mrun_bedtools_intersect.sh[0m
[38;5;34mrun_bedtools_merge_assay.sh[0m
[38;5;34mrun_bedtools_merge_region.sh[0m
[38;5;34mrun_bedtools_test.sh[0m
run_hicstraw_aggregate.py
[38;5;34mrun_hicstraw_aggregate.sh[0m
[38;5;34mrun_sing_proj_encode_fcc.sh[0m
run_test.py


In [3]:
cat > ../scripts/run_hicstraw_aggregate.py << 'EOF'
"""Collect Hi-C sub-matrices around pairs of genomic positions.

For every data line of the input file (the first line is treated as a header
and skipped) the line is split on ':', '-' or TAB and the first six tokens are
read as (chrom1, start1, end1, chrom2, start2, end2).  A square window of
+/- ``--window_size`` bp around start1 (rows) and start2 (columns) is pulled
from the .hic file, and all windows are stacked into one array that is written
with ``numpy.save`` to ``--fpath_out``.
"""
### Set environment
import numpy as np
import itertools as it
import functools
import re, os, sys, gzip
import argparse
import hicstraw

print = functools.partial(print, flush=True)


def parse_bool(txt):
    """Convert a command-line string into a bool.

    ``type=bool`` cannot be used with argparse because ``bool("False")`` is
    True (any non-empty string is truthy), so the flag could never be
    switched off from the command line.
    """
    if isinstance(txt, bool):
        return txt
    txt_lower = txt.strip().lower()
    if txt_lower in ("true", "t", "yes", "y", "1"):
        return True
    if txt_lower in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got: {!r}".format(txt))


### Parse arguments and set global variables

parser = argparse.ArgumentParser()
parser.add_argument('--fpath_hic',     type=str,  required=True, help="path of the .hic file")
parser.add_argument('--fpath_inp',     type=str,  required=True, help="path of the input table of position pairs (with header line)")
parser.add_argument('--fpath_out',     type=str,  required=True, help="path of the output .npy file")
parser.add_argument('--normalization', type=str,  required=True, help="NONE, VC, VC_SQRT, KR, SCALE, etc")
parser.add_argument('--resolution',    type=int,  required=True, help="bin size in bp")
parser.add_argument('--window_size',   type=int,  required=True, help="half width (bp) of the window around each start position")
parser.add_argument('--data_type',     type=str,  required=True, help="'observed' or 'oe' (observed/expected)")
parser.add_argument('--verbose',       type=parse_bool, default=True)
parser.add_argument('--num_line_show', type=int,  default=1,    help="print progress every N lines")
parser.add_argument('--num_line_read', type=int,  default=None, help="read at most N data lines (default: all)")

args = parser.parse_args()
TXT_FPATH_HIC     = args.fpath_hic
TXT_FPATH_INP     = args.fpath_inp
TXT_FPATH_OUT     = args.fpath_out

TXT_NORMALIZATION = args.normalization
NUM_RESOLUTION    = args.resolution
NUM_WINDOW_SIZE   = args.window_size
TXT_DATA_TYPE     = args.data_type

VERBOSE       = args.verbose
NUM_LINE_SHOW = args.num_line_show
NUM_LINE_READ = args.num_line_read

### chromosome pair of the currently loaded matrix object (None = nothing loaded yet)
TXT_CHROM1 = None
TXT_CHROM2 = None
LST_MATRIX = []

### show global variables
if VERBOSE:
    print("===== Run script =====")
    print("Script: run_hicstraw_aggregate.py")
    print("")
    
    print("==== Arguments ====")
    print("Hi-C   file:   ", TXT_FPATH_HIC)
    print("Input  file:   ", TXT_FPATH_INP)
    print("Output file:   ", TXT_FPATH_OUT)
    print("")
    
    print("Normalization: ", TXT_NORMALIZATION)
    print("Data type:     ", TXT_DATA_TYPE)
    print("Resolution:    ", NUM_RESOLUTION)
    print("Window size:   ", NUM_WINDOW_SIZE)
    print("#Lines show:   ", NUM_LINE_SHOW)
    print("#Lines read:   ", NUM_LINE_READ)
    print("")
    
### Import data: Hi-C
hic = hicstraw.HiCFile(TXT_FPATH_HIC)

### read lines of input file
with open(TXT_FPATH_INP,'r') as finp:

    ### get header (consumed from the file handle itself; its content is not parsed)
    cnames = finp.readline()
    
    ### set read lines: at most NUM_LINE_READ data lines (None = read everything)
    lines_inp = it.islice(finp, NUM_LINE_READ)
    
    ### progress each line
    for line_idx, line_inp in enumerate(lines_inp):
        
        ### parse info
        line_inp = line_inp.strip()
        
        ### skip blank lines (e.g. a trailing newline at the end of the file)
        if not line_inp:
            continue
        
        ### only the first six tokens (two chrom/start/end triplets) are used
        lst = re.split(":|-|\t", line_inp)
        if len(lst) < 6:
            raise ValueError(
                "Data line {}: expected at least 6 coordinate fields, got {}: {!r}".format(
                    line_idx, len(lst), line_inp))
        txt_chrom1, num_start1, num_end1 = lst[0:3]
        txt_chrom2, num_start2, num_end2 = lst[3:6]
        
        ### update hic matrix object (only when the chromosome pair changes)
        if (TXT_CHROM1 != txt_chrom1) or (TXT_CHROM2 != txt_chrom2):
            TXT_CHROM1 = txt_chrom1
            TXT_CHROM2 = txt_chrom2
            
            mat_object = hic.getMatrixZoomData(
                TXT_CHROM1, 
                TXT_CHROM2, 
                TXT_DATA_TYPE,
                TXT_NORMALIZATION, 
                "BP", 
                NUM_RESOLUTION)
        
        ### show progress (a non-positive NUM_LINE_SHOW disables the per-line report)
        if VERBOSE and (NUM_LINE_SHOW > 0):
            if (line_idx % NUM_LINE_SHOW) == 0:
                print("Process Line:", line_idx)
                print(txt_chrom1, num_start1, num_end1)
                print(txt_chrom2, num_start2, num_end2)
                print("")
                
        ### get hic matrix: window centred on the two start coordinates
        ### NOTE(review): the window is not clipped to the chromosome bounds;
        ### confirm how hicstraw treats coordinates below 0 or past the end.
        num_center1 = int(num_start1)
        num_center2 = int(num_start2)
        mat_numpy = mat_object.getRecordsAsMatrix(
            num_center1 - NUM_WINDOW_SIZE,
            num_center1 + NUM_WINDOW_SIZE,
            num_center2 - NUM_WINDOW_SIZE,
            num_center2 + NUM_WINDOW_SIZE
        )
        
        ### collect hic matrix
        LST_MATRIX.append(mat_numpy)

### save the stacked matrices (create the output folder first if it is missing)
os.makedirs(os.path.dirname(os.path.abspath(TXT_FPATH_OUT)), exist_ok=True)
with open(TXT_FPATH_OUT, 'wb') as fout:
    arr = np.array(LST_MATRIX)
    np.save(fout, arr)

if VERBOSE:
    print("")
    print("==== Save results ====")
    print("Array shape:", arr.shape)
    print("")

EOF

In [4]:
cat > ../scripts/run_hicstraw_aggregate.sh << 'EOF'
#!/bin/bash
# Run run_hicstraw_aggregate.py through the project wrapper script
# (run_sing_proj_encode_fcc.sh) and report timing.
#
# Usage:
#   run_hicstraw_aggregate.sh FP_HIC FP_INP FP_OUT NORMALIZATION RESOLUTION WINDOW_SIZE
#
# The exit status of the Python step is returned as the exit status of this
# script, so a failed run is not reported as a success.

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID:-}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### check arguments
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 FP_HIC FP_INP FP_OUT NORMALIZATION RESOLUTION WINDOW_SIZE" >&2
    exit 1
fi

### set I/O
### (the config path is relative, so the script must be started from a
###  directory where ../config/config_duke.sh resolves)
source ../config/config_duke.sh
FP_HIC=$1
FP_INP=$2
FP_OUT=$3

TXT_NORMALIZATION=$4 # e.g. 'RU'
NUM_RESOLUTION=$5    # e.g. 100
NUM_WINDOW_SIZE=$6   # e.g. 10,000
TXT_DATA_TYPE=oe

### show I/O file
echo "Input: " "${FP_INP}"
echo
echo "show first few lines of input"
fun_head "${FP_INP}"
echo

### execute
FD_RUN=${FD_PRJ}/notebooks/scripts
FP_RUN=${FD_RUN}/run_sing_proj_encode_fcc.sh

"${FP_RUN}" python "${FD_RUN}/run_hicstraw_aggregate.py" \
    --fpath_hic     "${FP_HIC}" \
    --fpath_inp     "${FP_INP}" \
    --fpath_out     "${FP_OUT}" \
    --normalization "${TXT_NORMALIZATION}" \
    --resolution    "${NUM_RESOLUTION}"    \
    --window_size   "${NUM_WINDOW_SIZE}"   \
    --data_type     "${TXT_DATA_TYPE}"
exit_code=$?
    
### show output file
echo
echo "Output: " "${FP_OUT}"
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
if [[ ${exit_code} -eq 0 ]]; then
    echo 'Done!'
else
    echo "Failed with exit code ${exit_code}" >&2
fi
echo "Run Time: $(displaytime ${runtime})"

exit ${exit_code}

EOF

chmod +x ../scripts/run_hicstraw_aggregate.sh

In [5]:
# Path of the Hi-C file, built from FOLDER so the dataset name is set in one place.
FOLDER=hic_intact_K562_deep
FP_HIC=${FD_RES}/source/${FOLDER}/inter.hic
echo "${FP_HIC}"

/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/source/hic_intact_K562_deep/inter.hic


In [6]:
# List the contents of the coverage_aggregate results folder.
ls ${FD_RES}/results/hic_intact_K562_deep/coverage_aggregate/

[0m[38;5;27maggregate_crispri_hcrff_window10kb_bin200[0m
[38;5;27maggregate_crispri_hcrff_window20kb_bin100[0m
[38;5;27maggregate_crispri_hcrff_window20kb_bin200[0m
[38;5;27maggregate_crispri_hcrff_window20kb_bin500[0m
[38;5;27maggregate_crispri_hcrff_window50kb_bin500[0m
location_pair.TSS.crispri_hcrff.bedpe
location_pair.TSS.crispri_hcrff.chr11.bedpe
location_pair.TSS.crispri_hcrff.chr12.bedpe
location_pair.TSS.crispri_hcrff.chr4.bedpe
location_pair.TSS.crispri_hcrff.chr5.bedpe
location_pair.TSS.crispri_hcrff.chr6.bedpe
location_pair.TSS.crispri_hcrff.chr8.bedpe
location_pair.TSS.crispri_hcrff.chrX.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.chr11.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.chr12.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.chr4.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.chr5.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.chr6.bedpe
location_pair.TSS.crispri_hcrff.filter_1kb.chr8.bedpe


In [7]:
# Preview the chromosome labels: print each raw label followed by its "chr" name.
for NUM_CHROMOSOME in 4 5 6 8 11 12 X; do
    TXT_CHROMOSOME="chr${NUM_CHROMOSOME}"
    printf '%s\n%s\n' "${NUM_CHROMOSOME}" "${TXT_CHROMOSOME}"
done

4
chr4
5
chr5
6
chr6
8
chr8
11
chr11
12
chr12
X
chrX


In [8]:
# Submit one Slurm job per chromosome; each job runs run_hicstraw_aggregate.sh
# on that chromosome's BEDPE file and writes one .npy file and one log file.
FD_RUN=${FD_PRJ}/notebooks/scripts
FP_RUN=${FD_RUN}/run_hicstraw_aggregate.sh

FOLDER=hic_intact_K562_deep
FP_HIC=${FD_RES}/source/${FOLDER}/inter.hic
FD_INP=${FD_RES}/results/${FOLDER}/coverage_aggregate
FD_OUT=${FD_RES}/results/${FOLDER}/coverage_aggregate/aggregate_crispri_hcrff_window20kb_bin200

TXT_NORMALIZATION="RU"
NUM_RESOLUTION=200
NUM_WINDOW_SIZE=20000

# Make sure the output and log folders exist before any job tries to write to them.
mkdir -p "${FD_OUT}" "${FD_LOG}"

for NUM_CHROMOSOME in 4 5 6 8 11 12 X; do
    ### chromosome name
    TXT_CHROMOSOME=chr${NUM_CHROMOSOME}
    echo "${TXT_CHROMOSOME}"
    
    ### input / output / log files of this chromosome
    FN_INP=location_pair.TSS.crispri_hcrff.${TXT_CHROMOSOME}.bedpe
    FN_OUT=location_pair.TSS.crispri_hcrff.${TXT_CHROMOSOME}.npy
    FN_LOG=hic.aggregate.TSS.crispri_hcrff.${TXT_CHROMOSOME}.txt
    
    FP_INP=${FD_INP}/${FN_INP}
    FP_OUT=${FD_OUT}/${FN_OUT}
    FP_LOG=${FD_LOG}/${FN_LOG}
    
    ### submit
    ### NOTE(review): NODE is not set in this notebook; presumably it comes
    ### from the sourced config -- confirm.
    sbatch -p "${NODE}" \
        --exclude dl-01 \
        --cpus-per-task 8 \
        --mem 30G \
        --output "${FP_LOG}" \
        --array 0 \
        "${FP_RUN}" \
            "${FP_HIC}" "${FP_INP}" "${FP_OUT}" \
            "${TXT_NORMALIZATION}" \
            "${NUM_RESOLUTION}"    \
            "${NUM_WINDOW_SIZE}"
    echo
done

# Direct run without Slurm (for debugging); arguments follow the script's
# six-argument signature:
#${FP_RUN} ${FP_HIC} ${FP_INP} ${FP_OUT} "RU" 500 10000

chr4
Submitted batch job 30458639

chr5
Submitted batch job 30458640

chr6
Submitted batch job 30458641

chr8
Submitted batch job 30458642

chr11
Submitted batch job 30458643

chr12
Submitted batch job 30458644

chrX
Submitted batch job 30458645



In [10]:
# Show the log of one job. FP_LOG still holds the value from the last
# iteration of the submission loop, so this prints the log of the last
# chromosome submitted.
echo "${FP_LOG}"
echo
cat  "${FP_LOG}"

/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/hic.aggregate.TSS.crispri_hcrff.chrX.txt

Hostname:           x1-01-3.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         11-29-23+09:46:40

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/hic_intact_K562_deep/coverage_aggregate/location_pair.TSS.crispri_hcrff.chrX.bedpe

show first few lines of input
#Chrom_Peak	Start_Peak	End_Peak	Chrom_TSS	Start_TSS	End_TSS	Name	Score
chrX	48783047	48783048	chrX	48786589	48786590	chrX:48782597-48783497|GATA1	2.231263465
chrX	48786547	48786548	chrX	48786589	48786590	chrX:48786097-48786997|GATA1	3.708137016
chrX	48786647	48786648	chrX	48802066	48802067	chrX:48786297-48786997|HDAC6	-1.425316268
chrX	48800747	48800748	chrX	48786589	48786590	chrX:48800197-48801297|GATA1	2.398983455
chrX	48800697	48800698	chrX	48802066	48802067	chrX:48800297-48801097|HDAC6	1.396661655
chrX	48802347	48802348	chrX	48802066	48802067	chrX:48801697-48802997|HDAC6	2.587553091

===== Run script =====

In [13]:
# Submit one Slurm job per chromosome for the "filter1kb" pair files; each job
# runs run_hicstraw_aggregate.sh and writes one .npy file and one log file.
FD_RUN=${FD_PRJ}/notebooks/scripts
FP_RUN=${FD_RUN}/run_hicstraw_aggregate.sh

FOLDER=hic_intact_K562_deep
FP_HIC=${FD_RES}/source/${FOLDER}/inter.hic
FD_INP=${FD_RES}/results/${FOLDER}/coverage_aggregate
FD_OUT=${FD_RES}/results/${FOLDER}/coverage_aggregate/aggregate_crispri_hcrff_window20kb_bin200_filter1kb

TXT_NORMALIZATION="RU"
NUM_RESOLUTION=200
NUM_WINDOW_SIZE=20000

# The output folder of this run may not exist yet; create it (and the log
# folder) so the jobs do not fail when they open their output files.
mkdir -p "${FD_OUT}" "${FD_LOG}"

for NUM_CHROMOSOME in 4 5 6 8 11 12 X; do
    ### chromosome name
    TXT_CHROMOSOME=chr${NUM_CHROMOSOME}
    echo "${TXT_CHROMOSOME}"
    
    ### input / output / log files of this chromosome
    ### NOTE(review): an earlier directory listing in this notebook shows
    ### inputs spelled "filter_1kb"; confirm the "filter1kb" spelling below
    ### matches the files on disk.
    FN_INP=location_pair.TSS.crispri_hcrff.filter1kb.${TXT_CHROMOSOME}.bedpe
    FN_OUT=location_pair.TSS.crispri_hcrff.filter1kb.${TXT_CHROMOSOME}.npy
    FN_LOG=hic.aggregate.TSS.crispri_hcrff.filter1kb.${TXT_CHROMOSOME}.txt
    
    FP_INP=${FD_INP}/${FN_INP}
    FP_OUT=${FD_OUT}/${FN_OUT}
    FP_LOG=${FD_LOG}/${FN_LOG}
    
    ### submit
    ### NOTE(review): NODE is not set in this notebook; presumably it comes
    ### from the sourced config -- confirm.
    sbatch -p "${NODE}" \
        --exclude dl-01 \
        --cpus-per-task 8 \
        --mem 30G \
        --output "${FP_LOG}" \
        --array 0 \
        "${FP_RUN}" \
            "${FP_HIC}" "${FP_INP}" "${FP_OUT}" \
            "${TXT_NORMALIZATION}" \
            "${NUM_RESOLUTION}"    \
            "${NUM_WINDOW_SIZE}"
    echo
done

# Direct run without Slurm (for debugging); arguments follow the script's
# six-argument signature:
#${FP_RUN} ${FP_HIC} ${FP_INP} ${FP_OUT} "RU" 500 10000

chr4
Submitted batch job 30458653

chr5
Submitted batch job 30458654

chr6
Submitted batch job 30458655

chr8
Submitted batch job 30458656

chr11
Submitted batch job 30458657

chr12
Submitted batch job 30458658

chrX
Submitted batch job 30458659



In [14]:
# Show the log of one job. FP_LOG still holds the value from the last
# iteration of the submission loop, so this prints the log of the last
# chromosome submitted.
echo "${FP_LOG}"
echo
cat  "${FP_LOG}"

/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/hic.aggregate.TSS.crispri_hcrff.filter1kb.chrX.txt

Hostname:           x1-01-3.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         11-29-23+09:50:36

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/hic_intact_K562_deep/coverage_aggregate/location_pair.TSS.crispri_hcrff.filter1kb.chrX.bedpe

show first few lines of input
#Chrom_Peak	Start_Peak	End_Peak	Chrom_TSS	Start_TSS	End_TSS	Name	Score
chrX	48783047	48783048	chrX	48786589	48786590	chrX:48782597-48783497|GATA1	2.231263465
chrX	48786647	48786648	chrX	48802066	48802067	chrX:48786297-48786997|HDAC6	-1.425316268
chrX	48800747	48800748	chrX	48786589	48786590	chrX:48800197-48801297|GATA1	2.398983455
chrX	48800697	48800698	chrX	48802066	48802067	chrX:48800297-48801097|HDAC6	1.396661655

===== Run script =====
Script: run_hicstraw_aggregate.py

==== Arguments ====
Hi-C   file:    /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/source/hic_intact_K562_deep/int