In [1]:
source ../config_duke.sh -v

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect



In [7]:
cat > motif_marginal_count.py << 'EOF'
###################################################
# Set environment
###################################################

### set environment
import argparse
import sys
sys.path.append("/home/mount/project")

###################################################
# Import packages and parse arguments
###################################################

### import packages and paths
from config_sing import *
import multiprocessing
from functools import reduce

### parse argument from command line
### NOTE: --target and --outdir are used as path components of the input and
###       output directories, so they are mandatory; argparse then reports a
###       clear usage error instead of a TypeError raised later by os.path.join
parser = argparse.ArgumentParser(description='Get count table from annotated fragment.')

parser.add_argument('-t', '--target',
                    type     = str,
                    required = True,
                    help     = 'Target region')

parser.add_argument("-c", "--core",
                    type    = int,
                    default = 1,
                    help    = "Number of cores (Default: 1)")

parser.add_argument("-o", "--outdir",
                    type     = str,
                    required = True,
                    help     = "Output file directory")

parser.add_argument("-s", "--score",
                    type    = float,
                    default = 0.0,
                    help    = "Threshold of motif score (Default: 0)")

parser.add_argument("-v", "--verbose",
                    action = "store_true",
                    help   = "Set output verbosity")

###################################################
# Set global variables
###################################################

### read the command line and expose the settings as module constants
args            = parser.parse_args()
TARGET          = args.target
N_CORE          = args.core
THRESHOLD_MOTIF = args.score
VERBOSE         = args.verbose
FD_OUT          = os.path.join(FD_RES, 'model_linear', args.outdir, TARGET)

### make sure the output directory exists
os.makedirs(FD_OUT, exist_ok=True)

### sample names: Input1-5, Input1-5_20x, TFX2-5_DMSO, TFX2-5_Dex
rep_input = np.arange(1, 6).astype(str)
rep_tfx   = np.arange(2, 6).astype(str)
SAMPLES = np.concatenate([
    np.char.add("Input", rep_input),
    np.char.add(np.char.add("Input", rep_input), "_20x"),
    np.char.add(np.char.add("TFX",   rep_tfx),   "_DMSO"),
    np.char.add(np.char.add("TFX",   rep_tfx),   "_Dex"),
])

### all motif file names, listed from the annotation directory of one sample
sam    = "Input1_20x"
fpath  = os.path.join(FD_RES, "annotation_fragment", sam, TARGET, "*_merge.bed.gz")
MOTIFS = np.sort(list(map(os.path.basename, glob(fpath))))

### PRINT: summary of the settings
for label, value in [
    ("Target:                  ", TARGET),
    ("Outdir:                  ", FD_OUT),
    ("N_Core:                  ", N_CORE),
    ("Threshold of motif score:", THRESHOLD_MOTIF),
    ("Is Verbose:              ", VERBOSE),
]:
    print(label, value)

###################################################
# Define functions
###################################################

def get_count_table(motif, verbose = True):
    """Create the count table of one motif across all samples.

    For every sample in SAMPLES the gzipped, tab-separated annotation file
    <FD_RES>/annotation_fragment/<sample>/<TARGET>/<motif> is read. A line is
    kept when the overlap (column 10) is at least the motif length
    (End_MTF - Start_MTF, columns 7 and 6) and the motif score (column 9) is
    at least THRESHOLD_MOTIF. The kept lines are grouped per fragment to get
    the number of motif hits on each fragment (N_Motif), and the fragment
    counts (Count_Frag) are then summed per (Sample, Motif, N_Motif).

    The result is written to <FD_OUT>/count_<motif name>.tsv: the first
    non-empty sample creates (overwrites) the file with a header, later
    samples are appended without a header. Samples without any kept line
    are skipped.

    Parameters
    ----------
    motif : str
        File name of the annotation file; the suffix "_merge.bed.gz" is
        removed to get the motif name used in messages and the output name.
    verbose : bool
        If True, print Import / Filter / Count / Skip progress messages.
        The Store_Create / Store_Append messages are always printed.

    Returns
    -------
    None
    """

    ### INIT
    mtf = motif.replace("_merge.bed.gz", "")
    is_created = False

    ### column layout of the annotation file (identical for all samples)
    ### NOTE: builtin str/int/float are used because the aliases
    ###       np.str/np.int/np.float were removed in NumPy 1.24
    CNAMES = ["Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
              "Chrom_MTF",  "Start_MTF",  "End_MTF",
              "Motif",      "Score",      "Overlap"]
    CTYPES = [str, int,   int, int,
              str, int,   int,
              str, float, int]

    ### 0-based column indices: Start_MTF, End_MTF, Score, Overlap
    idx_start, idx_end, idx_score, idx_lap = 5, 6, 8, 9

    ### SET: file path for output count table (one file shared by all samples)
    fpath_out = os.path.join(FD_OUT, "count_" + mtf + ".tsv")

    ### for each sample, import annotated fragment and generate count table
    for sam in SAMPLES:

        ###################################################
        ### Import & filtering fragment annotation
        ###################################################

        ### set input file path
        fpath = os.path.join(FD_RES, "annotation_fragment", sam, TARGET, motif)
        if verbose:
            print(mtf, sam, "Import", fpath, flush=True)

        ### INIT
        lst    = []
        n_line = 0 # HANDLE EXCEPTION: Empty file (loop body never runs)

        ### import data
        with gzip.open(fpath, 'rb') as file:
            for n_line, line in enumerate(file, start=1):

                ### preprocess each line
                line = line.decode('utf-8').strip().split("\t")

                ### extract needed values
                len_mtf = int(line[idx_end]) - int(line[idx_start])
                len_lap = int(line[idx_lap])
                score   = float(line[idx_score])

                ### filtering motifs scores and make sure
                ### fragments cover the full motif
                if (len_mtf <= len_lap) and (score >= THRESHOLD_MOTIF):
                    lst.append(line)

        ### message format: <number of lines read>-<number of lines kept>
        if verbose:
            print(mtf, sam, "Filter", str(n_line) + "-" + str(len(lst)), flush=True)

        ### HANDLE EXCEPTION: Empty data
        if len(lst) == 0:
            if verbose:
                print(mtf, sam, "Skip Empty", flush=True)
            continue

        ###################################################
        ### summarize the annotation and create count table
        ###################################################

        ### wrap up a list of lines into a dataframe
        dat = pd.DataFrame(lst, columns=CNAMES) \
            .astype(dict(zip(CNAMES, CTYPES))) \
            .assign(Sample = sam)

        ### count the number of motifs for each fragment
        dat = dat  \
            .groupby(["Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Motif", "Sample"]) \
            .size() \
            .reset_index(name='N_Motif')

        ### Summarize into count table
        ### (the string "sum" selects the pandas reduction; passing the
        ###  builtin sum triggers a FutureWarning on recent pandas)
        dat = dat \
            .groupby(["Sample", "Motif", "N_Motif"]) \
            .agg(Value=('Count_Frag', 'sum')) \
            .reset_index()

        if verbose:
            print(mtf, sam, "Count", dat["Value"].sum(), flush=True)

        ###################################################
        ### Store the count table
        ###################################################

        ### store results
        ### create the table for the first sample or if the file is not yet created
        ### Otherwise, append the counts in the file
        if is_created:

            ### PRINT: file path for output count
            print(mtf, sam, "Store_Append", fpath_out, flush=True)

            ### append the file
            dat.to_csv(fpath_out, sep='\t', index=False, mode='a', header=False)

        else:

            ### PRINT: file path for output count
            print(mtf, sam, "Store_Create", fpath_out, flush=True)

            ### create the file
            dat.to_csv(fpath_out, sep='\t', index=False, mode='w')

            ### update flag
            is_created = True
    

###################################################
# Get motif count table for marginal analysis
###################################################
### the guard keeps worker processes from re-running the analysis when the
### module is imported by multiprocessing (start method "spawn")
if __name__ == "__main__":
    print("\n++++++++++ Start Analysis ++++++++++")

    ### INIT: timer
    tic = time.time()

    ### get count table and store: one task per motif file
    ### - VERBOSE is forwarded so that the --verbose flag takes effect
    ### - starmap blocks until all tasks are done; the context manager then
    ###   shuts down the worker processes
    with multiprocessing.Pool(N_CORE) as pool:
        pool.starmap(get_count_table, [(motif, VERBOSE) for motif in MOTIFS])

    ### PRINT: end message
    toc = time.time()
    print("Script Done!")
    print("Time Elapse:" , str(timedelta(seconds = toc - tic)))

EOF

In [8]:
### submit the PER1 test run; the job script is passed to sbatch on stdin
sbatch \
    --partition=${NODE} \
    --job-name='Marginal count PER1 f00' \
    --output=${FD_LOG}/test.txt \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --mem=8G \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### run settings
### NOTE(review): 16 CPUs are requested above but only NCORE=10 workers are
###               started -- confirm the headroom is intended
TARGET=target_PER1
FDIRY=example            # sub-directory of model_linear for the output
NCORE=10
THRESHOLD_MOTIF=0

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the count script through the project wrapper
srun ${FD_PRJ}/sing_proj_combeffect.sh python motif_marginal_count.py \
    --target ${TARGET} \
    --outdir ${FDIRY} \
    --score  ${THRESHOLD_MOTIF} \
    --core   ${NCORE} \
    --verbose
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 26651261


Compare efficiency: wall-clock run time on `target_PER1`, R script vs. Python script
* Rscript

```
${FD_LOG}/linear_model_marginal_count_per1_filter00.txt 
Timer of the loop:
Time difference of 3.560382 mins

Done!
Run Time: 3 minutes and 37 seconds


Timer of the loop:
Time difference of 1.032754 mins

Done!
Run Time: 1 minutes and 6 seconds
```

* Python

```
Target:                   target_PER1
Outdir:                   /home/mount/work/out/proj_combeffect/model_linear/example/target_PER1
N_Core:                   1
Threshold of motif score: 0.0
Is Verbose:               True

> Script Done!
> Time Elapse: 0:03:09.384503 
$ Done!
$ Run Time: 3 minutes and 12 seconds

Target:                   target_PER1
Outdir:                   /home/mount/work/out/proj_combeffect/model_linear/example/target_PER1
N_Core:                   10
Threshold of motif score: 0.0
Is Verbose:               True

> Script Done!
> Time Elapse: 0:00:28.825956
$ Done!
$ Run Time: 31 seconds
```

In [9]:
head -20 ${FD_LOG}/test.txt

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect

Slurm Array Index: 
Time Stamp:         10-24-21+16:19:22

Target:                   target_PER1
Outdir:                   /home/mount/work/out/proj_combeffect/model_linear/example/target_PER1
N_Core:                   10
Threshold of motif score: 0.0
Is Verbose:               True

++++++++++ Start Analysis ++++++++++
AHR Input1 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input1/target_PER1/AHR_merge.bed.gz
BCL6_1 Input1 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input1/target_PER1/BCL6_1_merge.bed.gz


In [4]:
tail ${FD_LOG}/test.txt

ZSCAN4 TFX4_Dex Store_Append /home/mount/work/out/proj_combeffect/model_linear/example/target_PER1/count_ZSCAN4.tsv
ZSCAN4 TFX5_Dex Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX5_Dex/target_PER1/ZSCAN4_merge.bed.gz
ZSCAN4 TFX5_Dex Filter 195-178
ZSCAN4 TFX5_Dex Count 204
ZSCAN4 TFX5_Dex Store_Append /home/mount/work/out/proj_combeffect/model_linear/example/target_PER1/count_ZSCAN4.tsv
Script Done!
Time Elapse: 0:03:09.384503 

Done!
Run Time: 3 minutes and 12 seconds


## Test Chromosome 22

In [10]:
### submit the chr22 test run; the job script is passed to sbatch on stdin
sbatch \
    --partition=${NODE} \
    --job-name='Marginal count chr22 f95' \
    --output=${FD_LOG}/linear_model_marginal_count_chr22_filter95.txt \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --mem=20G \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### run settings
### NOTE(review): 16 CPUs are requested above but only NCORE=10 workers are
###               started -- confirm the headroom is intended
TARGET=chr22
FDIRY=example            # sub-directory of model_linear for the output
NCORE=10
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the count script through the project wrapper
srun ${FD_PRJ}/sing_proj_combeffect.sh python motif_marginal_count.py \
    --target ${TARGET} \
    --outdir ${FDIRY} \
    --score  ${THRESHOLD_MOTIF} \
    --core   ${NCORE} \
    --verbose
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 26651263


## Test Chromosome X

In [11]:
### submit the chrX test run; the job script is passed to sbatch on stdin
sbatch \
    --partition=${NODE} \
    --job-name='Marginal count chrX f95' \
    --output=${FD_LOG}/linear_model_marginal_count_chrX_filter95.txt \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --mem=50G \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### run settings
### NOTE(review): 16 CPUs are requested above but only NCORE=10 workers are
###               started -- confirm the headroom is intended
TARGET=chrX
FDIRY=example            # sub-directory of model_linear for the output
NCORE=10
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the count script through the project wrapper
srun ${FD_PRJ}/sing_proj_combeffect.sh python motif_marginal_count.py \
    --target ${TARGET} \
    --outdir ${FDIRY} \
    --score  ${THRESHOLD_MOTIF} \
    --core   ${NCORE} \
    --verbose
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 26651267


```
Target:                   chrX
Outdir:                   /home/mount/work/out/proj_combeffect/model_linear/example/chrX
N_Core:                   10
Threshold of motif score: 10.81
Is Verbose:               True

> Timer of the loop:
> Time difference of 1.077125 hours
$ Done!
$ Run Time: 1 hours 4 minutes and 44 seconds
```

## Run all chromosomes

In [14]:
### submit one array task per chromosome (indices 0-23 -> chr1..chr22, chrX, chrY);
### each task writes its own log file (%a = array index)
sbatch \
    --partition=${NODE} \
    --array=0-23 \
    --job-name='Marginal count chrom f95' \
    --output=${FD_LOG}/linear_model_marginal_count_chrom_filter95.%a.txt \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --mem=50G \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### pick the chromosome of this array task
CHROMS=({1..22} X Y)
CHROM=chr${CHROMS[${SLURM_ARRAY_TASK_ID}]}

### run settings
### NOTE(review): 16 CPUs are requested above but only NCORE=10 workers are
###               started -- confirm the headroom is intended
TARGET=${CHROM}
FDIRY=marginal_filter95  # sub-directory of model_linear for the output
NCORE=10
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo "Chromosome:        " ${CHROM}
echo

### run the count script through the project wrapper
srun ${FD_PRJ}/sing_proj_combeffect.sh python motif_marginal_count.py \
    --target ${TARGET} \
    --outdir ${FDIRY} \
    --score  ${THRESHOLD_MOTIF} \
    --core   ${NCORE} \
    --verbose
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 26651294


In [16]:
tail ${FD_LOG}/linear_model_marginal_count_chrom_filter95.*.txt

==> /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/log/linear_model_marginal_count_chrom_filter95.0.txt <==
ZNF382 TFX4_Dex Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr1/count_ZNF382.tsv
ZNF382 TFX5_Dex Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX5_Dex/chr1/ZNF382_merge.bed.gz
ZNF382 TFX5_Dex Filter 961383-36684
ZNF382 TFX5_Dex Count 44254
ZNF382 TFX5_Dex Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr1/count_ZNF382.tsv
Script Done!
Time Elapse: 4:26:50.615116

Done!
Run Time: 4 hours 26 minutes and 55 seconds

==> /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/log/linear_model_marginal_count_chrom_filter95.10.txt <==
ZNF382 TFX4_Dex Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr11/count_ZNF382.tsv
ZNF382 TFX5_Dex Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX5_Dex/chr11/ZNF382_merge.bed.gz
ZNF382 TFX5_Dex Filter 584795-20796