#### Define candidate enhancer regions

In [2]:
%%bash
mkdir -p /data/reddylab/Revathy/dev/Maria/ABC
source /data/reddylab/software/miniconda3/bin/activate jung_py3
module load bedtools2
abc=/data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction
abc_dir=/data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction
in_dir=/data/reddylab/Jungkyun/MS_maria/results/processing/atac_seq/Ciofani_5484_190307A5_MCS226-233_258_259-pe-blacklist-removal
out_dir=/data/reddylab/Revathy/dev/Maria/ABC
list=list_of_samples.txt
samples=$(cat /data/reddylab/Revathy/dev/Maria/ABC/$list|tr '\n' ' ')
# Need hg38 version of whitelist and gene list in bed format

#hg38_RefSeqCurated.170308.bed.CollapsedGeneBounds.bed           
#hg38_wgEncodeHg19ConsensusSignalArtifactRegions.bed
#hg38_RefSeqCurated.170308.bed.CollapsedGeneBounds.TSS500bp.bed
################################################################################################################
# 1. Count DNase-seq reads in each peak and retain the top N peaks with the most read counts
################################################################################################################

task=candidate_region_by_dnase
mkdir -p /data/reddylab/Revathy/dev/Maria/ABC/${task}
mkdir -p /data/reddylab/Revathy/dev/Maria/ABC/logs
for dnase in ${samples} ;
do
python ${abc}/src/makeCandidateRegions.py \
--narrowPeak ${in_dir}/${dnase}.masked.dedup.sorted_peaks.narrowPeak \
--bam ${in_dir}/${dnase}.masked.dups_marked.bam \
--outDir /data/reddylab/Revathy/dev/Maria/ABC/${task}/ \
--chrom_sizes ${abc_dir}/reference/chr_sizes \
--regions_blacklist /data/reddylab/Revathy/dev/Maria/hg38_wgEncodeHg19ConsensusSignalArtifactRegions.bed \
--regions_whitelist /data/reddylab/Revathy/dev/Maria/hg38_RefSeqCurated.170308.bed.CollapsedGeneBounds.TSS500bp.bed \
--peakExtendFromSummit 250 \
--nStrongestPeaks 150000
done

Running: awk 'FNR==NR {x2[$1] = $0; next} $1 in x2 {print x2[$1]}' /data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction/reference/chr_sizes <(samtools view -H /data/reddylab/Jungkyun/MS_maria/results/processing/atac_seq/Ciofani_5484_190307A5_MCS226-233_258_259-pe-blacklist-removal/Th01_1.masked.dups_marked.bam | grep SQ | cut -f 2 | cut -c 4- )  > /data/reddylab/Revathy/dev/Maria/ABC/candidate_region_by_dnase/Th01_1.masked.dedup.sorted_peaks.narrowPeak.Th01_1.masked.dups_marked.bam.Counts.bed.temp_sort_order
Running: bedtools sort -faidx /data/reddylab/Revathy/dev/Maria/ABC/candidate_region_by_dnase/Th01_1.masked.dedup.sorted_peaks.narrowPeak.Th01_1.masked.dups_marked.bam.Counts.bed.temp_sort_order -i /data/reddylab/Jungkyun/MS_maria/results/processing/atac_seq/Ciofani_5484_190307A5_MCS226-233_258_259-pe-blacklist-removal/Th01_1.masked.dedup.sorted_peaks.narrowPeak | bedtools coverage -g /data/reddylab/Revathy/dev/Maria/ABC/candidate_region_by_dnase/Th01_1.masked.dedup.sor

In [3]:
%%bash
cd /data/reddylab/Revathy/dev/Maria/ABC/data
cp *.bam *.bam.bai /data/reddylab/Jungkyun/MS_maria/results/processing/chip_seq/CIOFANI_5481_190305B1_MCS234-249-se

In [None]:
%%bash
cd /data/reddylab/Revathy/dev/Maria/ABC/data
for file in *.bam ; 
do 
mv "$file" "${file//_input_1/_1}"
mv "$file" "${file//_input_2/_2}"
done

#### Quantify enhancer activity

In [5]:
%%bash
task=enhancer_acitivity
mkdir -p /gpfs/fs1/data/reddylab/Revathy/dev/Maria/ABC/${task}
source /data/reddylab/software/miniconda3/bin/activate jung_py3
module load bedtools2
module load samtools
H3K27ac_dir=/data/reddylab/Revathy/dev/Maria/ABC/data
dnase_dir=/data/reddylab/Jungkyun/MS_maria/results/processing/atac_seq/Ciofani_5484_190307A5_MCS226-233_258_259-pe-blacklist-removal
abc=/data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction
abc_dir=/data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction
gene_ex_dir=/data/reddylab/Revathy/dev/Maria/ABC/data/rna_seq
LIST=sample_names.txt
SAMPLES=$(cat /data/reddylab/Revathy/dev/Maria/ABC/$LIST|tr '\n' ' ')

for sample in ${SAMPLES};
do

idx=$(echo ${sample}|cut -d "_" -f1)
echo ${idx}
mkdir -p /gpfs/fs1/data/reddylab/Revathy/dev/Maria/ABC/${task}/${idx}_abc

python ${abc}/src/run.neighborhoods.py \
--candidate_enhancer_regions /data/reddylab/Revathy/dev/Maria/ABC/candidate_region_by_dnase/${sample}.masked.dedup.sorted_peaks.narrowPeak.candidateRegions.bed \
--genes ${abc_dir}/reference/RefSeqCurated.170308.bed.CollapsedGeneBounds.bed \
--H3K27ac ${H3K27ac_dir}/${sample}.masked.dedup.sorted.bam \
--DHS ${dnase_dir}/${sample}.masked.dups_marked.bam \
--expression_table /data/reddylab/Revathy/dev/Maria/ABC/data/rna_seq/${idx}.TPM_Expr.txt \
--chrom_sizes ${abc_dir}/reference/chr_sizes \
--ubiquitously_expressed_genes ${abc_dir}/reference/UbiquitouslyExpressedGenesHG19.txt \
--cellType ${idx}_abc \
--outdir /gpfs/fs1/data/reddylab/Revathy/dev/Maria/ABC/${task}/${idx}_abc 

done



Th01
Namespace(ATAC='', DHS='/data/reddylab/Jungkyun/MS_maria/results/processing/atac_seq/Ciofani_5484_190307A5_MCS226-233_258_259-pe-blacklist-removal/Th01_1.masked.dups_marked.bam', H3K27ac='/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam', candidate_enhancer_regions='/data/reddylab/Revathy/dev/Maria/ABC/candidate_region_by_dnase/Th01_1.masked.dedup.sorted_peaks.narrowPeak.candidateRegions.bed', cellType='Th01_abc', chrom_sizes='/data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction/reference/chr_sizes', default_accessibility_feature=None, enhancer_class_override=None, expression_table='/data/reddylab/Revathy/dev/Maria/ABC/data/rna_seq/Th01.TPM_Expr.txt', gene_name_annotations='symbol', genes='/data/reddylab/Revathy/dev/Maria/ABC/ABC-Enhancer-Gene-Prediction/reference/RefSeqCurated.170308.bed.CollapsedGeneBounds.bed', genes_for_class_assignment=None, outdir='/gpfs/fs1/data/reddylab/Revathy/dev/Maria/ABC/enhancer_acitivity/Th01_abc', primary_gene_id

[E::idx_find_and_load] Could not retrieve index file for '/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam'
samtools idxstats: fail to load index for "/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam", reverting to slow method
[E::idx_find_and_load] Could not retrieve index file for '/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam'
samtools idxstats: fail to load index for "/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam", reverting to slow method
[E::idx_find_and_load] Could not retrieve index file for '/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam'
samtools idxstats: fail to load index for "/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_1.masked.dedup.sorted.bam", reverting to slow method
[E::idx_find_and_load] Could not retrieve index file for '/data/reddylab/Revathy/dev/Maria/ABC/data/Th01_2.masked.dedup.sorted.bam'
samtools idxstats: fail to load index for "

HiC files are pre-processed using Juicer, can be found in /data/reddylab/Revathy/dev/Maria/HiC_preprocessing.ipynb

#### Compute ABC scores

In [5]:
%%bash
source /data/reddylab/software/miniconda3/bin/activate jung_py3
ABC=/gpfs/fs1/data/reddylab/Revathy/collabs/Maria/ABC/ABC-Enhancer-Gene-Prediction
task=ABC_score
mkdir -p /gpfs/fs1/data/reddylab/Revathy/collabs/Maria/ABC/${task}
Enhancer_activity=/gpfs/fs1/data/reddylab/Revathy/collabs/Maria/ABC/enhancer_acitivity
hic=/gpfs/fs1/data/reddylab/Revathy/collabs/Maria/hic

cd ${Enhancer_activity}

threshold=02
LIST=sample_names.txt
SAMPLES=$(cat /data/reddylab/Revathy/dev/Maria/ABC/$LIST|tr '\n' ' ')

for sample in $(echo Th01 Th17);
do
python ${ABC}/src/predict.py \
--enhancers ${Enhancer_activity}/${sample}_abc/EnhancerList.txt \
--genes ${Enhancer_activity}/${sample}_abc/GeneList.txt \
--HiCdir ${hic}/${sample} \
--hic_resolution 5000 \
--scale_hic_using_powerlaw \
--threshold .${threshold} \
--cellType ${sample} \
--outdir /gpfs/fs1/data/reddylab/Revathy/collabs/Maria/ABC/${task}/${sample}_hic.threshold0.${threshold} \
--make_all_putative 

done


reading genes
reading enhancers
Making predictions for chromosome: chr22
Making putative predictions table...
Using: /gpfs/fs1/data/reddylab/Revathy/dev/Maria/hic/Th01/chr22/chr22.KRobserved.gz
Begin HiC
Loading HiC
hic.to.sparse: Elapsed time: 0.9547169208526611
HiC Matrix has row sums of 1669.5396348412314, making doubly stochastic...
HiC has 1399603 rows after windowing between 0 and 5000000
process.hic: Elapsed time: 2.2869439125061035
HiC added to predictions table. Elapsed time: 0.37697601318359375
HiC Complete
Completed chromosome: chr22. Elapsed time: 4.167795658111572 

Making predictions for chromosome: chr1
Making putative predictions table...
Using: /gpfs/fs1/data/reddylab/Revathy/dev/Maria/hic/Th01/chr1/chr1.KRobserved.gz
Begin HiC
Loading HiC
hic.to.sparse: Elapsed time: 8.680987119674683
HiC Matrix has row sums of 1821.457072176637, making doubly stochastic...
HiC has 9718374 rows after windowing between 0 and 5000000
process.hic: Elapsed time: 18.089518070220947
HiC add

#### Create ABC enhancer beds

In [1]:
%%bash
module load bedtools2
task=enhancers_by_ABC_bed
mkdir -p /data/reddylab/Revathy/dev/Maria/ABC/${task}
in_dir=/data/reddylab/Revathy/dev/Maria/ABC/ABC_score

cd ${in_dir}

for i in `ls -d *`;
do 
cat ${in_dir}/${i}/EnhancerPredictions.txt | \
cut -f 1-3,5,7|grep -v end > \
/data/reddylab/Revathy/dev/Maria/ABC/${task}/${i}.bed
done


In [4]:
%%bash

wc -l /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th01_hic.threshold0.008/EnhancerPredictions.txt
wc -l /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th17_hic.threshold0.008/EnhancerPredictions.txt

598211 /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th01_hic.threshold0.008/EnhancerPredictions.txt
589871 /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th17_hic.threshold0.008/EnhancerPredictions.txt


In [10]:
%%bash

wc -l /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th01_hic.threshold0.02/EnhancerPredictions.txt
wc -l /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th17_hic.threshold0.02/EnhancerPredictions.txt

195312 /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th01_hic.threshold0.02/EnhancerPredictions.txt
203522 /data/reddylab/Revathy/dev/Maria/ABC/ABC_score/Th17_hic.threshold0.02/EnhancerPredictions.txt
