## Set environment

In [1]:
### load the project configuration with messages and warnings silenced,
### then print the environment summary
suppressWarnings(suppressMessages(
    source("../config/config_sing.R")
))
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Check data

In [3]:
### sub-folder name under results/region; later cells use it to build the annotation paths
FOLDER = "annotation_chromHMM"

In [4]:
### list the files available in the annotation folder
fdiry = file.path(FD_RES, "results", "region", FOLDER)
for (fname in list.files(fdiry)) {
    print(fname)
}

[1] "ChIP_ENCSR000EWA-ENCSR000AKP-ENCSR000EWC-ENCSR000DWB-ENCSR000EWB-ENCSR000APE.bed"
[1] "description.tsv"
[1] "K562.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz"


In [5]:
### list the files under the KS91_K562_ASTARRseq_peak_macs_input annotation folder
fdiry = file.path(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input", FOLDER)
for (fname in list.files(fdiry)) {
    print(fname)
}

[1] "peak.annotation.ChIP_ENCSR000EWA-ENCSR000AKP-ENCSR000EWC-ENCSR000DWB-ENCSR000EWB-ENCSR000APE.bed.gz"
[1] "peak.annotation.K562.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz"


## Import data

In [6]:
### init: set file path of the column description table
fname = "description.tsv"
fdiry = file.path(
    FD_RES,
    "results",
    "region",
    FOLDER)
fpath = file.path(fdiry, fname)

### read the table; its Name column supplies column names in a later cell
dat_cnames = read_tsv(fpath, show_col_types = FALSE)
dat = dat_cnames

### show
print(dim(dat_cnames))
dat_cnames

[1] 9 2


Name,Description
<chr>,<chr>
Chrom,Chromosome
Start,Start position
End,End position
Name,ChromHMM Label; 15 state ChIP-seq model
Score,Score
Strand,[+-.]; Use '.' if no strand is assigned.
ThickStart,The starting position at which the feature is drawn thickly
ThickEnd,The ending position at which the feature is drawn thickly
ItemRgb,"An RGB value of the form R,G,B (e.g. 255,0,0)"


In [9]:
### init: set column names
### (three peak columns, the columns listed in description.tsv, and a trailing Overlap column)
cnames = dat_cnames$Name
cnames = c("Chrom_ATAC", "Start_ATAC", "End_ATAC", cnames, "Overlap")

### init: set file path
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    FOLDER)
fname = "peak.annotation.K562.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz"
fpath = file.path(fdiry, fname)

### init: annotation and label
annotation = "ChromHMM"

### helper: format coordinates in fixed notation
### (coordinate columns are parsed as doubles; pasting them directly renders
###  round values in scientific notation, e.g. 100000 -> "1e+05", which would
###  corrupt the "chrom:start-end" strings built below)
fun_format_pos = function(x){
    format(x, scientific = FALSE, trim = TRUE)
}

### read data
dat = read_tsv(fpath, col_names = cnames, show_col_types = FALSE)

### add the standardized columns used by the summary step:
### Peak_ATAC / Region are "chrom:start-end" strings; Label and Note copy Name; Value copies Score
dat = dat %>% dplyr::mutate(
    Peak_ATAC  = paste0(Chrom_ATAC, ":", fun_format_pos(Start_ATAC), "-", fun_format_pos(End_ATAC)),
    Annotation = annotation,
    Label      = Name,
    Region     = paste0(Chrom, ":", fun_format_pos(Start), "-", fun_format_pos(End)),
    Value      = Score,
    Note       = Name
)

### assign and show
dat_peak_annot_import = dat
print(dim(dat))
head(dat)

[1] 271850     19


Chrom_ATAC,Start_ATAC,End_ATAC,Chrom,Start,End,Name,Score,Strand,ThickStart,ThickEnd,ItemRgb,Overlap,Peak_ATAC,Annotation,Label,Region,Value,Note
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
chr1,10015,10442,chr1,0,16000,Quies,1,.,0,16000,220220220,427,chr1:10015-10442,ChromHMM,Quies,chr1:0-16000,1,Quies
chr1,14253,14645,chr1,0,16000,Quies,1,.,0,16000,220220220,392,chr1:14253-14645,ChromHMM,Quies,chr1:0-16000,1,Quies
chr1,16015,16477,chr1,16000,16200,TxWk,1,.,16000,16200,6315480,185,chr1:16015-16477,ChromHMM,TxWk,chr1:16000-16200,1,TxWk
chr1,17237,17772,chr1,17400,17600,TxWk,1,.,17400,17600,6315480,200,chr1:17237-17772,ChromHMM,TxWk,chr1:17400-17600,1,TxWk
chr1,28903,29613,chr1,17600,118400,Quies,1,.,17600,118400,220220220,710,chr1:28903-29613,ChromHMM,Quies,chr1:17600-118400,1,Quies
chr1,30803,31072,chr1,17600,118400,Quies,1,.,17600,118400,220220220,269,chr1:30803-31072,ChromHMM,Quies,chr1:17600-118400,1,Quies


## Arrange and summarize

In [10]:
### summarize: one row per peak and label
###   Count  = number of annotation rows collapsed into the row
###   Region = annotated regions joined by "|"
###   Score  = mean of Value
###   Note   = notes joined by "|"
### then order by peak position and keep only the peak string as identifier
dat_peak_annot_summary = dat_peak_annot_import %>%
    dplyr::group_by(Chrom_ATAC, Start_ATAC, End_ATAC, Peak_ATAC, Annotation, Label) %>%
    dplyr::summarise(
        Count  = dplyr::n(),
        Region = paste(Region, collapse = "|"),
        Score  = mean(Value),
        Note   = paste(Note, collapse = "|"),
        .groups = "drop") %>%
    dplyr::arrange(Chrom_ATAC, Start_ATAC, End_ATAC) %>%
    dplyr::select(Peak = Peak_ATAC, Annotation, Label, Count, Region, Score, Note)

### assign and show
dat = dat_peak_annot_summary
print(dim(dat_peak_annot_summary))
head(dat_peak_annot_summary)

[1] 262805      7


Peak,Annotation,Label,Count,Region,Score,Note
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>
chr1:10015-10442,ChromHMM,Quies,1,chr1:0-16000,1,Quies
chr1:14253-14645,ChromHMM,Quies,1,chr1:0-16000,1,Quies
chr1:16015-16477,ChromHMM,TxWk,1,chr1:16000-16200,1,TxWk
chr1:17237-17772,ChromHMM,TxWk,1,chr1:17400-17600,1,TxWk
chr1:28903-29613,ChromHMM,Quies,1,chr1:17600-118400,1,Quies
chr1:30803-31072,ChromHMM,Quies,1,chr1:17600-118400,1,Quies


## Explore and check results

In [11]:
### for each label, tabulate how many peaks have each Count value
dat = dat_peak_annot_summary
lst = lapply(split(dat$Count, dat$Label), table)
lst

$Biv

    1     2     3     4 
12731   119     2     1 

$Enh1

   1    2    3 
8913  152    1 

$Enh2

   1    2    3 
8759  397   13 

$EnhG1

   1    2    3 
1668   29    1 

$EnhG2

   1    2    3 
5997   60    2 

$Het

   1 
2816 

$Quies

    1     2     3 
91225    14     1 

$ReprPC

    1     2     3 
50739    41     2 

$Tss

   1    2    3    4    5    6 
6654 1797  257   21    3    1 

$TssFlnk

   1    2    3 
3308  156    4 

$TssFlnkD

   1    2    3    4 
6137  590   28    1 

$TssFlnkU

   1    2    3    4    5 
7879 3260  476   47    8 

$Tx

    1     2 
19716    22 

$TxWk

    1     2     3     4 
27008   555     7     2 

$`ZNF/Rpts`

   1 
1185 


## Save results

In [12]:
### init: set file path of the output table
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "summary")
fname = "peak.summary.chromHMM.tsv"
fpath = file.path(fdiry, fname)

### make sure the output folder exists; write_tsv does not create missing folders
### (no effect when the folder is already there)
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### write the peak summary table
dat = dat_peak_annot_summary
write_tsv(dat, fpath)