In [1]:
### source the shared config script quietly (messages and warnings are muted),
### then print the environment / path summary
suppressWarnings(suppressMessages(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY:     /mount/work 
PATH OF SOURCE:     /mount/work/source 
PATH OF EXECUTABLE: /mount/work/exe 
PATH OF ANNOTATION: /mount/work/annotation 
PATH OF PROJECT:    /mount/project 
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc 


## Annotation: cCREs

In [2]:
### set file path
fdiry = file.path(FD_RES, "region", "KS91_K562_astarrseq_peak_macs")
fname = "KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.ccre_all.bed.gz"
fpath = file.path(fdiry, fname)

### import data (headerless TSV; one row per peak x cCRE pair)
### columns 1-3 are the peak interval, the *_CCRE columns the annotation interval
### NOTE(review): layout looks like an interval-intersection output with the
### overlap width in the last column -- confirm against the upstream script
cnames = c("Chrom", "Start", "End",
           "Chrom_CCRE", "Start_CCRE", "End_CCRE", "Name", "Score", "Strand", 
           "Value1", "Value2", "Value3", "Label1", "Label2",
           "Overlap")
dat = read_tsv(fpath, col_names = cnames, show_col_types = FALSE)

### derive the peak identifier and the peak length
### Start/End are parsed as doubles, and paste() / as.character() would write
### round coordinates in scientific notation (e.g. 100000 -> "1e+05");
### sprintf("%.0f") always emits the plain integer form
dat = dat %>% mutate(
    Peak = sprintf("%s_%.0f_%.0f", Chrom, Start, End),
    Len  = End - Start)

### assign
dat_ccre = dat
print(dim(dat))
head(dat)

[1] 165907     17


Chrom,Start,End,Chrom_CCRE,Start_CCRE,End_CCRE,Name,Score,Strand,Value1,Value2,Value3,Label1,Label2,Overlap,Peak,Len
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>
chr1,180982,182087,chr1,181251,181601,EH38E1310153,0,.,181251,181601,6218147,DNase-only,All-data/Full-classification,350,chr1_180982_182087,1105
chr1,777949,779437,chr1,778562,778912,EH38E1310158,0,.,778562,778912,25500,"PLS,CTCF-bound",All-data/Full-classification,350,chr1_777949_779437,1488
chr1,777949,779437,chr1,779086,779355,EH38E1310159,0,.,779086,779355,25500,PLS,All-data/Full-classification,269,chr1_777949_779437,1488
chr1,816774,817547,chr1,817080,817403,EH38E1310166,0,.,817080,817403,225225225,Low-DNase,All-data/Full-classification,323,chr1_816774_817547,773
chr1,818602,819380,chr1,818718,818872,EH38E1310168,0,.,818718,818872,225225225,Low-DNase,All-data/Full-classification,154,chr1_818602_819380,778
chr1,819732,820291,chr1,819893,820227,EH38E1310170,0,.,819893,820227,225225225,Low-DNase,All-data/Full-classification,334,chr1_819732_820291,559


## Annotation: ReMap

In [3]:
### set file path
fdiry = file.path(FD_RES, "region", "KS91_K562_astarrseq_peak_macs")
fname = "KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.remap2022_nr.bed.gz"
fpath = file.path(fdiry, fname)

### import data (headerless TSV; one row per peak x ReMap record pair)
### columns 1-3 are the peak interval, the *_ReMap columns the annotation interval
### NOTE(review): layout looks like an interval-intersection output with the
### overlap width in the last column -- confirm against the upstream script
cnames = c("Chrom", "Start", "End",
           "Chrom_ReMap", "Start_ReMap", "End_ReMap", "Name", "Score", "Strand", 
           "Value1", "Value2", "Value3",
           "Overlap")
dat = read_tsv(fpath, col_names = cnames, show_col_types = FALSE)

### derive the peak identifier and the peak length
### Start/End are parsed as doubles, and paste() / as.character() would write
### round coordinates in scientific notation (e.g. 100000 -> "1e+05");
### sprintf("%.0f") always emits the plain integer form
dat = dat %>% mutate(
    Peak = sprintf("%s_%.0f_%.0f", Chrom, Start, End),
    Len  = End - Start)

### assign
dat_remap = dat
print(dim(dat))
head(dat)

[1] 5520842      15


Chrom,Start,End,Chrom_ReMap,Start_ReMap,End_ReMap,Name,Score,Strand,Value1,Value2,Value3,Overlap,Peak,Len
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
chr1,10015,10442,chr1,10016,10229,ZBTB5:K-562,1,.,10108,10109,215958,213,chr1_10015_10442,427
chr1,136071,137429,chr1,136239,136897,ZBTB33:K-562,1,.,136734,136735,252224252,658,chr1_136071_137429,1358
chr1,136071,137429,chr1,136277,137062,EGR1:K-562,2,.,136757,136758,56224140,785,chr1_136071_137429,1358
chr1,136071,137429,chr1,136281,137167,ZNF639:K-562,1,.,136754,136755,224112112,886,chr1_136071_137429,1358
chr1,136071,137429,chr1,136332,136968,TRIM24:K-562,2,.,136803,136804,25256252,636,chr1_136071_137429,1358
chr1,136071,137429,chr1,136340,137098,NR2C1:K-562,2,.,136788,136789,4125089,758,chr1_136071_137429,1358


## Normalize counts to frequencies

In [6]:
### count cCRE rows per (peak, label)
### Label1 is used as-is, so a combined value such as "PLS,CTCF-bound"
### is counted as its own category
dat = dat_ccre %>% 
    dplyr::rename(Label = Label1) %>%
    group_by(Peak, Chrom, Start, End, Len, Label) %>% 
    summarize(Count = n(), .groups="drop")

### normalize within each peak: Total = all cCRE rows of the peak,
### Freq = share of the peak's rows carrying this label
### ungroup() so the per-peak grouping does not leak into the stored table
dat = dat %>%
    group_by(Peak) %>%
    mutate(Total = sum(Count)) %>%
    mutate(Freq  = Count / Total) %>%
    ungroup()

### assign
dat_freq_ccre = dat
print(dim(dat))
head(dat)

[1] 120179      9


Peak,Chrom,Start,End,Len,Label,Count,Total,Freq
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<int>,<dbl>
chr1_100027983_100029702,chr1,100027983,100029702,1719,Low-DNase,1,1,1.0
chr1_100036871_100039191,chr1,100036871,100039191,2320,Low-DNase,2,6,0.3333333
chr1_100036871_100039191,chr1,100036871,100039191,2320,pELS,3,6,0.5
chr1_100036871_100039191,chr1,100036871,100039191,2320,PLS,1,6,0.1666667
chr1_100048234_100048985,chr1,100048234,100048985,751,Low-DNase,1,1,1.0
chr1_100049562_100050540,chr1,100049562,100050540,978,Low-DNase,1,1,1.0


In [7]:
### count ReMap rows per (peak, Name)
dat = dat_remap %>% 
    group_by(Peak, Chrom, Start, End, Len, Name) %>% 
    summarize(Count = n(), .groups="drop")

### normalize within each peak: Total = all ReMap rows of the peak,
### Freq = share of the peak's rows carrying this Name
### ungroup() so the per-peak grouping does not leak into the stored table
dat = dat %>%
    group_by(Peak) %>%
    mutate(Total = sum(Count)) %>%
    mutate(Freq  = Count / Total) %>%
    ungroup()

### assign
dat_freq_remap = dat
print(dim(dat))
head(dat)

[1] 5162105       9


Peak,Chrom,Start,End,Len,Name,Count,Total,Freq
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<int>,<dbl>
chr1_100006256_100006880,chr1,100006256,100006880,624,TAF15:K-562,1,1,1.0
chr1_10002087_10003910,chr1,10002087,10003910,1823,CBX2:K-562,1,6,0.1666667
chr1_10002087_10003910,chr1,10002087,10003910,1823,CBX8:K-562,1,6,0.1666667
chr1_10002087_10003910,chr1,10002087,10003910,1823,"CEBPB:MCF-7,K-562,Hep-G2",1,6,0.1666667
chr1_10002087_10003910,chr1,10002087,10003910,1823,ETS1:K-562,1,6,0.1666667
chr1_10002087_10003910,chr1,10002087,10003910,1823,HINFP:K-562,1,6,0.1666667


## Store the tables

In [8]:
### write the per-peak cCRE label frequency table as TSV
fdiry = file.path(FD_RES, "region", "KS91_K562_astarrseq_peak_macs")
fname = "KS91_K562_hg38_ASTARRseq_Input.ccre_all.freq.tsv"
fpath = file.path(fdiry, fname)

write_tsv(x = dat_freq_ccre, file = fpath)

In [9]:
### write the per-peak ReMap frequency table as TSV
### NOTE(review): this file name has no ".freq" suffix, unlike the cCRE
### output ("...ccre_all.freq.tsv") -- confirm whether that is intentional
fdiry = file.path(FD_RES, "region", "KS91_K562_astarrseq_peak_macs")
fname = "KS91_K562_hg38_ASTARRseq_Input.remap2022_nr.tsv"
fpath = file.path(fdiry, fname)

write_tsv(x = dat_freq_remap, file = fpath)