**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



## Import data

In [2]:
txt_fdiry = file.path(
    FD_RES, 
    "region_annotation", 
    "fcc_astarr_macs_input_overlap",
    "summary"
)
txt_fname = "matrix.annotation.fcc_peak_call.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = read_tsv(txt_fpath, show_col_types = FALSE)

dat_region_annot_fcc = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 100454     15


Chrom,ChromStart,ChromEnd,Region,ASTARR_A,WSTARR_A,LMPRA_A,TMPRA_A,ASTARR_R,WSTARR_R,LMPRA_R,TMPRA_R,CRISPRi-HCRFF,CRISPRi-Growth,E2G-Benchmark
chr1,10038,10405,chr1:10038-10405,0,0,0,0,1,0,0,0,0,0,0
chr1,16025,16338,chr1:16025-16338,0,0,0,0,1,0,0,0,0,0,0
chr1,17288,17689,chr1:17288-17689,0,0,0,0,1,0,0,0,0,0,0


In [3]:
txt_fdiry = file.path(
    FD_RES, 
    "region_annotation", 
    "fcc_astarr_macs_input_overlap",
    "summary"
)
txt_fname = "region.summary.encode_e2g_benchmark.regulated.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = read_tsv(txt_fpath, show_col_types = FALSE)

dat_region_annot_e2g = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 3035    6


Chrom,ChromStart,ChromEnd,Region,Regulated_Ifany,Regulated_Count
chr1,3774056,3776283,chr1:3774056-3776283,True,3
chr1,3803955,3806146,chr1:3803955-3806146,False,0
chr1,4126841,4128109,chr1:4126841-4128109,False,0


In [5]:
txt_fdiry = file.path(
    FD_RES, 
    "region_nuc", 
    "fcc_astarr_macs",
    "summary"
)
txt_fname = "K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = read_tsv(txt_fpath, show_col_types = FALSE)

dat_region_nuc = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 150042      6


Chrom,ChromStart,ChromEnd,Region,pGC,Length
chr1,10038,10405,chr1:10038-10405,0.523161,367
chr1,14282,14614,chr1:14282-14614,0.578313,332
chr1,16025,16338,chr1:16025-16338,0.587859,313


## ASTARR and GC Content

In [6]:
tmp = dat_region_nuc
tmp = tmp  %>% 
    dplyr::mutate(High_GC = as.integer(pGC > 0.60)) %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, High_GC)

dat = dat_region_annot_fcc
dat = dat %>% dplyr::left_join(tmp, by = c("Chrom", "ChromStart", "ChromEnd"))
dat = dat %>% dplyr::mutate(ASTARR_R_GC = ifelse(High_GC == 1, 0, ASTARR_R))

dat_region_annot_fcc_pgc = dat
head(dat)

Chrom,ChromStart,ChromEnd,Region,ASTARR_A,WSTARR_A,LMPRA_A,TMPRA_A,ASTARR_R,WSTARR_R,LMPRA_R,TMPRA_R,CRISPRi-HCRFF,CRISPRi-Growth,High_GC,ASTARR_R_GC
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
chr1,10038,10405,chr1:10038-10405,0,0,0,0,1,0,0,0,0,0,0,1
chr1,16025,16338,chr1:16025-16338,0,0,0,0,1,0,0,0,0,0,0,1
chr1,17288,17689,chr1:17288-17689,0,0,0,0,1,0,0,0,0,0,1,0
chr1,28934,29499,chr1:28934-29499,0,0,0,0,1,0,0,0,0,0,1,0
chr1,115429,115969,chr1:115429-115969,1,0,0,0,0,0,0,0,0,0,0,0
chr1,136201,137353,chr1:136201-137353,0,0,0,0,1,0,0,0,0,0,1,0


## Enhancer vs Repressor

In [7]:
dat = dat_region_annot_fcc
dat = dat %>% dplyr::select(Region, ends_with("_A"))

dat_region_annot_enh = dat
print(dim(dat))
head(dat)

[1] 100454      5


Region,ASTARR_A,WSTARR_A,LMPRA_A,TMPRA_A
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:10038-10405,0,0,0,0
chr1:16025-16338,0,0,0,0
chr1:17288-17689,0,0,0,0
chr1:28934-29499,0,0,0,0
chr1:115429-115969,1,0,0,0
chr1:136201-137353,0,0,0,0


In [8]:
dat = dat_region_annot_fcc
dat = dat %>% dplyr::select(Region, ends_with("_R"))

dat_region_annot_rep = dat
print(dim(dat))
head(dat)

[1] 100454      5


Region,ASTARR_R,WSTARR_R,LMPRA_R,TMPRA_R
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:10038-10405,1,0,0,0
chr1:16025-16338,1,0,0,0
chr1:17288-17689,1,0,0,0
chr1:28934-29499,1,0,0,0
chr1:115429-115969,0,0,0,0
chr1:136201-137353,1,0,0,0


In [9]:
dat = dat_region_annot_fcc_pgc
dat = dat %>% 
    dplyr::select(Region, ASTARR_R_GC, ends_with("_R")) %>%
    dplyr::select(-ASTARR_R)

dat_region_annot_rep_gcfilter = dat
print(dim(dat))
head(dat)

[1] 100454      5


Region,ASTARR_R_GC,WSTARR_R,LMPRA_R,TMPRA_R
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:10038-10405,1,0,0,0
chr1:16025-16338,1,0,0,0
chr1:17288-17689,0,0,0,0
chr1:28934-29499,0,0,0,0
chr1:115429-115969,0,0,0,0
chr1:136201-137353,0,0,0,0


## CRISPRi E2G

In [10]:
dat = dat_region_annot_e2g
dat = dat %>% 
    dplyr::filter(Regulated_Ifany) %>% 
    dplyr::select(Chrom, ChromStart, ChromEnd, Region) %>% 
    dplyr::mutate(CRISPR_E2G = 1)

dat_region_annot_e2g_arrange = dat
head(dat)

Chrom,ChromStart,ChromEnd,Region,CRISPR_E2G
<chr>,<dbl>,<dbl>,<chr>,<dbl>
chr1,3774056,3776283,chr1:3774056-3776283,1
chr1,8857787,8858608,chr1:8857787-8858608,1
chr1,25932989,25934187,chr1:25932989-25934187,1
chr1,26378069,26379011,chr1:26378069-26379011,1
chr1,28883566,28885491,chr1:28883566-28885491,1
chr1,70674903,70676391,chr1:70674903-70676391,1


In [11]:
dat = dplyr::left_join(
    dat_region_annot_fcc,
    dat_region_annot_e2g_arrange,
    by = c("Chrom", "ChromStart", "ChromEnd", "Region")
)

dat = dat %>% 
    dplyr::select(Region, starts_with("CRISPR")) %>% 
    replace(is.na(.), 0)

dat_region_annot_crispr = dat
print(dim(dat))
head(dat)

[1] 100454      4


Region,CRISPRi-HCRFF,CRISPRi-Growth,CRISPR_E2G
<chr>,<dbl>,<dbl>,<dbl>
chr1:10038-10405,0,0,0
chr1:16025-16338,0,0,0
chr1:17288-17689,0,0,0
chr1:28934-29499,0,0,0
chr1:115429-115969,0,0,0
chr1:136201-137353,0,0,0


## Count assay for each region

In [12]:
lst = list(
    "Enhancer"  = dat_region_annot_enh,
    "Repressor" = dat_region_annot_rep,
    "Repressor_GCFilter" = dat_region_annot_rep_gcfilter,
    "CRISPR"    = dat_region_annot_crispr
)

lst = lapply(lst, function(dat){
    dat = dat %>% tibble::column_to_rownames(var="Region")
    vec = apply(dat, 1, sum)
    
    idx = (vec > 0)
    vec = vec[idx]

    dat = data.frame(
        Region    = names(vec),
        Num_Assay = vec
    )
    return(dat)
})

dat = bind_rows(lst, .id = "Type")
dat = dat %>% tidyr::separate(Region, c("Chrom", "ChromStart", "ChromEnd"), remove = FALSE)
dat = dat %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region, Type, Num_Assay)

dat_region_annot_fcc_vote = dat
print(dim(dat))
head(dat)

[1] 160666      6


Unnamed: 0_level_0,Chrom,ChromStart,ChromEnd,Region,Type,Num_Assay
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
chr1:115429-115969,chr1,115429,115969,chr1:115429-115969,Enhancer,1
chr1:184091-184563,chr1,184091,184563,chr1:184091-184563,Enhancer,1
chr1:605104-605675...3,chr1,605104,605675,chr1:605104-605675,Enhancer,1
chr1:778233-779389,chr1,778233,779389,chr1:778233-779389,Enhancer,3
chr1:818708-819335...5,chr1,818708,819335,chr1:818708-819335,Enhancer,1
chr1:826796-828040...6,chr1,826796,828040,chr1:826796-828040,Enhancer,1


In [13]:
dat = dat_region_annot_fcc_vote
table(dat$Type)


            CRISPR           Enhancer          Repressor Repressor_GCFilter 
              4740              43993              65401              46532 

In [14]:
dat = dat_region_annot_fcc_vote
table(dat$Type, dat$Num_Assay)

                    
                         1     2     3     4
  CRISPR              4692    40     8     0
  Enhancer           27396 12548  4011    38
  Repressor          63652  1744     5     0
  Repressor_GCFilter 45310  1219     3     0

In [23]:
dat = dat_region_annot_fcc_vote
table(dat$Type, dat$Num_Assay)

                    
                         1     2     3     4
  CRISPR              4662    40     8     0
  Enhancer           27403 13617  6609    57
  Repressor          71923  2610     9     0
  Repressor_GCFilter 51591  1599     3     0

## Export results

In [15]:
txt_fdiry = file.path(
    FD_RES, 
    "region_annotation", 
    "fcc_astarr_macs_input_overlap",
    "summary"
)
txt_fname = "region.summary.fcc_peak_call.assayvote.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_region_annot_fcc_vote
write_tsv(dat, txt_fpath)