**Set environment**

In [1]:
suppressWarnings(suppressMessages(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


**Check data files**

In [2]:
fdiry = file.path(FD_RES, "results", "region", "annotation_fcc_table")
for (txt in dir(fdiry)){cat(txt, "\n")}

annotation_ccres 
annotation_ccres_silencer 
annotation_chromHMM 
annotation_tss_pol2 
description.tsv 
fcc_table.starrmpra.crispri.atac.concat.bed.gz 
fcc_table.starrmpra.crispri.atac.e2g_benchmark.concat.bed.gz 
fcc_table.starrmpra.crispri.atac.e2g_benchmark.e2g_prediction.concat.bed.gz 
fcc_table.starrmpra.crispri.atac.e2g_benchmark.e2g_prediction.merge.bed.gz 
fcc_table.starrmpra.crispri.atac.e2g_benchmark.e2g_prediction.merge.tsv 
fcc_table.starrmpra.crispri.atac.e2g_benchmark.merge.bed.gz 
fcc_table.starrmpra.crispri.atac.e2g_benchmark.merge.tsv 
fcc_table.starrmpra.crispri.atac.merge.bed.gz 
fcc_table.starrmpra.crispri.atac.merge.tsv 
summary 
z_summary 


## Import data

In [3]:
fdiry = file.path(FD_RES, "results", "region", "annotation_fcc_table")
fname = "fcc_table.starrmpra.crispri.atac.e2g_benchmark.merge.bed.gz"
fpath = file.path(fdiry, fname)

cnames = c("Chrom", "Start", "End", "Assay_Type", "Assay_Label")
dat = read_tsv(fpath, col_names=cnames, show_col_types=FALSE)
dat = dat %>% 
    dplyr::mutate(Region = paste0(Chrom, ":", Start, "-", End)) %>%
    dplyr::select(Chrom, Start, End, Region, Assay_Type, Assay_Label)

dat_region_merge_import = dat
print(dim(dat))
head(dat)

[1] 348966      6


Chrom,Start,End,Region,Assay_Type,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
chr1,10015,10442,chr1:10015-10442,ATAC,ATAC
chr1,14253,14645,chr1:14253-14645,ATAC,ATAC
chr1,16015,16477,chr1:16015-16477,ATAC,ATAC
chr1,17237,17772,chr1:17237-17772,ATAC,ATAC
chr1,28903,29613,chr1:28903-29613,ATAC,ATAC
chr1,30803,31072,chr1:30803-31072,ATAC,ATAC


**Check**

In [4]:
dat = dat_region_merge_import
txt = "ASTARR"
dat = dat %>% dplyr::filter(Assay_Type == txt)
table(dat$Assay_Label)


          ASTARR_A ASTARR_A,ASTARR_AB  ASTARR_A,ASTARR_R           ASTARR_R 
               335                 45                 38                726 

In [5]:
dat = dat_region_merge_import
txt = "ATAC"
dat = dat %>% dplyr::filter(Assay_Type == txt)
table(dat$Assay_Label)


  ATAC 
183366 

## Arrange and summary

In [9]:
dat = dat_region_merge_import
dat = dat %>% dplyr::filter(Assay_Label != "ATAC")
dat = dat %>% tidyr::separate_longer_delim(Assay_Type, ",")

dat_region_merge_arrange = dat
print(dim(dat))
head(dat)

[1] 263825      6


Chrom,Start,End,Region,Assay_Type,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
chr1,136071,137429,chr1:136071-137429,ASTARR,"ASTARR_R,ATAC"
chr1,136071,137429,chr1:136071-137429,ATAC,"ASTARR_R,ATAC"
chr1,180982,182087,chr1:180982-182087,ASTARR,"ASTARR_R,ATAC"
chr1,180982,182087,chr1:180982-182087,ATAC,"ASTARR_R,ATAC"
chr1,183239,184602,chr1:183239-184602,ATAC,"ATAC,WSTARR_A"
chr1,183239,184602,chr1:183239-184602,WSTARR,"ATAC,WSTARR_A"


## Split by assay type

In [12]:
dat = dat_region_merge_arrange
lst = split(dat$Region, dat$Assay_Type)

lst_region_assay_type = lst
lst = lapply(lst, length)
print(lst)

$ASTARR
[1] 19647

$ATAC
[1] 64076

$`CRISPRi-Growth`
[1] 6171

$`CRISPRi-HCRFF`
[1] 74

$`ENCODE-E2G_Benchmark`
[1] 384

$LMPRA
[1] 38993

$TMPRA
[1] 5686

$WSTARR
[1] 128794



## Split by assay group

In [29]:
dat = dat_region_merge_arrange
dat = dat %>% 
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "CRISPRi"),
            "CRISPR",
            Assay_Type
        )
    ) %>%
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "STARR"),
            "STARR",
            Assay_Type
        )
    ) %>%
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "MPRA"),
            "MPRA",
            Assay_Type
        )
    ) %>%
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "ENCODE-E2G"),
            "CRISPR",
            Assay_Type
        )
    )

dat = dat %>% dplyr::distinct()

dat_region_merge_summary = dat
print(dim(dat))
head(dat)

[1] 252019      6


Chrom,Start,End,Region,Assay_Type,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
chr1,136071,137429,chr1:136071-137429,STARR,"ASTARR_R,ATAC"
chr1,136071,137429,chr1:136071-137429,ATAC,"ASTARR_R,ATAC"
chr1,180982,182087,chr1:180982-182087,STARR,"ASTARR_R,ATAC"
chr1,180982,182087,chr1:180982-182087,ATAC,"ASTARR_R,ATAC"
chr1,183239,184602,chr1:183239-184602,ATAC,"ATAC,WSTARR_A"
chr1,183239,184602,chr1:183239-184602,STARR,"ATAC,WSTARR_A"


In [30]:
dat = dat_region_merge_summary
lst = split(dat$Region, dat$Assay_Type)

lst_region_assay_group1 = lst
lst = lapply(lst, length)
print(lst)

$ATAC
[1] 64076

$CRISPR
[1] 6572

$MPRA
[1] 44039

$STARR
[1] 137332



In [31]:
dat = dat_region_merge_arrange
dat = dat %>% 
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "CRISPRi"),
            "CRISPR",
            Assay_Type
        )
    ) %>%
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "STARR"),
            "STARR/MPRA",
            Assay_Type
        )
    ) %>%
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "MPRA"),
            "STARR/MPRA",
            Assay_Type
        )
    ) %>%
    dplyr::mutate(
        Assay_Type = ifelse(
            str_detect(Assay_Type, "ENCODE-E2G"),
            "CRISPR",
            Assay_Type
        )
    )

dat = dat %>% dplyr::distinct()

dat_region_merge_summary = dat
print(dim(dat))
head(dat)

[1] 233225      6


Chrom,Start,End,Region,Assay_Type,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
chr1,136071,137429,chr1:136071-137429,STARR/MPRA,"ASTARR_R,ATAC"
chr1,136071,137429,chr1:136071-137429,ATAC,"ASTARR_R,ATAC"
chr1,180982,182087,chr1:180982-182087,STARR/MPRA,"ASTARR_R,ATAC"
chr1,180982,182087,chr1:180982-182087,ATAC,"ASTARR_R,ATAC"
chr1,183239,184602,chr1:183239-184602,ATAC,"ATAC,WSTARR_A"
chr1,183239,184602,chr1:183239-184602,STARR/MPRA,"ATAC,WSTARR_A"


In [32]:
dat = dat_region_merge_summary
lst = split(dat$Region, dat$Assay_Type)

lst_region_assay_group2 = lst
lst = lapply(lst, length)
print(lst)

$ATAC
[1] 64076

$CRISPR
[1] 6572

$`STARR/MPRA`
[1] 162577



## Question 01

```
For all our identified peaks in CRISPRi-HCR FlowFISH
- X % are supported by STARR/MPRA
- Y % are supported by CRISPRi-Growth
```

In [33]:
lst = lst_region_assay_type
txt = "CRISPRi-HCRFF"
vec = lst[[txt]]

vec_txt_region_query = vec
print(length(vec))
print(vec)

[1] 74
 [1] "chr11:5248847-5249047"     "chr11:5249748-5250880"    
 [3] "chr11:5253147-5253547"     "chr11:5253647-5254647"    
 [5] "chr11:5269147-5271147"     "chr11:5275247-5276247"    
 [7] "chr11:5279747-5281947"     "chr11:5282047-5282647"    
 [9] "chr11:5283047-5283447"     "chr11:5283847-5286747"    
[11] "chr11:5287647-5288747"     "chr11:5503747-5506176"    
[13] "chr11:33868682-33871379"   "chr11:33880468-33883368"  
[15] "chr11:33936760-33938068"   "chr11:33940268-33943268"  
[17] "chr11:33943668-33946700"   "chr11:33947368-33947868"  
[19] "chr11:34050971-34054171"   "chr11:34085610-34086871"  
[21] "chr11:34437468-34440568"   "chr11:61791528-61794000"  
[23] "chr11:61796643-61797563"   "chr11:61813200-61818400"  
[25] "chr11:61832923-61836038"   "chr11:61841200-61842900"  
[27] "chr11:61864500-61864800"   "chr11:61868900-61873100"  
[29] "chr11:61890378-61892453"   "chr11:61892600-61893098"  
[31] "chr12:111993263-111995626" "chr12:112005226-112006426"
[33] "chr12:11201

In [35]:
lst = lst_region_assay_type
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = "CRISPRi-HCRFF"
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
CRISPRi-HCRFF,ASTARR,74,19647,25,0.34
CRISPRi-HCRFF,ATAC,74,64076,55,0.74
CRISPRi-HCRFF,CRISPRi-Growth,74,6171,17,0.23
CRISPRi-HCRFF,CRISPRi-HCRFF,74,74,74,1.0
CRISPRi-HCRFF,ENCODE-E2G_Benchmark,74,384,15,0.2
CRISPRi-HCRFF,LMPRA,74,38993,29,0.39
CRISPRi-HCRFF,TMPRA,74,5686,32,0.43
CRISPRi-HCRFF,WSTARR,74,128794,28,0.38


In [37]:
lst = lst_region_assay_group1
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = "CRISPRi-HCRFF"
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
CRISPRi-HCRFF,ATAC,74,64076,55,0.74
CRISPRi-HCRFF,CRISPR,74,6572,74,1.0
CRISPRi-HCRFF,MPRA,74,44039,43,0.58
CRISPRi-HCRFF,STARR,74,137332,35,0.47


In [38]:
lst = lst_region_assay_group2
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = "CRISPRi-HCRFF"
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
CRISPRi-HCRFF,ATAC,74,64076,55,0.74
CRISPRi-HCRFF,CRISPR,74,6572,74,1.0
CRISPRi-HCRFF,STARR/MPRA,74,162577,47,0.64


## Question 02

```
For all our identified peaks in CRISPRi-Growth
- Z % are supported by STARR/MPRA
```

In [40]:
lst = lst_region_assay_type
txt = "CRISPRi-Growth"
vec = lst[[txt]]

vec_txt_region_query = vec
print(length(vec))
print(head(vec))

[1] 6171
[1] "chr1:605090-605823"   "chr1:826642-828050"   "chr1:964946-965136"  
[4] "chr1:995761-996190"   "chr1:1067780-1070953" "chr1:1155362-1155639"


In [41]:
lst = lst_region_assay_type
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = txt
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
CRISPRi-Growth,ASTARR,6171,19647,1353,0.22
CRISPRi-Growth,ATAC,6171,64076,4882,0.79
CRISPRi-Growth,CRISPRi-Growth,6171,6171,6171,1.0
CRISPRi-Growth,CRISPRi-HCRFF,6171,74,17,0.0
CRISPRi-Growth,ENCODE-E2G_Benchmark,6171,384,33,0.01
CRISPRi-Growth,LMPRA,6171,38993,2113,0.34
CRISPRi-Growth,TMPRA,6171,5686,48,0.01
CRISPRi-Growth,WSTARR,6171,128794,2299,0.37


In [42]:
lst = lst_region_assay_group1
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = txt
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
CRISPRi-Growth,ATAC,6171,64076,4882,0.79
CRISPRi-Growth,CRISPR,6171,6572,6171,1.0
CRISPRi-Growth,MPRA,6171,44039,2137,0.35
CRISPRi-Growth,STARR,6171,137332,2660,0.43


In [43]:
lst = lst_region_assay_group2
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = txt
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
CRISPRi-Growth,ATAC,6171,64076,4882,0.79
CRISPRi-Growth,CRISPR,6171,6572,6171,1.0
CRISPRi-Growth,STARR/MPRA,6171,162577,3238,0.52


## Question 03

```
For our identified accessible regions (ATAC peaks),
U % are shown to have regulatory activity in STARR/MPRA
U1 % candidate enhancers
U2 % candidate silencers
V % are shown to change the cell fitness in CRISPRi-Growth
```

In [44]:
lst = lst_region_assay_type
txt = "ATAC"
vec = lst[[txt]]

vec_txt_region_query = vec
print(length(vec))
print(head(vec))

[1] 64076
[1] "chr1:136071-137429" "chr1:180982-182087" "chr1:183239-184602"
[4] "chr1:605090-605823" "chr1:777949-779437" "chr1:818602-819380"


In [45]:
lst = lst_region_assay_type
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = txt
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
ATAC,ASTARR,64076,19647,18064,0.28
ATAC,ATAC,64076,64076,64076,1.0
ATAC,CRISPRi-Growth,64076,6171,4882,0.08
ATAC,CRISPRi-HCRFF,64076,74,55,0.0
ATAC,ENCODE-E2G_Benchmark,64076,384,359,0.01
ATAC,LMPRA,64076,38993,33702,0.53
ATAC,TMPRA,64076,5686,1111,0.02
ATAC,WSTARR,64076,128794,38091,0.59


In [46]:
lst = lst_region_assay_group1
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = txt
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
ATAC,ATAC,64076,64076,64076,1.0
ATAC,CRISPR,64076,6572,5239,0.08
ATAC,MPRA,64076,44039,34448,0.54
ATAC,STARR,64076,137332,45441,0.71


In [47]:
lst = lst_region_assay_group2
lst = lapply(lst, function(vec_txt_region_target){
    vec = (vec_txt_region_query %in% vec_txt_region_target)
    res = c(
        "Length_Query"  = length(vec_txt_region_query),
        "Length_Target" = length(vec_txt_region_target),
        "Intersect"     = sum(vec),
        "Frequency"     = round(mean(vec), 2)
    )
    return(res)
})

dat = bind_rows(lst, .id = "Assay_Target")
dat$Assay_Query = txt
dat = dat %>% dplyr::select(Assay_Query, Assay_Target, Length_Query, Length_Target, Intersect, Frequency)
dat

Assay_Query,Assay_Target,Length_Query,Length_Target,Intersect,Frequency
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
ATAC,ATAC,64076,64076,64076,1.0
ATAC,CRISPR,64076,6572,5239,0.08
ATAC,STARR/MPRA,64076,162577,62238,0.97
