**Set environment**

In [1]:
### load the project configuration script; its messages and warnings are silenced
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### print the environment summary (show_env is presumably defined by the sourced config -- confirm)
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



**Set global variable**

In [2]:
### name of the region folder to import; it is joined under file.path(FD_RES, "region", ...) when the file path is set
TXT_FOLDER_REGION = "fcc_table"

## Import data

In [3]:
### set file path
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fname  = "K562.hg38.*.bed.gz"
txt_fglob  = file.path(txt_fdiry, txt_fname)

### expand the wildcard; Sys.glob returns character(0) when nothing matches
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

### fail early instead of silently importing an empty table downstream
if (length(vec_txt_fpath) == 0) {
    stop("No region files found matching: ", txt_fglob)
}

### list the matched file names, one per line
vec = vec_txt_fname
for(txt in vec){cat(txt, "\n")}

K562.hg38.atac.bed.gz 
K562.hg38.encode_e2g_benchmark.bed.gz 
K562.hg38.fcc_astarr_csaw.bed.gz 
K562.hg38.fcc_crispri_growth.bed.gz 
K562.hg38.fcc_crispri_hcrff.bed.gz 
K562.hg38.fcc_starrmpra_junke.bed.gz 
K562.hg38.tss.bed.gz 


In [4]:
### read table
### column names shared by every region file; because names are supplied
### explicitly, read_tsv treats the first line of each file as data
vec_txt_cname = c(
    "Chrom", "ChromStart", "ChromEnd", "Group", "Label",
    "Assay", "Region", "Target", "Score", "NLog10P",
    "Method", "Source"
)

### read each matched file with the intended schema and stack the rows
lst = lapply(vec_txt_fpath, function(txt_fpath){
    ### name the columns at read time; the previous code passed `vec`
    ### (the leftover file-name vector) and renamed the columns afterwards
    dat = read_tsv(txt_fpath, col_names = vec_txt_cname, show_col_types = FALSE)
    ### guard against a file whose column count does not match the schema
    stopifnot(ncol(dat) == length(vec_txt_cname))
    return(dat)
})
dat = bind_rows(lst)

### assign and show
dat_region_import = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 895792     12


Chrom,ChromStart,ChromEnd,Group,Label,Assay,Region,Target,Score,NLog10P,Method,Source
chr1,10038,10405,ATAC,ATAC,ATAC,chr1:10038-10405,,3.940038,,MACS,Reddy Lab
chr1,14282,14614,ATAC,ATAC,ATAC,chr1:14282-14614,,2.841707,,MACS,Reddy Lab
chr1,16025,16338,ATAC,ATAC,ATAC,chr1:16025-16338,,3.830812,,MACS,Reddy Lab


## Define columns

In [5]:
### setup column description
### one entry per imported column: its name and a short human-readable note
vec_name = c(
    "Chrom", "ChromStart", "ChromEnd", "Group", "Label", "Assay",
    "Region", "Target", "Score", "NLog10P", "Method", "Source"
)
vec_note = c(
    "Name of the chromosome",
    "The starting position of the feature in the chromosome",
    "The ending position of the feature in the chromosome",
    "Region group",
    "Region label",
    "Assay or annotation name",
    "Region coordinate",
    "Targeted genes or guides",
    "Score assigned to a region.",
    "-log10 of P-value",
    "Method of analysis",
    "Dataset or data source"
)
dat = tibble(Name = vec_name, Note = vec_note)

### assign and show
dat_cname = dat
fun_display_table(dat)

Name,Note
Chrom,Name of the chromosome
ChromStart,The starting position of the feature in the chromosome
ChromEnd,The ending position of the feature in the chromosome
Group,Region group
Label,Region label
Assay,Assay or annotation name
Region,Region coordinate
Target,Targeted genes or guides
Score,Score assigned to a region.
NLog10P,-log10 of P-value


**Check**

In [6]:
### confirm the imported columns match the documented column names exactly
### (same names, same order, same length)
vec1 = colnames(dat_region_import)
vec2 = dat_cname$Name
### identical() avoids the silent recycling that `==` performs when the two
### vectors differ in length
identical(vec1, vec2)

In [7]:
### number of imported rows in each region group
dat = dat_region_import
table(dat[["Group"]])


        ASTARR           ATAC CRISPRi-Growth  CRISPRi-HCRFF  E2G-Benchmark 
        542786         150041           6242            113          10375 
         LMPRA          TMPRA            TSS         WSTARR 
         26133           6271          11892         141939 

In [8]:
### preview the first six rows of the imported table
dat = dat_region_import
head(dat, n = 6)

Chrom,ChromStart,ChromEnd,Group,Label,Assay,Region,Target,Score,NLog10P,Method,Source
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>
chr1,10038,10405,ATAC,ATAC,ATAC,chr1:10038-10405,,3.940038,,MACS,Reddy Lab
chr1,14282,14614,ATAC,ATAC,ATAC,chr1:14282-14614,,2.841707,,MACS,Reddy Lab
chr1,16025,16338,ATAC,ATAC,ATAC,chr1:16025-16338,,3.830812,,MACS,Reddy Lab
chr1,17288,17689,ATAC,ATAC,ATAC,chr1:17288-17689,,6.198372,,MACS,Reddy Lab
chr1,28934,29499,ATAC,ATAC,ATAC,chr1:28934-29499,,4.064322,,MACS,Reddy Lab
chr1,115429,115969,ATAC,ATAC,ATAC,chr1:115429-115969,,15.096518,,MACS,Reddy Lab


In [24]:
### ASTARR rows with Method == "Junke": keep one row per distinct
### (Region, Group, Label, Method, Length) combination
dat = dat_region_import %>%
    dplyr::filter(Group == "ASTARR" & Method == "Junke") %>%
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::select(Region, Group, Label, Method, Length) %>%
    dplyr::distinct()

### table dimensions, then the number of unique region coordinates
print(dim(dat))
vec = unique(dat$Region)
print(length(vec))

### rows per label and the distribution of region lengths
print(table(dat$Label))
print(summary(dat$Length))

[1] 189842      5
[1] 189510

ASTARR_A:junke ASTARR_R:junke 
         35505         154337 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  100.0   110.0   140.0   191.6   220.0  5510.0 


In [23]:
### WSTARR rows with Method == "Junke": keep one row per distinct
### (Region, Group, Label, Method, Length) combination
dat = dat_region_import %>%
    dplyr::filter(Group == "WSTARR" & Method == "Junke") %>%
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::select(Region, Group, Label, Method, Length) %>%
    dplyr::distinct()

### table dimensions, then the number of unique region coordinates
print(dim(dat))
vec = unique(dat$Region)
print(length(vec))

### rows per label and the distribution of region lengths
print(table(dat$Label))
print(summary(dat$Length))

[1] 141939      5
[1] 140238

WSTARR_A:junke WSTARR_R:junke 
         79738          62201 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  100.0   110.0   150.0   213.4   260.0  2010.0 


In [26]:
### TMPRA rows with Method == "Junke": keep one row per distinct
### (Region, Group, Label, Method, Length) combination
dat = dat_region_import %>%
    dplyr::filter(Group == "TMPRA" & Method == "Junke") %>%
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::select(Region, Group, Label, Method, Length) %>%
    dplyr::distinct()

### table dimensions, then the number of unique region coordinates
print(dim(dat))
vec = unique(dat$Region)
print(length(vec))

### rows per label and the distribution of region lengths
print(table(dat$Label))
print(summary(dat$Length))

[1] 6271    5
[1] 6271

TMPRA_A:junke TMPRA_R:junke 
         6017           254 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  199.0   199.0   199.0   251.7   299.0  2399.0 


In [31]:
### ratio of TMPRA_A:junke to TMPRA_R:junke row counts, derived from the data
### rather than hard-coded (previously 6017/254, copied from printed output);
### local() keeps the intermediates out of the global environment
local({
    dat_label = dat_region_import %>%
        dplyr::filter(Group == "TMPRA", Method == "Junke") %>%
        dplyr::mutate(Length = ChromEnd - ChromStart) %>%
        dplyr::select(Region, Group, Label, Method, Length) %>%
        dplyr::distinct()
    vec_count = table(dat_label$Label)
    vec_count[["TMPRA_A:junke"]] / vec_count[["TMPRA_R:junke"]]
})

In [25]:
### LMPRA rows with Method == "Junke": keep one row per distinct
### (Region, Group, Label, Method, Length) combination
dat = dat_region_import %>%
    dplyr::filter(Group == "LMPRA" & Method == "Junke") %>%
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::select(Region, Group, Label, Method, Length) %>%
    dplyr::distinct()

### table dimensions, then the number of unique region coordinates
print(dim(dat))
vec = unique(dat$Region)
print(length(vec))

### rows per label and the distribution of region lengths
print(table(dat$Label))
print(summary(dat$Length))

[1] 26133     5
[1] 26132

LMPRA_A:junke LMPRA_R:junke 
        25648           485 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  200.0   200.0   200.0   200.9   200.0   600.0 


In [30]:
### ratio of LMPRA_A:junke to LMPRA_R:junke row counts, derived from the data
### rather than hard-coded (previously 25648/485, copied from printed output);
### local() keeps the intermediates out of the global environment
local({
    dat_label = dat_region_import %>%
        dplyr::filter(Group == "LMPRA", Method == "Junke") %>%
        dplyr::mutate(Length = ChromEnd - ChromStart) %>%
        dplyr::select(Region, Group, Label, Method, Length) %>%
        dplyr::distinct()
    vec_count = table(dat_label$Label)
    vec_count[["LMPRA_A:junke"]] / vec_count[["LMPRA_R:junke"]]
})

In [28]:
### CRISPRi-Growth rows: keep one row per distinct
### (Region, Group, Label, Method, Length) combination
dat = dat_region_import %>%
    dplyr::filter(Group == "CRISPRi-Growth") %>%
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::select(Region, Group, Label, Method, Length) %>%
    dplyr::distinct()

### table dimensions, then the number of unique region coordinates
print(dim(dat))
vec = unique(dat$Region)
print(length(vec))

### rows per label and the distribution of region lengths
print(table(dat$Label))
print(summary(dat$Length))

[1] 6242    5
[1] 6242

Signif 
  6242 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   27.0   200.0   379.5   556.6   796.8  4467.0 


In [29]:
### CRISPRi-HCRFF rows: keep one row per distinct
### (Region, Group, Label, Method, Length) combination
dat = dat_region_import %>%
    dplyr::filter(Group == "CRISPRi-HCRFF") %>%
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::select(Region, Group, Label, Method, Length) %>%
    dplyr::distinct()

### table dimensions, then the number of unique region coordinates
print(dim(dat))
vec = unique(dat$Region)
print(length(vec))

### rows per label and the distribution of region lengths
print(table(dat$Label))
print(summary(dat$Length))

[1] 113   5
[1] 102

CRISPRi-HCRFF:CAPRIN1     CRISPRi-HCRFF:CAT   CRISPRi-HCRFF:CD164 
                    4                     2                     5 
  CRISPRi-HCRFF:ERP29   CRISPRi-HCRFF:FADS1   CRISPRi-HCRFF:FADS2 
                    4                     5                     4 
  CRISPRi-HCRFF:FADS3    CRISPRi-HCRFF:FEN1   CRISPRi-HCRFF:GATA1 
                    6                     2                     3 
   CRISPRi-HCRFF:HBE1    CRISPRi-HCRFF:HBG1    CRISPRi-HCRFF:HBG2 
                    6                    13                    13 
  CRISPRi-HCRFF:HBS1L   CRISPRi-HCRFF:HDAC6    CRISPRi-HCRFF:LMO2 
                    1                     3                     6 
  CRISPRi-HCRFF:MEF2C     CRISPRi-HCRFF:MYB     CRISPRi-HCRFF:MYC 
                    3                     4                     5 
    CRISPRi-HCRFF:NMU    CRISPRi-HCRFF:PVT1 
                   11                    13 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    200     600    1000    1343    2