**Set environment**

In [1]:
### load the project configuration script (path is relative to the notebook's
### working directory); messages and warnings raised while sourcing are silenced
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### print the environment summary shown below
### NOTE(review): show_env, the FD_* paths and fun_display_table used by later
### cells are presumably defined by the sourced config -- confirm
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



## Prepare

**Set global variable**

In [2]:
### folders of the region sets to process; the vector is named by its own
### values so that lists built with lapply() are keyed by folder name
vec <- c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
)
vec <- setNames(vec, vec)

VEC_TXT_FOLDER <- vec

### list the selected folders, one per line
for (txt in vec) {
    cat(txt, "\n")
}

fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 


In [3]:
### file name shared by the per-folder annotation tables read in the cells below
TXT_FNAME_ANNOT <- "region.intersect.summary.genome_tss.tsv"

**View files**

In [4]:
### glob pattern matching the annotation table in the summary folder of
### every region set under <FD_RES>/region_annotation
txt_fname <- TXT_FNAME_ANNOT
txt_fdiry <- file.path(FD_RES, "region_annotation", "*", "summary")
txt_fglob <- file.path(txt_fdiry, txt_fname)

### list the matching files found on disk, one per line
vec <- Sys.glob(txt_fglob)
for (txt in vec) {
    cat(txt, "\n")
}

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_overlap/summary/region.intersect.summary.genome_tss.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_union/summary/region.intersect.summary.genome_tss.tsv 


## Import data

**Import region pairs**

In [5]:
### read the annotation table of every region folder into a list keyed by folder
lst <- lapply(VEC_TXT_FOLDER, function(txt_folder) {
    ### <FD_RES>/region_annotation/<folder>/summary/<TXT_FNAME_ANNOT>
    txt_fpath <- file.path(
        FD_RES, "region_annotation", txt_folder, "summary", TXT_FNAME_ANNOT
    )
    read_tsv(txt_fpath, show_col_types = FALSE)
})
lst <- setNames(lst, VEC_TXT_FOLDER)

### keep the imported tables for the following cells
lst_dat_region_annot_import <- lst

### report the dimension of each table and preview the first one
res <- lapply(lst, dim)
print(res)

dat <- lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 26926    10

$fcc_astarr_macs_input_union
[1] 28665    10



Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Group,Label,Region_Annot,Region_Count
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-1,chr1:17436-17437,1
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-2,chr1:17436-17437,1
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-3,chr1:17436-17437,1


**Explore: Group**

In [6]:
### count the imported annotation rows per Group within each region set
lst <- lapply(lst_dat_region_annot_import, function(tbl) {
    as.data.frame(table(tbl$Group, dnn = "Group"))
})

### stack the per-set counts (set name goes to column Region) and
### spread the Group levels into columns
dat <- lst %>%
    bind_rows(.id = "Region") %>%
    tidyr::spread(Group, Freq)
fun_display_table(dat)

Region,TSS_Pol2,TSS_Pol2_RNAseq
fcc_astarr_macs_input_overlap,16109,10817
fcc_astarr_macs_input_union,17697,10968


**Explore: Group x Region Count**

In [7]:
### cross-tabulate Group against Region_Count within each region set
lst <- lapply(lst_dat_region_annot_import, function(tbl) {
    as.data.frame(
        table(tbl$Group, tbl$Region_Count, dnn = c("Group", "Region_Count"))
    )
})

### stack the per-set counts (set name goes to column Region) and
### spread the Region_Count levels into columns
dat <- lst %>%
    bind_rows(.id = "Region") %>%
    tidyr::spread(Region_Count, Freq)
fun_display_table(dat)

Region,Group,1,2,3
fcc_astarr_macs_input_overlap,TSS_Pol2,16076,32,1
fcc_astarr_macs_input_overlap,TSS_Pol2_RNAseq,10803,14,0
fcc_astarr_macs_input_union,TSS_Pol2,17661,35,1
fcc_astarr_macs_input_union,TSS_Pol2_RNAseq,10954,14,0


**Check: Region count > 1**

In [8]:
### preview rows of the first region set whose Region_Count exceeds one
### (set Group to "TSS_Pol2_RNAseq" to inspect the other group instead)
lst <- lst_dat_region_annot_import
dat <- lst[[1]]
head(dplyr::filter(dat, Region_Count > 1, Group == "TSS_Pol2"), 3)

Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Group,Label,Region_Annot,Region_Count
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
chr1,23167385,23170223,chr1:23167385-23170223,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,LUZP1,chr1:23168856-23168857;chr1:23168858-23168859,2
chr1,120069078,120069775,chr1:120069078-120069775,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,NOTCH2,chr1:120069662-120069663;chr1:120069703-120069704,2
chr1,145607800,145608529,chr1:145607800-145608529,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,GPR89A,chr1:145607959-145607960;chr1:145607987-145607988,2


**Import essential gene list**

In [9]:
### location of the essential gene list under the reference folder
txt_fname <- "demap.v24Q2.AchillesCommonEssentialControls.csv"
txt_fdiry <- file.path(FD_REF, "genome_gene")
txt_fpath <- file.path(txt_fdiry, txt_fname)

### read the list (the output below shows a single column named Gene)
dat <- read_csv(txt_fpath, show_col_types = FALSE)

### keep the raw import for later cells, then report its size and first rows
dat_gene_essential_import <- dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 1247    1


Gene
AAMP (14)
AARS1 (16)
AASDHPPT (60496)


## Arrange table

In [10]:
### keep only the rows of group "TSS_Pol2" in every region set
### (use "TSS_Pol2_RNAseq" here to work with the other group instead)
lst <- lapply(lst_dat_region_annot_import, function(tbl) {
    dplyr::filter(tbl, Group == "TSS_Pol2")
})

### keep the filtered tables for the following cells
lst_dat_region_annot_arrange <- lst

### report the dimension of each table and preview the first one
res <- lapply(lst, dim)
print(res)

dat <- lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 16109    10

$fcc_astarr_macs_input_union
[1] 17697    10



Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Group,Label,Region_Annot,Region_Count
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-1,chr1:17436-17437,1
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-2,chr1:17436-17437,1
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-3,chr1:17436-17437,1


**Arrange essential gene list**

In [11]:
### split the Gene column at the space: the first piece stays in Gene,
### the parenthesised remainder goes to Note (e.g. "AAMP (14)" -> "AAMP", "(14)")
dat <- dat_gene_essential_import %>%
    tidyr::separate(col = "Gene", into = c("Gene", "Note"), sep = " ")

dat_gene_essential_arrange <- dat
fun_display_table(head(dat))

Gene,Note
AAMP,(14)
AARS1,(16)
AASDHPPT,(60496)
ABCB7,(22)
ABCE1,(6059)
ABCF1,(23)


**Explore: Group**

In [12]:
### count the filtered annotation rows per Group within each region set
lst <- lapply(lst_dat_region_annot_arrange, function(tbl) {
    as.data.frame(table(tbl$Group, dnn = "Group"))
})

### stack the per-set counts (set name goes to column Region) and
### spread the Group levels into columns
dat <- lst %>%
    bind_rows(.id = "Region") %>%
    tidyr::spread(Group, Freq)
fun_display_table(dat)

Region,TSS_Pol2
fcc_astarr_macs_input_overlap,16109
fcc_astarr_macs_input_union,17697


## Process: Add gene essentiality

In [13]:
### get essential gene list (gene symbols only)
dat = dat_gene_essential_arrange
vec = dat$Gene
vec_txt_gene = vec

### add a logical column Essential to every region set:
### TRUE when the row's Label is one of the essential gene symbols
lst = lst_dat_region_annot_arrange
lst = lapply(lst, function(dat){

    ### %in% is vectorised, so the whole Label column is matched in one call;
    ### no rowwise() grouping is needed (it only forced one lookup per row)
    dat = dat %>% dplyr::mutate(Essential = Label %in% vec_txt_gene)
    return(dat)
})

### assign and show
lst_dat_region_annot_result = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 16109    11

$fcc_astarr_macs_input_union
[1] 17697    11



Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Group,Label,Region_Annot,Region_Count,Essential
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-1,chr1:17436-17437,1,False
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-2,chr1:17436-17437,1,False
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,genome_tss_pol2,TSS_Pol2,MIR6859-3,chr1:17436-17437,1,False


**Check missing values**

In [14]:
### TRUE for a region set when any cell of its result table is NA
lst <- lapply(lst_dat_region_annot_result, function(tbl) any(is.na(tbl)))
print(lst)

$fcc_astarr_macs_input_overlap
[1] FALSE

$fcc_astarr_macs_input_union
[1] FALSE



**Explore: count essential genes**

In [15]:
### per region set: count distinct (Region, Group, Label, Essential) rows
### by Group and essentiality label
lst <- lapply(lst_dat_region_annot_result, function(tbl) {
    tbl_pair <- tbl %>%
        dplyr::select(Region, Group, Label, Essential) %>%
        dplyr::distinct() %>%
        dplyr::mutate(
            Note = ifelse(Essential, "Essential:TRUE", "Essential:FALSE")
        )
    as.data.frame(
        table(tbl_pair$Group, tbl_pair$Note, dnn = c("Group", "Note"))
    )
})

### stack the per-set counts (set name goes to column Region) and
### spread the essentiality labels into columns
dat <- lst %>%
    bind_rows(.id = "Region") %>%
    tidyr::spread(Note, Freq)
fun_display_table(dat)

Region,Group,Essential:FALSE,Essential:TRUE
fcc_astarr_macs_input_overlap,TSS_Pol2,14867,1242
fcc_astarr_macs_input_union,TSS_Pol2,16455,1242


## Matrix

In [16]:
### collapse the annotation rows to one row per region and mark each
### remaining region with the indicator column TSS = 1
lst <- lapply(lst_dat_region_annot_arrange, function(tbl) {
    tbl %>%
        dplyr::select(Chrom, ChromStart, ChromEnd, Region) %>%
        dplyr::distinct() %>%
        dplyr::mutate(TSS = 1)
})

### keep the region-level tables for export
lst_dat_region_annot_matrix <- lst

### report the dimension of each table and preview the first one
res <- lapply(lst, dim)
print(res)

dat <- lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 13732     5

$fcc_astarr_macs_input_union
[1] 15182     5



Chrom,ChromStart,ChromEnd,Region,TSS
chr1,17288,17689,chr1:17288-17689,1
chr1,28934,29499,chr1:28934-29499,1
chr1,778233,779389,chr1:778233-779389,1


**Check: dimension should match the number of unique regions**

In [17]:
### count the distinct (Region, Group) combinations per Group in each region
### set; the totals are meant to be compared with the row counts of the
### region-level tables printed by the previous cell
lst <- lapply(lst_dat_region_annot_arrange, function(tbl) {
    tbl_uniq <- dplyr::distinct(dplyr::select(tbl, Region, Group))
    as.data.frame(table(tbl_uniq$Group, dnn = c("Group")))
})

### stack the per-set counts (set name goes to column Region) and
### spread the Group levels into columns
dat <- lst %>%
    bind_rows(.id = "Region") %>%
    tidyr::spread(Group, Freq)
fun_display_table(dat)

Region,TSS_Pol2
fcc_astarr_macs_input_overlap,13732
fcc_astarr_macs_input_union,15182


## Export results

In [18]:
### write both result tables of every region set into that set's summary folder
for (txt_folder in VEC_TXT_FOLDER) {

    ### tables of this region set
    dat_region_annot_result <- lst_dat_region_annot_result[[txt_folder]]
    dat_region_annot_matrix <- lst_dat_region_annot_matrix[[txt_folder]]

    ### output directory: <FD_RES>/region_annotation/<folder>/summary
    txt_fdiry <- file.path(FD_RES, "region_annotation", txt_folder, "summary")

    ### annotation rows with the Essential flag, sorted by coordinates
    txt_fname <- "region.annotation.genome_tss.tsv"
    txt_fpath <- file.path(txt_fdiry, txt_fname)

    dat <- dplyr::arrange(dat_region_annot_result, Chrom, ChromStart, ChromEnd)
    write_tsv(dat, txt_fpath)

    ### one row per region with the TSS indicator, sorted by coordinates
    txt_fname <- "matrix.annotation.genome_tss.tsv"
    txt_fpath <- file.path(txt_fdiry, txt_fname)

    dat <- dplyr::arrange(dat_region_annot_matrix, Chrom, ChromStart, ChromEnd)
    write_tsv(dat, txt_fpath)
}