**Set environment**

In [3]:
### load the project configuration script; any messages and warnings it emits are silenced
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### print the environment summary
### NOTE(review): show_env is not defined in this notebook; presumably it comes from the sourced config -- confirm
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



## Prepare

**Set global variable**

In [4]:
### region-set folders handled by this notebook
### every element is named after itself, so lists built with lapply()
### over this vector can be indexed by folder name
VEC_TXT_FOLDER = setNames(nm = c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
))

### show: one folder per line
cat(sprintf("%s \n", VEC_TXT_FOLDER), sep = "")

fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 


**View files: FCC region coverage**

In [5]:
### glob pattern for the coverage-score summary tables of the first region set
txt_folder = VEC_TXT_FOLDER[1]
txt_fglob  = file.path(
    FD_RES, "region_coverage_fcc_score", txt_folder, "summary",
    "region.coverage.score.*.tsv"
)

### show: one matching file per line
vec = Sys.glob(txt_fglob)
cat(sprintf("%s \n", vec), sep = "")

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_coverage_fcc_score/fcc_astarr_macs_input_overlap/summary/region.coverage.score.atac.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_coverage_fcc_score/fcc_astarr_macs_input_overlap/summary/region.coverage.score.fcc.final.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_coverage_fcc_score/fcc_astarr_macs_input_overlap/summary/region.coverage.score.fcc.test.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_coverage_fcc_score/fcc_astarr_macs_input_overlap/summary/region.coverage.score.fcc.total.tsv 


**View files: STARR/MPRA CREs**

In [6]:
### glob pattern for the STARR/MPRA group-label annotation across all region sets
txt_fglob = file.path(
    FD_RES, "region_annotation", "*", "summary",
    "region.annotation.fcc_starrmpra_junke.group.label.tsv"
)

### show: one matching file per line
vec = Sys.glob(txt_fglob)
cat(sprintf("%s \n", vec), sep = "")

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_overlap/summary/region.annotation.fcc_starrmpra_junke.group.label.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_union/summary/region.annotation.fcc_starrmpra_junke.group.label.tsv 


## Import data

### Import FCC region coverage scores

In [7]:
### read the ATAC coverage-score table of every region set
### (the result is a list keyed by the names of VEC_TXT_FOLDER)
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### full path of the summary table for this region set
    path_score = file.path(
        FD_RES, "region_coverage_fcc_score", txt_folder, "summary",
        "region.coverage.score.atac.tsv"
    )

    ### read table
    read_tsv(path_score, show_col_types = FALSE)
})

### assign
lst_dat_region_score_atac_import = lst

### show: dimensions of each table, then the first rows of the first table
res = lapply(lst, dim)
print(res)

dat = lst[[1]]
head(dat, 3)

$fcc_astarr_macs_input_overlap
[1] 150041     12

$fcc_astarr_macs_input_union
[1] 246852     12



Chrom,ChromStart,ChromEnd,Region,Score,Score_Label,Score_Quantile,Zcore,Zcore_Label,Assay_Folder,Assay_Group,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
chr1,10038,10405,chr1:10038-10405,3.940038,ASTARR Input TPM,Q3,-0.26087,scale(log2(TPM + 1),STARR_ATAC_K562_Reddy_KS91,ASTARR_KS91,ATAC
chr1,14282,14614,chr1:14282-14614,2.841707,ASTARR Input TPM,Q2,-0.6570534,scale(log2(TPM + 1),STARR_ATAC_K562_Reddy_KS91,ASTARR_KS91,ATAC
chr1,16025,16338,chr1:16025-16338,3.830812,ASTARR Input TPM,Q3,-0.296097,scale(log2(TPM + 1),STARR_ATAC_K562_Reddy_KS91,ASTARR_KS91,ATAC


**Check: Count table**

In [8]:
### count rows per assay label within each region set
lst = lapply(lst_dat_region_score_atac_import, function(dat_score){
    as.data.frame(table(dat_score$Assay_Label, dnn = c("Assay")))
})

### stack the per-set counts, then put one column per region set
dat = bind_rows(lst, .id = "Region") %>%
    tidyr::spread(Region, Freq)
fun_display_table(dat)

Assay,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
ATAC,150041,246852


### Import STARR/MPRA CREs

In [9]:
### read the STARR/MPRA group-label annotation of every region set
### (the result is a list keyed by the names of VEC_TXT_FOLDER)
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### full path of the annotation table for this region set
    path_annot = file.path(
        FD_RES, "region_annotation", txt_folder, "summary",
        "region.annotation.fcc_starrmpra_junke.group.label.tsv"
    )

    ### read table
    read_tsv(path_annot, show_col_types = FALSE)
})

### assign
lst_dat_region_annot_fcc_starrmpra_group_import = lst

### show: dimensions of each table, then the first rows of the first table
res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 150047      9

$fcc_astarr_macs_input_union
[1] 246862      9



Chrom,ChromStart,ChromEnd,Region,Distance2TSS,TSS_Proximity,TSS,Direction_Assay,Group
chr1,10038,10405,chr1:10038-10405,1469,Proximal,0,Inactive,Proximal:Inactive
chr1,14282,14614,chr1:14282-14614,2409,Distal,0,Inactive,Distal:Inactive
chr1,16025,16338,chr1:16025-16338,1099,Proximal,0,Inactive,Proximal:Inactive


**Check: Count table**

In [10]:
### count rows per group label within each region set
lst = lapply(lst_dat_region_annot_fcc_starrmpra_group_import, function(dat_annot){
    as.data.frame(table(dat_annot$Group, dnn = c("Group")))
})

### stack the per-set counts, then put one column per region set
dat = bind_rows(lst, .id = "Region") %>%
    tidyr::spread(Region, Freq)
fun_display_table(dat)

Group,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
Distal:Active,10578,11362
Distal:Inactive,117528,208972
Distal:Repressive,1364,2407
Proximal:Active,6019,6073
Proximal:Inactive,14173,17587
Proximal:Repressive,385,461


## Arrange table

In [11]:
### init: join keys shared by both tables and the assay label(s) to keep
vec_txt_cname = c("Chrom", "ChromStart", "ChromEnd", "Region")
vec_txt_assay = c("ATAC")

### loop to merge the group annotation with the coverage score of each region set
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### score rows of the selected assay(s) and the regions they cover
    tbl_score = lst_dat_region_score_atac_import[[txt_folder]] %>%
        dplyr::filter(Assay_Label %in% vec_txt_assay)
    vec_region_scored = unique(tbl_score$Region)

    ### annotation rows restricted to the scored regions
    tbl_group = lst_dat_region_annot_fcc_starrmpra_group_import[[txt_folder]] %>%
        dplyr::filter(Region %in% vec_region_scored)

    ### join annotation with score
    ### NOTE(review): "many-to-many" turns off dplyr's duplicated-key check,
    ### so regions repeated in either table multiply rows silently -- confirm intended
    dplyr::inner_join(
        tbl_group,
        tbl_score,
        by = vec_txt_cname,
        relationship = "many-to-many"
    )
})

### assign
lst_dat_region_merge = lst

### show: dimensions of each merged table, then the first rows of the first table
res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 150046     17

$fcc_astarr_macs_input_union
[1] 246862     17



Chrom,ChromStart,ChromEnd,Region,Distance2TSS,TSS_Proximity,TSS,Direction_Assay,Group,Score,Score_Label,Score_Quantile,Zcore,Zcore_Label,Assay_Folder,Assay_Group,Assay_Label
chr1,10038,10405,chr1:10038-10405,1469,Proximal,0,Inactive,Proximal:Inactive,3.940038,ASTARR Input TPM,Q3,-0.26087,scale(log2(TPM + 1),STARR_ATAC_K562_Reddy_KS91,ASTARR_KS91,ATAC
chr1,14282,14614,chr1:14282-14614,2409,Distal,0,Inactive,Distal:Inactive,2.841707,ASTARR Input TPM,Q2,-0.6570534,scale(log2(TPM + 1),STARR_ATAC_K562_Reddy_KS91,ASTARR_KS91,ATAC
chr1,16025,16338,chr1:16025-16338,1099,Proximal,0,Inactive,Proximal:Inactive,3.830812,ASTARR Input TPM,Q3,-0.296097,scale(log2(TPM + 1),STARR_ATAC_K562_Reddy_KS91,ASTARR_KS91,ATAC


## Export results

In [12]:
### export the merged annotation + ATAC coverage table of every region set
### one file is written per folder listed in VEC_TXT_FOLDER
for (txt_folder in VEC_TXT_FOLDER){
    ### get tables
    dat_region_merge = lst_dat_region_merge[[txt_folder]]
    
    ### set file directory
    txt_fdiry = file.path(FD_RES, "analysis_fcc_comparison", txt_folder)
    txt_fname = "region.annotation.fcc_starrmpra_group.coverage.atac.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)
    
    ### make sure the output folder exists: write_tsv does not create missing
    ### directories, so the export would otherwise fail on a fresh results tree
    ### (showWarnings = FALSE keeps re-runs quiet when the folder is already there)
    dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)
    
    ### write table
    dat = dat_region_merge
    write_tsv(dat, txt_fpath)
}