**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



## Prepare

**Set global variable**

In [2]:
vec = c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
)
names(vec) = vec

VEC_TXT_FOLDER = vec
for(txt in vec){cat(txt, "\n")}

fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 


**View files: FCC region coverage**

In [3]:
txt_folder = VEC_TXT_FOLDER[1]
txt_fdiry  = file.path(FD_RES, "analysis_fcc_correlation", txt_folder)
txt_fname  = "region.coverage.score.*.tsv"
txt_fglob  = file.path(txt_fdiry, txt_fname)

vec = Sys.glob(txt_fglob)
for (txt in vec) {cat(txt, "\n")}

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/analysis_fcc_correlation/fcc_astarr_macs_input_overlap/region.coverage.score.concat.tsv 


**View files: STARR/MPRA CREs**

In [4]:
txt_fdiry = file.path(FD_RES, "region_annotation", "*", "summary")
txt_fname = "region.annotation.fcc_starrmpra_junke.group.tsv"
txt_fglob = file.path(txt_fdiry, txt_fname)

vec = Sys.glob(txt_fglob)
for(txt in vec){cat(txt, "\n")}

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_overlap/summary/region.annotation.fcc_starrmpra_junke.group.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_union/summary/region.annotation.fcc_starrmpra_junke.group.tsv 


## Import data

### Import FCC region coverage scores

In [5]:
### loop to import data
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### set file directory
    txt_fdiry  = file.path(FD_RES, "analysis_fcc_correlation", txt_folder)
    txt_fname = "region.coverage.score.concat.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### read table
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    return(dat)
})

### assign and show
lst_dat_region_score_fcc_import = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
head(dat, 3)

$fcc_astarr_macs_input_overlap
[1] 586004      8

$fcc_astarr_macs_input_union
[1] 889739      8



Chrom,ChromStart,ChromEnd,Region,Score,Score_Label,Assay_Group,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr1,10038,10405,chr1:10038-10405,-3.5573621,Scale(Log2FC),ASTARR_KS91,ASTARR
chr1,14282,14614,chr1:14282-14614,0.0361388,Scale(Log2FC),ASTARR_KS91,ASTARR
chr1,16025,16338,chr1:16025-16338,-2.1376311,Scale(Log2FC),ASTARR_KS91,ASTARR


**Check: Count table**

In [6]:
lst = lst_dat_region_score_fcc_import
lst = lapply(lst, function(dat){
    res = table(dat$Assay_Label, dnn=c("Assay"))
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Region")
dat = dat %>% tidyr::spread(Region, Freq)
fun_display_table(dat)

Assay,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
ASTARR,150040,246850
ATAC,150041,246852
CRISPRi-Growth,72743,80288
CRISPRi-HCRFF,925,1330
ENCODE-rE2G Benchmark,3035,3169
LMPRA,61478,68497
TMPRA,1148,1722
WSTARR,146594,241031


### Import STARR/MPRA CREs

In [7]:
### loop to import data
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### set file directory
    txt_fdiry = file.path(FD_RES, "region_annotation", txt_folder, "summary")
    txt_fname = "region.annotation.fcc_starrmpra_junke.group.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### read table
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    return(dat)
})

### assign and show
lst_dat_region_annot_fcc_starrmpra_group_import = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 18346     5

$fcc_astarr_macs_input_union
[1] 20303     5



Chrom,ChromStart,ChromEnd,Region,Group
chr1,778233,779389,chr1:778233-779389,Proximal:Active
chr1,958722,959968,chr1:958722-959968,Proximal:Active
chr1,960468,961615,chr1:960468-961615,Proximal:Active


**Check: Count table**

In [8]:
lst = lst_dat_region_annot_fcc_starrmpra_group_import
lst = lapply(lst, function(dat){
    res = table(dat$Group, dnn=c("Group"))
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Region")
dat = dat %>% tidyr::spread(Region, Freq)
fun_display_table(dat)

Group,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
Distal:Active,10578,11362
Distal:Repressive,1364,2407
Proximal:Active,6019,6073
Proximal:Repressive,385,461


### Import STARR/MPRA/CRISPR CREs

In [34]:
### loop to import data
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### set file directory
    txt_fdiry = file.path(FD_RES, "region_annotation", txt_folder, "summary")
    txt_fname = "region.annotation.fcc_starrmpracrispr.assayvote.filtered.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### read table
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    return(dat)
})

### assign and show
lst_dat_region_annot_fcc_starrmpracrispr_cres_import = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 33953     6

$fcc_astarr_macs_input_union
[1] 39788     6



Chrom,ChromStart,ChromEnd,Region,Num_Assay,Assays
chr1,605104,605675,chr1:605104-605675,2,"ASTARR,CRISPRi-Growth"
chr1,778233,779389,chr1:778233-779389,3,"ASTARR,LMPRA,WSTARR"
chr1,818708,819335,chr1:818708-819335,2,"ASTARR,WSTARR"


**Check: Count table**

In [8]:
lst = lst_dat_region_annot_fcc_starrmpra_group_import
lst = lapply(lst, function(dat){
    res = table(dat$Group, dnn=c("Group"))
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Region")
dat = dat %>% tidyr::spread(Region, Freq)
fun_display_table(dat)

Group,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
Distal:Active,10578,11362
Distal:Repressive,1364,2407
Proximal:Active,6019,6073
Proximal:Repressive,385,461


## Arrange table

In [39]:
### init: set assay and cnames
vec_txt_cname = c("Chrom", "ChromStart", "ChromEnd", "Region")

### loop to import data
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### get tables
    dat_region_annot_starrmpracrispr = lst_dat_region_annot_fcc_starrmpracrispr_cres_import[[txt_folder]]
    dat_region_annot_starrmpra       = lst_dat_region_annot_fcc_starrmpra_group_import[[txt_folder]]
    dat_region_score                 = lst_dat_region_score_fcc_import[[txt_folder]]

    ### join annotation with score (STARR/MPRA/CRISPR CREs)
    dat = dat_region_annot_starrmpracrispr
    dat = dat %>%
        dplyr::inner_join(
            dat_region_score, 
            by = vec_txt_cname,
            relationship = "many-to-many"
        )
    dat_region_merge_starrmpracrispr = dat
    
    ### join annotation with score (STARR/MPRA CREs)
    dat = dat_region_annot_starrmpra
    dat = dat %>%
        dplyr::inner_join(
            dat_region_score, 
            by = vec_txt_cname,
            relationship = "many-to-many"
        )
    dat_region_merge_starrmpra = dat
    
    ### full atac region
    dat = dat_region_score
    dat = dat %>% dplyr::mutate(Group = "ATAC")
    dat_region_merge_atac = dat
    
    ### full STARR/MPRA/CRISPR CREs
    dat = dat_region_merge_starrmpracrispr
    dat = dat %>% dplyr::mutate(Group = "STARRMPRACRISPR CREs")
    dat_region_group_pool_starrmpracrispr = dat

    ### full STARR/MPRA CREs
    dat = dat_region_merge_starrmpra
    dat = dat %>% dplyr::mutate(Group = "STARRMPRA CREs")
    dat_region_group_pool_starrmpra = dat
    
    ### split STARR/MPRA CREs
    dat = dat_region_merge_starrmpra
    dat_region_group_split_starrmpra = dat

    ### combine
    lst = list(
        dat_region_merge_atac,
        dat_region_group_pool_starrmpracrispr,
        dat_region_group_pool_starrmpra,
        dat_region_group_split_starrmpra
    )
    lst = lapply(lst, function(dat){
        dat = dat %>% 
            dplyr::select(Chrom:Region, Group, Score, Assay_Label) %>%
            dplyr::distinct()
        return(dat)
    })
    dat = bind_rows(lst)
    return(dat)
})

### assign and show
lst_dat_region_merge_fcc = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 917042      7

$fcc_astarr_macs_input_union
[1] 1257048       7



Chrom,ChromStart,ChromEnd,Region,Group,Score,Assay_Label
chr1,10038,10405,chr1:10038-10405,ATAC,-3.5573621,ASTARR
chr1,14282,14614,chr1:14282-14614,ATAC,0.0361388,ASTARR
chr1,16025,16338,chr1:16025-16338,ATAC,-2.1376311,ASTARR


**Check: Count table**

In [40]:
lst = lst_dat_region_merge_fcc
lst = lapply(lst, function(dat){
    dat = dat %>% dplyr::select(Region, Group) %>% dplyr::distinct()
    res = table(dat$Group, dnn=c("Group"))
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Region")
dat = dat %>% tidyr::spread(Region, Freq)
fun_display_table(dat)

Group,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
ATAC,150041,246852
Distal:Active,10578,11362
Distal:Repressive,1364,2407
Proximal:Active,6019,6073
Proximal:Repressive,385,461
STARRMPRA CREs,18341,20293
STARRMPRACRISPR CREs,33953,39788


## Calculate Spearman correlations across assays with different region sets

**Set assays**

In [41]:
lst = lst_dat_region_merge_fcc
dat = lst[[1]]
unique(dat$Assay_Label)

In [42]:
vec = c("ATAC", "ASTARR", "LMPRA", "TMPRA", "WSTARR", "CRISPRi-Growth", "CRISPRi-HCRFF", "ENCODE-rE2G Benchmark")
names(vec) = vec

vec_txt_assay = vec

**Get assay pairs**

In [43]:
vec = vec_txt_assay
dat = combn(vec, 2)
dat = data.frame(dat)

dat_assay_comb2 = dat
fun_display_table(dat)

X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28
ATAC,ATAC,ATAC,ATAC,ATAC,ATAC,ATAC,ASTARR,ASTARR,ASTARR,ASTARR,ASTARR,ASTARR,LMPRA,LMPRA,LMPRA,LMPRA,LMPRA,TMPRA,TMPRA,TMPRA,TMPRA,WSTARR,WSTARR,WSTARR,CRISPRi-Growth,CRISPRi-Growth,CRISPRi-HCRFF
ASTARR,LMPRA,TMPRA,WSTARR,CRISPRi-Growth,CRISPRi-HCRFF,ENCODE-rE2G Benchmark,LMPRA,TMPRA,WSTARR,CRISPRi-Growth,CRISPRi-HCRFF,ENCODE-rE2G Benchmark,TMPRA,WSTARR,CRISPRi-Growth,CRISPRi-HCRFF,ENCODE-rE2G Benchmark,WSTARR,CRISPRi-Growth,CRISPRi-HCRFF,ENCODE-rE2G Benchmark,CRISPRi-Growth,CRISPRi-HCRFF,ENCODE-rE2G Benchmark,CRISPRi-HCRFF,ENCODE-rE2G Benchmark,ENCODE-rE2G Benchmark


**Loop through each combination of assays**

In [44]:
lst = lst_dat_region_merge_fcc
lst = lapply(lst, function(dat_region_merge){
    
    ### Loop through each combination of assays
    lst = lapply(dat_assay_comb2, function(vec_txt_assay_pair){
        
        ### get assay pair
        txt_assay1 = vec_txt_assay_pair[1]
        txt_assay2 = vec_txt_assay_pair[2]
        
        ### filter by assay
        dat = dat_region_merge
        dat = dat %>% 
            dplyr::filter(Assay_Label %in% vec_txt_assay_pair) %>%
            dplyr::select(Region, Group, Score, Assay_Label) %>% 
            tidyr::spread(Assay_Label, Score) %>%
            na.omit
        
        ### split by region group
        lst = split(dat, dat$Group)
        lst = lapply(lst, function(dat){
            ### get scores foreach assay
            vec1 = dat[[txt_assay1]]
            vec2 = dat[[txt_assay2]]

            ### calculate correlation and summarize
            num  = cor(vec1, vec2, method = "spearman")
            res  = data.frame(
                "Assay1" = txt_assay1,
                "Assay2" = txt_assay2,
                "scor"   = num,
                "size"   = length(vec1)
            )
            return(res)
        })

        ### concat across region groups
        dat = bind_rows(lst, .id = "Group")
        return(dat)
    })

    ### concat across assay pairs
    dat = bind_rows(lst)
    return(dat)
})

### assign and show
lst_dat_assay_scor = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 187   5

$fcc_astarr_macs_input_union
[1] 187   5



Group,Assay1,Assay2,scor,size
ATAC,ATAC,ASTARR,0.1883749,150040
Distal:Active,ATAC,ASTARR,0.0698688,10578
Distal:Repressive,ATAC,ASTARR,0.0424431,1364


**Explore results**

In [46]:
### get table
idx = "fcc_astarr_macs_input_union"
lst = lst_dat_assay_scor
dat = lst[[idx]]

### set region order
vec_txt_group = c(
    "ATAC", 
    "STARRMPRACRISPR CREs", "STARRMPRA CREs", 
    "Proximal:Active",      "Distal:Active", 
    "Proximal:Repressive",  "Distal:Repressive")

### show results
dat = dat %>% 
    dplyr::mutate(
        Group  = factor(Group,  levels = vec_txt_group),
        Assay1 = factor(Assay1, levels = vec_txt_assay),
        Assay2 = factor(Assay2, levels = vec_txt_assay)
    ) %>%
    dplyr::select(Assay1, Assay2, Group, scor) %>%
    tidyr::spread(Group, scor) %>%
    dplyr::arrange(Assay1, Assay2)

fun_display_table(dat)

Assay1,Assay2,ATAC,STARRMPRACRISPR CREs,STARRMPRA CREs,Proximal:Active,Distal:Active,Proximal:Repressive,Distal:Repressive
ATAC,ASTARR,0.1393644,0.2031285,0.1227092,0.0225334,0.0671609,-0.1559696,0.0627735
ATAC,LMPRA,0.28433,0.2190861,0.2837781,0.1400627,0.0096247,0.1177146,0.0956436
ATAC,TMPRA,0.4500697,0.3989227,0.3817232,0.3933659,0.1530694,0.9,0.0692308
ATAC,WSTARR,0.2963266,0.2529903,0.2730007,0.2626693,-0.0773411,0.2964554,0.1868186
ATAC,CRISPRi-Growth,0.0878955,0.1139104,0.16547,0.0353122,0.0189838,-0.0060901,0.0502114
ATAC,CRISPRi-HCRFF,0.0659424,0.0685745,0.0962162,0.0715439,0.1686488,,0.1142857
ATAC,ENCODE-rE2G Benchmark,-0.1765386,-0.1909857,-0.1947479,0.1554656,-0.2021628,1.0,-0.0653236
ASTARR,LMPRA,0.2289869,0.1079349,-0.0076172,-0.0187693,-0.0976811,-0.0151637,0.3381758
ASTARR,TMPRA,0.2644954,0.2405247,0.206553,0.3215295,0.0820207,0.1,0.1868668
ASTARR,WSTARR,0.3026786,0.4190658,0.5097476,0.4104783,0.3295641,0.2022053,0.3374302


## Export results

In [47]:
for (txt_folder in VEC_TXT_FOLDER){
    ### get tables
    dat_result = lst_dat_assay_scor[[txt_folder]]
    
    ### set file directory
    txt_fdiry  = file.path(FD_RES, "analysis_fcc_comparison", txt_folder)
    txt_fname = "result.comparison.fcc_starrmpra_group.correlation.fcc.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)
    
    ### write table
    dat = dat_result
    write_tsv(dat, txt_fpath)
}