**Set environment**

In [1]:
### source the project configuration script; messages and warnings raised
### while sourcing are silenced
### NOTE(review): presumably this script defines the FD_* paths and helpers
### such as show_env() / fun_display_table() used below -- confirm
suppressMessages(
    suppressWarnings(
        source("../run_config_project_sing.R")
    )
)

### print the environment summary
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



## Prepare

**Set global variables**

In [2]:
### region folders to process; each element is named after itself so that
### lapply() over this vector returns a list keyed by folder name
vec = c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
)
vec = setNames(vec, vec)

### assign and show (one folder per line)
VEC_TXT_FOLDER = vec
for (txt in vec) {
    cat(txt, "\n")
}

fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 


In [3]:
### file name of the region annotation table (TSV) looked up in each
### region folder's "summary" directory
TXT_FNAME_ANNOT = "region.annotation.fcc_starrmpra_junke.tsv"

**View files**

In [4]:
### glob pattern: <FD_RES>/region_annotation/<any folder>/summary/<annotation file>
txt_fname = TXT_FNAME_ANNOT
txt_fdiry = file.path(FD_RES, "region_annotation", "*", "summary")
txt_fglob = file.path(txt_fdiry, txt_fname)

### list every file matching the pattern, one path per line
vec = Sys.glob(txt_fglob)
for (txt in vec) {
    cat(txt, "\n")
}

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_overlap/summary/region.annotation.fcc_starrmpra_junke.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_annotation/fcc_astarr_macs_input_union/summary/region.annotation.fcc_starrmpra_junke.tsv 


## Import data

**Import region annotations**

In [5]:
### read the annotation table of every region folder into a list
### (lapply keeps the names of VEC_TXT_FOLDER, so the list is keyed by folder)
lst = lapply(VEC_TXT_FOLDER, function(folder){
    ### <FD_RES>/region_annotation/<folder>/summary/<annotation file>
    fpath = file.path(FD_RES, "region_annotation", folder, "summary", TXT_FNAME_ANNOT)

    ### read table
    read_tsv(fpath, show_col_types = FALSE)
})

### assign and show
lst_dat_region_annot_import = lst

### dimensions of each imported table
res = lapply(lst, dim)
print(res)

### first rows of the first table
dat = lst[[1]]
fun_display_table(head(dat, n = 3))

$fcc_astarr_macs_input_overlap
[1] 151862     12

$fcc_astarr_macs_input_union
[1] 196065     12



Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Group,Label,Direction_Assay,Direction_Label,Region_Annot,Region_Count
chr1,10038,10405,chr1:10038-10405,fcc_astarr_macs_input_overlap,fcc_starrmpra_junke_astarr,ASTARR,ASTARR_R,Repressive,R,chr1:10010-10430,1
chr1,16025,16338,chr1:16025-16338,fcc_astarr_macs_input_overlap,fcc_starrmpra_junke_astarr,ASTARR,ASTARR_R,Repressive,R,chr1:16220-16340,1
chr1,17288,17689,chr1:17288-17689,fcc_astarr_macs_input_overlap,fcc_starrmpra_junke_astarr,ASTARR,ASTARR_R,Repressive,R,chr1:17230-17440;chr1:17610-17720,2


## Summarize the direction of each assay

In [6]:
### collapse the annotation rows so that each region has one row per assay (Group)
lst = lapply(lst_dat_region_annot_import, function(dat_annot){
    ### group rows by region and assay
    dat_group = dplyr::group_by(dat_annot, Chrom, ChromStart, ChromEnd, Region, Group)

    ### merge the directions found within each group:
    ###   Direction_Assay: sorted distinct values joined by ":"
    ###   Direction_Label: sorted distinct values concatenated
    ###   Count:           number of rows collapsed into the group
    dat_merge = dplyr::summarize(
        dat_group,
        Direction_Assay = paste(sort(unique(Direction_Assay)), collapse = ":"),
        Direction_Label = paste(sort(unique(Direction_Label)), collapse = ""),
        Count           = n(),
        .groups         = "drop"
    )

    ### rebuild the label as <Group>_<Direction_Label>
    dat_merge = dplyr::mutate(dat_merge, Label = paste(Group, Direction_Label, sep = "_"))

    ### arrange columns
    dplyr::select(
        dat_merge,
        Chrom, ChromStart, ChromEnd, Region, Group, Label,
        Direction_Assay,
        Direction_Label,
        Count
    )
})

### assign and show
lst_dat_region_annot_result = lst

### dimensions of each summarized table
res = lapply(lst, dim)
print(res)

### first rows of the first table
dat = lst[[1]]
fun_display_table(head(dat, n = 3))

$fcc_astarr_macs_input_overlap
[1] 141847      9

$fcc_astarr_macs_input_union
[1] 183250      9



Chrom,ChromStart,ChromEnd,Region,Group,Label,Direction_Assay,Direction_Label,Count
chr1,10038,10405,chr1:10038-10405,ASTARR,ASTARR_R,Repressive,R,1
chr1,16025,16338,chr1:16025-16338,ASTARR,ASTARR_R,Repressive,R,1
chr1,17288,17689,chr1:17288-17689,ASTARR,ASTARR_R,Repressive,R,1


**Check: AR should have count == 2, while A and R should have count == 1**

In [7]:
### distinct (Direction_Assay, Count) combinations observed in each table
lst = lapply(lst_dat_region_annot_result, function(dat_result){
    dat_pair = dplyr::select(dat_result, Direction_Assay, Count)
    dplyr::distinct(dat_pair)
})

### stack the tables (Region = list name) and show one column per Direction_Assay
dat = bind_rows(lst, .id = "Region")
dat = tidyr::spread(dat, Direction_Assay, Count)
dat

Region,Active,Active:Repressive,Repressive
<chr>,<int>,<int>,<int>
fcc_astarr_macs_input_overlap,1,2,1
fcc_astarr_macs_input_union,1,2,1


In [8]:
### distinct (Label, Count) combinations observed in each table
lst = lapply(lst_dat_region_annot_result, function(dat_result){
    dat_pair = dplyr::select(dat_result, Label, Count)
    dplyr::distinct(dat_pair)
})

### stack the tables (Region = list name) and show one column per Label
dat = bind_rows(lst, .id = "Region")
dat = tidyr::spread(dat, Label, Count)
dat

Region,ASTARR_A,ASTARR_AR,ASTARR_R,LMPRA_A,LMPRA_AR,LMPRA_R,TMPRA_A,TMPRA_AR,TMPRA_R,WSTARR_A,WSTARR_AR,WSTARR_R
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
fcc_astarr_macs_input_overlap,1,2,1,1,2,1,1,2,1,1,2,1
fcc_astarr_macs_input_union,1,2,1,1,2,1,1,2,1,1,2,1


**Explore: count of assay direction**

In [9]:
### cross-tabulate assay (Group) against merged direction for each table
lst = lapply(lst_dat_region_annot_result, function(dat_result){
    tab = table(
        dat_result$Group,
        dat_result$Direction_Assay,
        dnn = c("Group", "Direction")
    )
    ### long format: Group, Direction, Freq
    as.data.frame(tab)
})

### stack the tables (Region = list name) and show one column per direction
dat = bind_rows(lst, .id = "Region")
dat = tidyr::spread(dat, Direction, Freq)
fun_display_table(dat)

Region,Group,Active,Active:Repressive,Repressive
fcc_astarr_macs_input_overlap,ASTARR,17042,9119,64604
fcc_astarr_macs_input_overlap,LMPRA,20660,32,213
fcc_astarr_macs_input_overlap,TMPRA,835,12,1
fcc_astarr_macs_input_overlap,WSTARR,26140,852,2337
fcc_astarr_macs_input_union,ASTARR,18245,11381,92196
fcc_astarr_macs_input_union,LMPRA,21985,37,252
fcc_astarr_macs_input_union,TMPRA,1082,21,3
fcc_astarr_macs_input_union,WSTARR,31974,1376,4698


**Check: count by direction label**

In [10]:
### cross-tabulate assay (Group) against merged direction label for each table
lst = lapply(lst_dat_region_annot_result, function(dat_result){
    tab = table(
        dat_result$Group,
        dat_result$Direction_Label,
        dnn = c("Group", "Direction")
    )
    ### long format: Group, Direction, Freq
    as.data.frame(tab)
})

### stack the tables (Region = list name) and show one column per direction label
dat = bind_rows(lst, .id = "Region")
dat = tidyr::spread(dat, Direction, Freq)
fun_display_table(dat)

Region,Group,A,AR,R
fcc_astarr_macs_input_overlap,ASTARR,17042,9119,64604
fcc_astarr_macs_input_overlap,LMPRA,20660,32,213
fcc_astarr_macs_input_overlap,TMPRA,835,12,1
fcc_astarr_macs_input_overlap,WSTARR,26140,852,2337
fcc_astarr_macs_input_union,ASTARR,18245,11381,92196
fcc_astarr_macs_input_union,LMPRA,21985,37,252
fcc_astarr_macs_input_union,TMPRA,1082,21,3
fcc_astarr_macs_input_union,WSTARR,31974,1376,4698


## Convert to matrix

In [11]:
### convert to wide format
lst = lst_dat_region_annot_result
lst = lapply(lst, function(dat){
    dat = dat %>% 
        dplyr::select(Chrom, ChromStart, ChromEnd, Region, Label) %>%
        dplyr::mutate(Value = 1) %>%
        tidyr::spread(Label, Value) %>%
        replace(is.na(.), 0) %>%
        dplyr::arrange(Chrom, ChromStart, ChromEnd)
    return(dat)    
})

### assign and show
lst_dat_region_annot_matrix = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 99749    16

$fcc_astarr_macs_input_union
[1] 135016     16



Chrom,ChromStart,ChromEnd,Region,ASTARR_A,ASTARR_AR,ASTARR_R,LMPRA_A,LMPRA_AR,LMPRA_R,TMPRA_A,TMPRA_AR,TMPRA_R,WSTARR_A,WSTARR_AR,WSTARR_R
chr1,10038,10405,chr1:10038-10405,0,0,1,0,0,0,0,0,0,0,0,0
chr1,16025,16338,chr1:16025-16338,0,0,1,0,0,0,0,0,0,0,0,0
chr1,17288,17689,chr1:17288-17689,0,0,1,0,0,0,0,0,0,0,0,0


## Save results

In [12]:
for (txt_folder in VEC_TXT_FOLDER){

    ### get tables
    dat_region_annot_result = lst_dat_region_annot_result[[txt_folder]]
    dat_region_annot_matrix = lst_dat_region_annot_matrix[[txt_folder]]

    ### set file directory: <FD_RES>/region_annotation/<folder>/summary
    txt_fdiry = file.path(FD_RES, "region_annotation", txt_folder, "summary")

    ### output file name -> table; written in this order
    ### (long format first, then wide format)
    lst_output = list(
        "region.annotation.fcc_starrmpra_junke.merge_direction.tsv" = dat_region_annot_result,
        "matrix.annotation.fcc_starrmpra_junke.merge_direction.tsv" = dat_region_annot_matrix
    )

    ### sort each table by genomic coordinates and write it as TSV
    for (txt_fname in names(lst_output)){
        txt_fpath = file.path(txt_fdiry, txt_fname)

        dat = lst_output[[txt_fname]]
        dat = dplyr::arrange(dat, Chrom, ChromStart, ChromEnd)
        write_tsv(dat, txt_fpath)
    }
}

## Sanity Check

**Import tables**

In [13]:
### read the matrix without merged directions from every region folder;
### it is used below as the reference for the sanity check
lst = lapply(VEC_TXT_FOLDER, function(folder){
    ### <FD_RES>/region_annotation/<folder>/summary/<matrix file>
    fname = "matrix.annotation.fcc_starrmpra_junke.tsv"
    fpath = file.path(FD_RES, "region_annotation", folder, "summary", fname)

    ### read table
    read_tsv(fpath, show_col_types = FALSE)
})

### assign and show
lst_dat_region_annot_check = lst

### dimensions of each reference matrix
res = lapply(lst, dim)
print(res)

$fcc_astarr_macs_input_overlap
[1] 99749    12

$fcc_astarr_macs_input_union
[1] 135016     12



**Show tables**

In [14]:
### merged-direction matrix built above; only the first folder is inspected
dat1 = lst_dat_region_annot_matrix[[1]]

### show dimensions and the first rows
print(dim(dat1))
fun_display_table(head(dat1, n = 3))

[1] 99749    16


Chrom,ChromStart,ChromEnd,Region,ASTARR_A,ASTARR_AR,ASTARR_R,LMPRA_A,LMPRA_AR,LMPRA_R,TMPRA_A,TMPRA_AR,TMPRA_R,WSTARR_A,WSTARR_AR,WSTARR_R
chr1,10038,10405,chr1:10038-10405,0,0,1,0,0,0,0,0,0,0,0,0
chr1,16025,16338,chr1:16025-16338,0,0,1,0,0,0,0,0,0,0,0,0
chr1,17288,17689,chr1:17288-17689,0,0,1,0,0,0,0,0,0,0,0,0


In [15]:
### reference matrix imported for the check; only the first folder is inspected
dat2 = lst_dat_region_annot_check[[1]]

### show dimensions and the first rows
print(dim(dat2))
fun_display_table(head(dat2, n = 3))

[1] 99749    12


Chrom,ChromStart,ChromEnd,Region,ASTARR_A,ASTARR_R,LMPRA_A,LMPRA_R,TMPRA_A,TMPRA_R,WSTARR_A,WSTARR_R
chr1,10038,10405,chr1:10038-10405,0,1,0,0,0,0,0,0
chr1,16025,16338,chr1:16025-16338,0,1,0,0,0,0,0,0
chr1,17288,17689,chr1:17288-17689,0,1,0,0,0,0,0,0


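**Check: both tables cover the same regions, and each `*_AR` column flags exactly the regions that are both `*_A` and `*_R` in the reference matrix**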

In [16]:
### both tables must contain exactly the same regions
### (compared after sorting, so row order is ignored here)
vec1 = sort(dat1$Region)
vec2 = sort(dat2$Region)

### guard on length first: `==` recycles the shorter vector, so a bare
### all(vec1 == vec2) could compare vectors of different lengths
print(length(vec1) == length(vec2) && all(vec1 == vec2))

[1] TRUE


In [17]:
### ASTARR_AR in the merged matrix should be 1 exactly where the reference
### matrix has both ASTARR_A == 1 and ASTARR_R == 1

### align the rows of dat2 to dat1 by Region so that the element-wise
### comparison does not rely on both tables sharing the same row order
idx  = match(dat1$Region, dat2$Region)

vec1 = dat1$ASTARR_AR
tmp  = dplyr::select(dat2[idx, ], ASTARR_A, ASTARR_R)
vec2 = rowSums(tmp)
vec2 = as.integer(vec2 == 2)
print(all(vec1 == vec2))

[1] TRUE


In [18]:
### WSTARR_AR in the merged matrix should be 1 exactly where the reference
### matrix has both WSTARR_A == 1 and WSTARR_R == 1

### align the rows of dat2 to dat1 by Region so that the element-wise
### comparison does not rely on both tables sharing the same row order
idx  = match(dat1$Region, dat2$Region)

vec1 = dat1$WSTARR_AR
tmp  = dplyr::select(dat2[idx, ], WSTARR_A, WSTARR_R)
vec2 = rowSums(tmp)
vec2 = as.integer(vec2 == 2)
print(all(vec1 == vec2))

[1] TRUE


In [19]:
### LMPRA_AR in the merged matrix should be 1 exactly where the reference
### matrix has both LMPRA_A == 1 and LMPRA_R == 1

### align the rows of dat2 to dat1 by Region so that the element-wise
### comparison does not rely on both tables sharing the same row order
idx  = match(dat1$Region, dat2$Region)

vec1 = dat1$LMPRA_AR
tmp  = dplyr::select(dat2[idx, ], LMPRA_A, LMPRA_R)
vec2 = rowSums(tmp)
vec2 = as.integer(vec2 == 2)
print(all(vec1 == vec2))

[1] TRUE


In [20]:
### TMPRA_AR in the merged matrix should be 1 exactly where the reference
### matrix has both TMPRA_A == 1 and TMPRA_R == 1

### align the rows of dat2 to dat1 by Region so that the element-wise
### comparison does not rely on both tables sharing the same row order
idx  = match(dat1$Region, dat2$Region)

vec1 = dat1$TMPRA_AR
tmp  = dplyr::select(dat2[idx, ], TMPRA_A, TMPRA_R)
vec2 = rowSums(tmp)
vec2 = as.integer(vec2 == 2)
print(all(vec1 == vec2))

[1] TRUE
