**Set environment**

In [1]:
# Source the project configuration script; any messages and warnings raised
# while sourcing are silenced.
# NOTE(review): presumably this script defines the FD_* path variables, show_env(),
# fun_display_table() and attaches the tidyverse packages used below -- confirm.
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
# Print the directory layout of the current run environment.
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_CombEffect_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_CombEffect_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_CombEffect_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_CombEffect_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_CombEffect_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_CombEffect_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_CombEffect_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_CombEffect_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_CombEffect_ENCODE_FCC/references 



## Helper functions

**Define functions**

In [27]:
# Labels searched for inside fragment-count file names:
#   GROUPS  -- the experimental group
#   SAMPLES -- the replicate-level label "<Group>.rep<k>" with k = 1..6,
#              ordered as all Input replicates followed by all Output replicates
GROUPS  = c("Input", "Output")
SAMPLES = paste0(rep(GROUPS, each = 6), ".rep", 1:6)

#' Return the candidate labels that occur in a string.
#'
#' @param string   Character string to search (e.g. a file name).
#' @param patterns Character vector of candidate labels.
#' @return The elements of `patterns` found in `string`; this can be zero,
#'   one, or several elements depending on how many labels match.
get_info = function(string, patterns){
    # Match the labels literally: without fixed() each label is a regular
    # expression, so the "." in "Input.rep1" would match any character.
    idx = str_detect(string = string, pattern = stringr::fixed(patterns))
    # NOTE: this is still a substring test -- a label such as "Input.rep1"
    # is also found inside "Input.rep10".
    return(patterns[idx])
}

# Find the group label (an element of GROUPS) contained in each element of
# `strings` by applying get_info() element-wise.
# The result is whatever sapply() simplifies to: a character vector when every
# element yields exactly one label, otherwise a list.
get_group  = function(strings){
    sapply(strings, get_info, patterns = GROUPS)
}

# Find the replicate-level label (an element of SAMPLES) contained in each
# element of `strings` by applying get_info() element-wise.
# The result is whatever sapply() simplifies to: a character vector when every
# element yields exactly one label, otherwise a list.
get_sample = function(strings){
    sapply(strings, get_info, patterns = SAMPLES)
}

**Test functions**

In [24]:
# Check get_info() on one representative file name against the sample labels.
txt = "ASTARRseq_K562_KS91.hg38.Input.rep1.WGS.unstranded.bed.gz"
get_info(string = txt, patterns = SAMPLES)

In [26]:
# Check get_group() on one representative file name.
txt = "ASTARRseq_K562_KS91.hg38.Input.rep1.WGS.unstranded.bed.gz"
get_group(strings = txt)

## Metadata for ASTARR fragment counts

Get the file names for fragment counts

In [35]:
# Identifiers of the ASTARR assay; TXT_ASSAY and TXT_FOLDER also select the
# results sub-directory that is searched below.
TXT_ASSAY  = "STARR_ATAC_K562_Reddy_KS91"
TXT_PREFIX = "ASTARRseq_K562_KS91"
TXT_FOLDER = "fragment_counts"

# Glob pattern for the WGS fragment-count files inside the assay folder
txt_fname  = "*WGS*bed.gz"
txt_fdiry  = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER)
txt_fglob  = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER, txt_fname)

# Expand the glob and keep both the full paths and the bare file names
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

# Print the matched file names, one per line
for (txt in vec_txt_fname){
    cat(txt, "\n")
}

ASTARRseq_K562_KS91.hg38.Input.rep1.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Input.rep2.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Input.rep3.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Input.rep4.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Input.rep5.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Input.rep6.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Output.rep1.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Output.rep2.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Output.rep3.WGS.unstranded.bed.gz 
ASTARRseq_K562_KS91.hg38.Output.rep4.WGS.unstranded.bed.gz 


Create metadata for the files

In [36]:
# Build the metadata table: one row per fragment-count file. Assay and Prefix
# are the same for every row; Group and Sample are parsed from the file name.
dat = data.frame(
    FName  = vec_txt_fname,
    Assay  = TXT_ASSAY,
    Prefix = TXT_PREFIX
) %>%
    dplyr::mutate(
        Group  = get_group(FName),
        Sample = get_sample(FName)
    )

# Keep the table under a section-specific name and show it
dat_metadata_astarr = dat
fun_display_table(dat)

FName,Assay,Prefix,Group,Sample
ASTARRseq_K562_KS91.hg38.Input.rep1.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep1
ASTARRseq_K562_KS91.hg38.Input.rep2.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep2
ASTARRseq_K562_KS91.hg38.Input.rep3.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep3
ASTARRseq_K562_KS91.hg38.Input.rep4.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep4
ASTARRseq_K562_KS91.hg38.Input.rep5.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep5
ASTARRseq_K562_KS91.hg38.Input.rep6.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep6
ASTARRseq_K562_KS91.hg38.Output.rep1.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep1
ASTARRseq_K562_KS91.hg38.Output.rep2.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep2
ASTARRseq_K562_KS91.hg38.Output.rep3.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep3
ASTARRseq_K562_KS91.hg38.Output.rep4.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep4


Save results

In [40]:
# Write the ASTARR metadata table to <assay folder>/summary/metadata.tsv.
# TXT_ASSAY and TXT_FOLDER are taken from the file-listing cell of this section.
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER, "summary")
txt_fname = "metadata.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

# Create the output directory (and any missing parents) without going through
# a shell: a pasted "mkdir -p <path>" command breaks on paths that contain
# spaces or shell metacharacters. An already existing directory is not an error.
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)
# Fail here with a clear message rather than later inside write_tsv()
stopifnot(dir.exists(txt_fdiry))

dat = dat_metadata_astarr
write_tsv(dat, txt_fpath)

## Metadata for WSTARR fragment counts

Get the file names for fragment counts

In [41]:
# Identifiers of the WSTARR assay; TXT_ASSAY and TXT_FOLDER also select the
# results sub-directory that is searched below.
TXT_ASSAY  = "STARR_WHG_K562_Reddy_A001"
TXT_PREFIX = "WSTARRseq_K562_A001"
TXT_FOLDER = "fragment_counts"

# Glob pattern for the WGS fragment-count files inside the assay folder
txt_fname  = "*WGS*bed.gz"
txt_fdiry  = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER)
txt_fglob  = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER, txt_fname)

# Expand the glob and keep both the full paths and the bare file names
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

# Print the matched file names, one per line
for (txt in vec_txt_fname){
    cat(txt, "\n")
}

WSTARRseq_K562_A001.hg38.Input.rep1.WGS.unstranded.bed.gz 
WSTARRseq_K562_A001.hg38.Input.rep2.WGS.unstranded.bed.gz 
WSTARRseq_K562_A001.hg38.Input.rep3.WGS.unstranded.bed.gz 
WSTARRseq_K562_A001.hg38.Input.rep4.WGS.unstranded.bed.gz 
WSTARRseq_K562_A001.hg38.Output.rep1.WGS.unstranded.bed.gz 
WSTARRseq_K562_A001.hg38.Output.rep2.WGS.unstranded.bed.gz 
WSTARRseq_K562_A001.hg38.Output.rep3.WGS.unstranded.bed.gz 


Create metadata for the files

In [42]:
# Build the WSTARR metadata table: one row per fragment-count file. Assay and
# Prefix are the same for every row; Group and Sample are parsed from the
# file name.
dat = data.frame(
    FName  = vec_txt_fname,
    Assay  = TXT_ASSAY,
    Prefix = TXT_PREFIX
)
dat = dat %>%
    dplyr::mutate(
        Group  = get_group(FName),
        Sample = get_sample(FName)
    )

# Store the table under a name that matches this section.
dat_metadata_wstarr = dat
# Backward-compatible alias: the "Save results" cell of this section reads
# dat_metadata_astarr, so the old (misleading) name is still assigned.
# TODO: switch that cell to dat_metadata_wstarr and drop this line, so the
# ASTARR table from the previous section is no longer overwritten.
dat_metadata_astarr = dat
fun_display_table(dat)

FName,Assay,Prefix,Group,Sample
WSTARRseq_K562_A001.hg38.Input.rep1.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep1
WSTARRseq_K562_A001.hg38.Input.rep2.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep2
WSTARRseq_K562_A001.hg38.Input.rep3.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep3
WSTARRseq_K562_A001.hg38.Input.rep4.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep4
WSTARRseq_K562_A001.hg38.Output.rep1.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Output,Output.rep1
WSTARRseq_K562_A001.hg38.Output.rep2.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Output,Output.rep2
WSTARRseq_K562_A001.hg38.Output.rep3.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Output,Output.rep3


Save results

In [43]:
# Write the WSTARR metadata table to <assay folder>/summary/metadata.tsv.
# TXT_ASSAY and TXT_FOLDER are taken from the file-listing cell of this section.
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER, "summary")
txt_fname = "metadata.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

# Create the output directory (and any missing parents) without going through
# a shell: a pasted "mkdir -p <path>" command breaks on paths that contain
# spaces or shell metacharacters. An already existing directory is not an error.
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)
# Fail here with a clear message rather than later inside write_tsv()
stopifnot(dir.exists(txt_fdiry))

# NOTE: despite its name, dat_metadata_astarr holds the WSTARR table at this
# point -- the "Create metadata" cell of this section reassigns it. This cell
# must therefore be run right after that one.
dat = dat_metadata_astarr
write_tsv(dat, txt_fpath)