**Set environment**

In [1]:
# Source the project configuration; any messages and warnings emitted while
# sourcing are muted. Then print the resolved directory layout.
suppressWarnings(suppressMessages(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



In [2]:
# Assay folder and sub-folder names; later cells combine them as
# FD_RES/assay_fcc/<TXT_ASSAY>/<TXT_FOLDER> to locate the fragment-count files.
TXT_ASSAY  = "MPRA_Tiling_K562_Tewhey_Hannah"
TXT_FOLDER = "fragment_counts"

## Helper functions

**Define functions**

In [3]:
# Group labels searched for inside file names
GROUPS  = c("Input", "Output", "Log2FC")

# Sample labels searched for inside file names: six replicates per library,
# plus one generic "mean" label (it matches any file name containing "mean",
# whatever its group)
SAMPLES = c(
    sprintf("Input.rep%d",  1:6),
    sprintf("Output.rep%d", 1:6),
    "mean"
)

# Return the candidate labels that occur in a string.
#   string   : a single character string (e.g. a file name)
#   patterns : character vector of candidate labels
# Returns the elements of `patterns` found in `string`; the result may be
# empty (no label found) or hold several labels (more than one found).
get_info = function(string, patterns){
    # Match the labels literally: as regular expressions, the "." in a label
    # such as "Input.rep1" would match any character (e.g. "Input_rep1").
    idx = str_detect(string = string, pattern = stringr::fixed(patterns))
    return(patterns[idx])
}

# Look up, for every element of `strings`, which GROUPS label(s) it contains.
# The result comes straight from sapply(), so it carries the inputs as names.
get_group  = function(strings){
    find_group = function(txt) get_info(txt, GROUPS)
    sapply(strings, find_group)
}

# Look up, for every element of `strings`, which SAMPLES label(s) it contains.
# The result comes straight from sapply(), so it carries the inputs as names.
get_sample = function(strings){
    find_sample = function(txt) get_info(txt, SAMPLES)
    sapply(strings, find_sample)
}

# Look up, for every element of `strings`, whether it contains "raw" or "norm".
# The result comes straight from sapply(), so it carries the inputs as names.
get_type = function(strings){
    type_labels = c("raw", "norm")
    sapply(strings, function(txt) get_info(txt, type_labels))
}

# Look up, for every element of `strings`, which genome label it contains
# ("hg19", "hg38" or "unlifted").
# The result comes straight from sapply(), so it carries the inputs as names.
get_genome = function(strings){
    genome_labels = c("hg19", "hg38", "unlifted")
    sapply(strings, function(txt) get_info(txt, genome_labels))
}

# Extract, for every element of `strings`, the text before its first ".".
# The result comes straight from sapply(), so it carries the inputs as names.
get_prefix = function(strings){
    first_field = function(string){
        # split on literal dots and keep the leading piece
        str_split(string, "\\.")[[1]][1]
    }
    sapply(strings, first_field)
}

**Test functions**

In [4]:
# Check get_info() against the sample labels; of SAMPLES, only "Input.rep1"
# occurs in this file name.
txt = "TMPRA_K562_OL13_20220512.hg38.norm.Input.rep1.stranded_pos.bed.gz"
get_info(txt, SAMPLES)

In [5]:
# Check get_group(); of GROUPS, only "Input" occurs in this file name.
txt = "TMPRA_K562_OL13_20220512.hg38.norm.Input.rep1.stranded_pos.bed.gz"
get_group(txt)

In [6]:
# Check get_genome(); of the genome labels, only "hg38" occurs in this file name.
txt = "TMPRA_K562_OL13_20220512.hg38.norm.Input.rep1.stranded_pos.bed.gz"
get_genome(txt)

In [7]:
# Check get_prefix(); the text before the first "." is "TMPRA_K562_OL13_20220512".
txt = "TMPRA_K562_OL13_20220512.hg38.norm.Input.rep1.stranded_pos.bed.gz"
get_prefix(txt)

## Metadata for TMPRA fragment counts

Get the file names for fragment counts

In [8]:
# Glob pattern for the fragment-count files of this assay
txt_fname = "*bed.gz"
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER)
txt_fglob = file.path(txt_fdiry, txt_fname)

# Full paths of the matching files, and their bare file names
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

# Print one file name per line
for (txt in vec_txt_fname){
    cat(sprintf("%s \n", txt))
}

TMPRA_K562_OL13_20220512.hg19.norm.Input.mean.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep1.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep2.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep3.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep4.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Log2FC.mean.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Output.mean.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep1.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep2.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep3.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep4.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.raw.Input.rep1.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.raw.Input.rep2.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.raw.Input.rep3.stranded_pos.bed.gz 
TMPRA_K562_OL13_20220512.hg19.raw.Input.rep4.stranded_pos.b

Create metadata for the files

In [9]:
# Build one metadata row per file: parse the labels out of each file name,
# then keep only the hg19/hg38 files whose sample label is not "mean".
dat = data.frame(FName = vec_txt_fname, Assay = TXT_ASSAY) %>%
    dplyr::mutate(
        Prefix = get_prefix(FName),
        Group  = get_group(FName),
        Sample = get_sample(FName),
        Type   = get_type(FName),
        Genome = get_genome(FName)
    ) %>%
    dplyr::filter(Genome %in% c("hg19", "hg38")) %>%
    dplyr::filter(Sample != "mean")

# Keep the table under a descriptive name for the save step, then show it
dat_metadata_tmpra = dat
fun_display_table(dat)

FName,Assay,Prefix,Group,Sample,Type,Genome
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep1.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Input,Input.rep1,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep2.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Input,Input.rep2,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep3.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Input,Input.rep3,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Input.rep4.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Input,Input.rep4,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep1.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Output,Output.rep1,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep2.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Output,Output.rep2,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep3.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Output,Output.rep3,norm,hg19
TMPRA_K562_OL13_20220512.hg19.norm.Output.rep4.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Output,Output.rep4,norm,hg19
TMPRA_K562_OL13_20220512.hg19.raw.Input.rep1.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Input,Input.rep1,raw,hg19
TMPRA_K562_OL13_20220512.hg19.raw.Input.rep2.stranded_pos.bed.gz,MPRA_Tiling_K562_Tewhey_Hannah,TMPRA_K562_OL13_20220512,Input,Input.rep2,raw,hg19


Save results

In [10]:
# Write the metadata table to <fragment_counts>/summary/metadata.tsv
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, TXT_FOLDER, "summary")
txt_fname = "metadata.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

# Create the output directory (and any missing parents) from R itself rather
# than through a pasted shell command, so paths containing spaces or shell
# metacharacters are handled correctly. No warning if it already exists.
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

dat = dat_metadata_tmpra
write_tsv(dat, txt_fpath)