**Set environment**

In [1]:
### load the project configuration quietly (messages and warnings muted),
### then print the resolved environment
suppressMessages(
    suppressWarnings(
        source("../run_config_project_sing.R")
    )
)
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



**Set global variables**

In [2]:
### name of the region sub-folder (under FD_RES/region) that every cell below reads from and writes to
TXT_FOLDER_REGION = "encode_chipseq_tf_full"

## Import metadata

**View files**

In [4]:
### list every entry in the region folder and report the total
### (dir() is not recursive, but sub-folders themselves are counted as entries)
txt_fdiry = file.path(FD_RES, "region", TXT_FOLDER_REGION)
vec = dir(txt_fdiry)
cat("Count =", length(vec), "\n")

### preview the first few entries, one per line
vec = head(vec)
for (txt in vec) {
    cat(txt, "\n")
}

Count = 721 
ENCFF003LPE.bed.gz 
ENCFF004HXL.bed.gz 
ENCFF004YCK.bed.gz 
ENCFF005MBI.bed.gz 
ENCFF009RFC.bed.gz 
ENCFF014HYS.bed.gz 


**Import data**

In [5]:
### locate the metadata table inside the summary folder
txt_fdiry = file.path(FD_RES, "region", TXT_FOLDER_REGION, "summary")
txt_fname = "metadata.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

### load the tab-separated table without printing the column specification
dat = read_tsv(
    txt_fpath,
    show_col_types = FALSE
)

### keep a named copy, then report the dimensions and preview the first rows
dat_metadata_info = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 720  14


Assay,Index_Experiment,Index_File,File_Format,File_Type,Output_Type,Genome,Target,Bio_Replicates,Analysis,Analysis_Version,md5sum,File_Name,File_URL
TF ChIP-seq,ENCSR590FLL,ENCFF843NBV,bed narrowPeak,bed,IDR thresholded peaks,GRCh38,ZNF347,"1, 2",ENCODE4 v1.4.0 GRCh38,ENCODE4,cab83f2be2220d6d328bd6566fc54d85,ENCFF843NBV.bed.gz,https://www.encodeproject.org/files/ENCFF843NBV/@@download/ENCFF843NBV.bed.gz
TF ChIP-seq,ENCSR536CBU,ENCFF588CXX,bed narrowPeak,bed,IDR thresholded peaks,GRCh38,ZBTB9,"1, 2",ENCODE4 v1.4.0 GRCh38,ENCODE4,c4faaf3e805d2fcfd7c9037fdb768174,ENCFF588CXX.bed.gz,https://www.encodeproject.org/files/ENCFF588CXX/@@download/ENCFF588CXX.bed.gz
TF ChIP-seq,ENCSR436CZV,ENCFF142MJD,bed narrowPeak,bed,IDR thresholded peaks,GRCh38,AFF4,"1, 2",ENCODE4 v1.8.0 GRCh38,ENCODE4,c8f76e772192439a7847dbbea9f73f40,ENCFF142MJD.bed.gz,https://www.encodeproject.org/files/ENCFF142MJD/@@download/ENCFF142MJD.bed.gz


**Check data**

In [6]:
### get file name from the folder
txt_fdiry = file.path(FD_RES, "region", TXT_FOLDER_REGION)
txt_fname = "*.bed.gz"
txt_fglob = file.path(txt_fdiry, txt_fname)

vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

vec1 = sort(vec_txt_fname)

### get file name from the table
dat = dat_metadata_info
vec = dat$File_Name

vec2 = sort(vec)

### report any name present on only one side so that a mismatch is actionable
vec_txt_only_disk = setdiff(vec1, vec2)
vec_txt_only_meta = setdiff(vec2, vec1)
if (length(vec_txt_only_disk) > 0) {
    cat("Files without metadata:", vec_txt_only_disk, "\n")
}
if (length(vec_txt_only_meta) > 0) {
    cat("Metadata without files:", vec_txt_only_meta, "\n")
}

### check if matched in metadata
### identical() compares length as well as content, whereas `all(vec1 == vec2)`
### recycles the shorter vector when the two lengths differ
identical(vec1, vec2)

## Define column description
The peak files are in ENCODE narrowPeak format: the six standard BED fields followed by four additional fields (BED6+4).

In [7]:
### Column dictionary for the ENCODE narrowPeak (Narrow or Point-Source Peaks) format:
### one row per column, in file order, pairing the column name with its description
dat = tribble(
    ~Name,         ~Note,
    "Chrom",       "Name of the chromosome",
    "ChromStart",  "The starting position of the feature in the chromosome",
    "ChromEnd",    "The ending position of the feature in the chromosome",
    "Name",        "Name given to a region; Use '.' if no name is assigned.",
    "Score",       "Indicates how dark the peak will be displayed in the browser (0-1000).",
    "Strand",      "+/- to denote strand or orientation. Use '.' if no orientation is assigned.",
    "SignalValue", "Measurement of overall (usually, average) enrichment for the region.",
    "PValue",      "Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.",
    "QValue",      "Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.",
    "Peak",        "Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called."
)

### keep a named copy for the save step and display the full dictionary
dat_cname = dat
fun_display_table(dat)

Name,Note
Chrom,Name of the chromosome
ChromStart,The starting position of the feature in the chromosome
ChromEnd,The ending position of the feature in the chromosome
Name,Name given to a region; Use '.' if no name is assigned.
Score,Indicates how dark the peak will be displayed in the browser (0-1000).
Strand,+/- to denote strand or orientation. Use '.' if no orientation is assigned.
SignalValue,"Measurement of overall (usually, average) enrichment for the region."
PValue,Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
QValue,Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
Peak,Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.


## Define file labeling

**generate file label**

In [8]:
### build one label per file: encode_chipseq_<Target>_<Index_File>
### transmute() keeps only the two columns it creates (FName, Label)
dat = dat_metadata_info
dat = dplyr::transmute(
    dat,
    FName = File_Name,
    Label = paste("encode_chipseq", Target, Index_File, sep = "_")
)

### keep a named copy, then report the dimensions and preview the first rows
dat_meta_label = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 720   2


FName,Label
ENCFF843NBV.bed.gz,encode_chipseq_ZNF347_ENCFF843NBV
ENCFF588CXX.bed.gz,encode_chipseq_ZBTB9_ENCFF588CXX
ENCFF142MJD.bed.gz,encode_chipseq_AFF4_ENCFF142MJD


**create region label table**

In [9]:
### set directory
### use the same "*.bed.gz" pattern as the metadata check so that the labelled
### file set is exactly the set that was validated
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fglob  = file.path(txt_fdiry, "*.bed.gz")

### get file names
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

### init info table
### rep() sizes the folder column explicitly, so an empty folder yields an
### empty table instead of a "differing number of rows" error;
### stringsAsFactors = FALSE keeps the join key as character on older R versions
dat = data.frame(
    "Folder" = rep(txt_folder, length(vec_txt_fname)),
    "FName"  = vec_txt_fname,
    "FPath"  = vec_txt_fpath,
    stringsAsFactors = FALSE
)

### get label
dat = dat %>% 
    dplyr::left_join(dat_meta_label, by = "FName") %>%
    dplyr::select(Folder, FName, Label, FPath)

### a left join leaves Label as NA for files absent from the metadata; surface that
num_label_missing = sum(is.na(dat$Label))
if (num_label_missing > 0) {
    warning(num_label_missing, " file(s) have no matching label in the metadata")
}

### assign and show
dat_region_label = dat
print(dim(dat))
fun_display_table(head(dat))

[1] 720   4


Folder,FName,Label,FPath
encode_chipseq_tf_full,ENCFF003LPE.bed.gz,encode_chipseq_RUNX1_ENCFF003LPE,/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/encode_chipseq_tf_full/ENCFF003LPE.bed.gz
encode_chipseq_tf_full,ENCFF004HXL.bed.gz,encode_chipseq_FOSL1_ENCFF004HXL,/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/encode_chipseq_tf_full/ENCFF004HXL.bed.gz
encode_chipseq_tf_full,ENCFF004YCK.bed.gz,encode_chipseq_ZNF740_ENCFF004YCK,/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/encode_chipseq_tf_full/ENCFF004YCK.bed.gz
encode_chipseq_tf_full,ENCFF005MBI.bed.gz,encode_chipseq_ZNF584_ENCFF005MBI,/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/encode_chipseq_tf_full/ENCFF005MBI.bed.gz
encode_chipseq_tf_full,ENCFF009RFC.bed.gz,encode_chipseq_GABPB1_ENCFF009RFC,/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/encode_chipseq_tf_full/ENCFF009RFC.bed.gz
encode_chipseq_tf_full,ENCFF014HYS.bed.gz,encode_chipseq_ZNF84_ENCFF014HYS,/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/encode_chipseq_tf_full/ENCFF014HYS.bed.gz


## Save results

In [10]:
### destination: <FD_RES>/region/<folder>/summary/description.tsv
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "description.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### make sure the output folder exists; recursive = TRUE also creates missing
### parent folders, which would otherwise fail silently here and only surface
### later as a write error
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

### write the narrowPeak column descriptions
dat = dat_cname
write_tsv(dat, txt_fpath)

In [11]:
### destination: <FD_RES>/region/<folder>/summary/metadata.label.tsv
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "metadata.label.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### make sure the output folder exists; recursive = TRUE also creates missing
### parent folders, which would otherwise fail silently here and only surface
### later as a write error
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

### write the per-file table (Folder, FName, Label, FPath)
dat = dat_region_label
write_tsv(dat, txt_fpath)