**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



**Set global variable**

In [2]:
TXT_FOLDER_INP = "genome_tss"
TXT_FOLDER_OUT = "fcc_table"

## Import data

In [3]:
### set file path
txt_folder = TXT_FOLDER_INP
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "description.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### read table
dat = read_tsv(txt_fpath, show_col_types = FALSE)

### assign and show
dat_cnames = dat
fun_display_table(dat)

Name,Note
Chrom,Name of the chromosome
ChromStart,The starting position of the feature in the chromosome
ChromEnd,The ending position of the feature in the chromosome
Name,Gene name
Score,"highest level of pol2 chip seq (ENCFF914WIS.bigWig) at [TSS-500, TSS+500] among TSS isoform"
Group,Type of Annotation
Label,Label of Annotation


In [4]:
### set file path
txt_folder = TXT_FOLDER_INP
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fname  = "K562.hg38.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### read table
vec = dat_cnames$Name
dat = read_tsv(txt_fpath, col_names = vec, show_col_types = FALSE)

### assign and show
dat_region_import = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 11892     7


Chrom,ChromStart,ChromEnd,Name,Score,Group,Label
chr1,29370,29371,WASH7P,0.00023,TSS,TSS_Pol2_RNAseq
chr1,827522,827523,LINC00115,64.4656,TSS,TSS_Pol2_RNAseq
chr1,827590,827591,LINC01128,64.4603,TSS,TSS_Pol2_RNAseq


## Arrange table

In [5]:
### get table
dat = dat_region_import
vec = c(
    "Chrom", "ChromStart", "ChromEnd", "Group", "Label",
    "Assay", "Region", "Target", "Score", "NLog10P",
    "Method", "Source"
)

dat = dat %>% 
    dplyr::mutate(
        Group   = "TSS",
        Label   = paste(Group, Name, sep = ":"),
        Assay   = "TSS",
        Region  = fun_gen_region(Chrom, ChromStart, ChromEnd),
        Target  = Name,
        NLog10P = NA,
        Method  = "POLR2:ENCFF914WIS;RNAseq:ENCFF421TJX",
        Source  = "RefGene"
    ) %>%
    dplyr::select(!!!vec)

dat_region_arrange = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 11892    12


Chrom,ChromStart,ChromEnd,Group,Label,Assay,Region,Target,Score,NLog10P,Method,Source
chr1,29370,29371,TSS,TSS:WASH7P,TSS,chr1:29370-29371,WASH7P,0.00023,,POLR2:ENCFF914WIS;RNAseq:ENCFF421TJX,RefGene
chr1,827522,827523,TSS,TSS:LINC00115,TSS,chr1:827522-827523,LINC00115,64.4656,,POLR2:ENCFF914WIS;RNAseq:ENCFF421TJX,RefGene
chr1,827590,827591,TSS,TSS:LINC01128,TSS,chr1:827590-827591,LINC01128,64.4603,,POLR2:ENCFF914WIS;RNAseq:ENCFF421TJX,RefGene


## Export results

In [7]:
### set file path
txt_folder = TXT_FOLDER_OUT
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fname  = "K562.hg38.tss.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### set table
dat = dat_region_arrange
dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)

### write table
write_tsv(dat, txt_fpath, col_names = FALSE)

In [17]:
fun = function(Region, Gene, Score, Method){
    lst = list(
        "Label"  = "TSS",
        "Region" = Region,
        "Gene"   = Gene,
        "Score"  = Score,
        "Method" = Method
    )
    txt = jsonlite::toJSON(lst, auto_unbox=TRUE)
    txt = as.character(txt)
    return(txt)
}

In [24]:
### get table
dat = dat_region_import
dat = dat %>% 
    dplyr::mutate(
        Region = fun_gen_region(Chrom, ChromStart, ChromEnd),
        Method = "Pol2A,RNAseq",
        Group  = "TSS", 
        Label  = paste(Group, Name, sep = ":")
    )

dat = dat %>% 
    dplyr::rowwise() %>% 
    dplyr::mutate(
        Note = fun(Region, Name, Score, Method)
    ) %>%
    dplyr::ungroup()

dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)

dat_region_arrange = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 11892    10


Chrom,ChromStart,ChromEnd,Name,Score,Group,Label,Region,Method,Note
chr1,29370,29371,WASH7P,0.00023,TSS,TSS:WASH7P,chr1:29370-29371,"Pol2A,RNAseq","{""Label"":""TSS"",""Region"":""chr1:29370-29371"",""Gene"":""WASH7P"",""Score"":0.0002,""Method"":""Pol2A,RNAseq""}"
chr1,827522,827523,LINC00115,64.4656,TSS,TSS:LINC00115,chr1:827522-827523,"Pol2A,RNAseq","{""Label"":""TSS"",""Region"":""chr1:827522-827523"",""Gene"":""LINC00115"",""Score"":64.4656,""Method"":""Pol2A,RNAseq""}"
chr1,827590,827591,LINC01128,64.4603,TSS,TSS:LINC01128,chr1:827590-827591,"Pol2A,RNAseq","{""Label"":""TSS"",""Region"":""chr1:827590-827591"",""Gene"":""LINC01128"",""Score"":64.4603,""Method"":""Pol2A,RNAseq""}"


In [26]:
dat = dat_region_arrange
vec = c("Chrom", "ChromStart", "ChromEnd", "Group", "Label", "Note")
dat = dat %>% dplyr::select(!!!vec)


dat_region_final = dat
print(dim(dat))
fun_display_table(head(dat, 3))

[1] 11892     6


Chrom,ChromStart,ChromEnd,Group,Label,Note
chr1,29370,29371,TSS,TSS:WASH7P,"{""Label"":""TSS"",""Region"":""chr1:29370-29371"",""Gene"":""WASH7P"",""Score"":0.0002,""Method"":""Pol2A,RNAseq""}"
chr1,827522,827523,TSS,TSS:LINC00115,"{""Label"":""TSS"",""Region"":""chr1:827522-827523"",""Gene"":""LINC00115"",""Score"":64.4656,""Method"":""Pol2A,RNAseq""}"
chr1,827590,827591,TSS,TSS:LINC01128,"{""Label"":""TSS"",""Region"":""chr1:827590-827591"",""Gene"":""LINC01128"",""Score"":64.4603,""Method"":""Pol2A,RNAseq""}"


In [27]:
### set file path
txt_folder = "fcc_table"
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fname  = "K562.hg38.TSS.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### read table
dat = dat_region_final
write_tsv(dat, txt_fpath, col_names = FALSE)