**Set environment**

In [1]:
### load the project configuration script; messages and warnings are silenced
### NOTE(review): later cells use FD_RES, read_tsv, %>% and show_env without
### attaching anything else, so presumably config_sing.R provides them - confirm
suppressMessages(suppressWarnings(source("../config_sing.R")))
### Bioconductor packages used below: GRanges construction (GenomicRanges),
### BigWig export (rtracklayer), and the hg38 genome object for seqlengths
suppressMessages(suppressWarnings(library("GenomicRanges")))
suppressMessages(suppressWarnings(library("rtracklayer")))
suppressMessages(suppressWarnings(library("BSgenome.Hsapiens.UCSC.hg38")))
### print the container name and the base/source/result paths in use
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY:     /mount/work 
PATH OF SOURCE:     /mount/work/source 
PATH OF EXECUTABLE: /mount/work/exe 
PATH OF ANNOTATION: /mount/work/annotation 
PATH OF PROJECT:    /mount/project 
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc 


### Import data

**Import ASTARR data**

In [2]:
### ASTARR dataset: result sub-folder, file prefix, and regions to load
FDIRY_ASTARR   = "KS91_K562_ASTARRseq"
PREFIX_ASTARR  = "KS91_K562_hg38_ASTARRseq"
REGIONS_ASTARR = c("GATA1", "MYC", "FADS")

### read one ratio table per region; sapply(simplify = FALSE, USE.NAMES = TRUE)
### on a character vector returns a list named by region
lst_dat_astarr = sapply(REGIONS_ASTARR, function(region){
    
    ### build the table path (ex: KS91_K562_hg38_ASTARRseq_Ratio.GATA1.tsv)
    file_name = paste0(PREFIX_ASTARR, "_", "Ratio", ".", region, ".tsv")
    file_path = file.path(FD_RES, FDIRY_ASTARR, "coverage", file_name)
    
    ### report which file is being read
    print(file_name); flush.console()
    
    ### the imported table is the value of the function
    read_tsv(file_path, show_col_types = FALSE)
}, simplify = FALSE, USE.NAMES = TRUE)

[1] "KS91_K562_hg38_ASTARRseq_Ratio.GATA1.tsv"
[1] "KS91_K562_hg38_ASTARRseq_Ratio.MYC.tsv"
[1] "KS91_K562_hg38_ASTARRseq_Ratio.FADS.tsv"


**Import TMPRA data**

In [3]:
### find every per-base TMPRA ratio table with a wildcard pattern
fdiry  = file.path(FD_RES, "Tewhey_K562_TileMPRA", "coverage")
fname  = "Tile_K562_hg38_*_Ratio.*.perbase.tsv"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)

### read the tables in the order returned by Sys.glob (unnamed list)
lst_dat_tmpra = lapply(fpaths, function(path){
    ### report which file is being read
    print(basename(path)); flush.console()
    
    ### the imported table is the value of the function
    read_tsv(path, show_col_types = FALSE)
})

[1] "Tile_K562_hg38_20200905_Ratio.FADS.stranded_neg.perbase.tsv"
[1] "Tile_K562_hg38_20200905_Ratio.FADS.stranded_pos.perbase.tsv"
[1] "Tile_K562_hg38_20210130_Ratio.GATA1.stranded_pos.perbase.tsv"
[1] "Tile_K562_hg38_20210130_Ratio.MYC.stranded_pos.perbase.tsv"


In [4]:
### Metadata of the TMPRA tables, one entry per imported file.
### The export loop pairs element i of these vectors with lst_dat_tmpra[[i]],
### so their order must match the order of `fpaths` returned by Sys.glob.
FDIRY_TMPRA   = "Tewhey_K562_TileMPRA"
REGIONS_TMPRA = c("FADS", "FADS", "GATA1", "MYC")
STRANDS_TMPRA = c(
    "stranded_neg",
    "stranded_pos",
    "stranded_pos",
    "stranded_pos")
PREFIXS_TMPRA = c(
    "Tile_K562_hg38_20200905",
    "Tile_K562_hg38_20200905",
    "Tile_K562_hg38_20210130",
    "Tile_K562_hg38_20210130")

### sanity check: rebuild the file names implied by the metadata above and
### compare them with the files that were actually imported; a mismatch
### (file added/removed, different glob order) would otherwise export
### tracks under the wrong region / strand / prefix without any error
fnames_expect = paste0(
    PREFIXS_TMPRA, "_Ratio.", REGIONS_TMPRA, ".", STRANDS_TMPRA, ".perbase.tsv")
fnames_import = basename(fpaths)
if (!identical(fnames_import, fnames_expect)){
    stop(
        "TMPRA metadata does not match the imported files.\n",
        "  expected: ", paste(fnames_expect, collapse = ", "), "\n",
        "  imported: ", paste(fnames_import, collapse = ", "))
}

## ASTARR

In [7]:
### settings for the ASTARR track export
REGIONS = REGIONS_ASTARR
STRANDS = c("unstranded", "stranded_pos", "stranded_neg")
FDIRY   = FDIRY_ASTARR
PREFIX  = PREFIX_ASTARR
PCOUNT  = 1   # pseudocount added to both Input and Output for pLog2FC
GENOME  = BSgenome.Hsapiens.UCSC.hg38

### strand label (as used in the Strand column and in file names) -> strand
### symbol passed to GRanges
STRAND_SYMBOLS = c(unstranded = "*", stranded_pos = "+", stranded_neg = "-")

### every track is written to the same folder; compute it once
fdiry = file.path(FD_RES, FDIRY, "coverage")

for (REGION in REGIONS){
    cat("============================\n")
    cat(REGION, "\n"); flush.console()
    
    ### get data
    dat_cov = lst_dat_astarr[[REGION]]
    
    ### preprocess: pseudocount log2 fold change, defined for every row
    dat_cov = dat_cov %>% 
        mutate(pLog2FC = log2((Output+PCOUNT) / (Input+PCOUNT)))

    ### plain fold change, only for rows where both counts are non-zero
    dat_lfc = dat_cov %>% 
        dplyr::filter(Input  != 0) %>% 
        dplyr::filter(Output != 0) %>% 
        mutate(
            FC     = Output / Input,
            Log2FC = log2(FC))
    
    ### show some stats
    cnt1 = sum(dat_cov$Input  == 0)
    cnt2 = sum(dat_cov$Output == 0)
    cnt3 = sum((dat_cov$Input == 0) & (dat_cov$Output == 0))
    cat("#{Input  == 0}:", cnt1, "\n")
    cat("#{Output == 0}:", cnt2, "\n")
    cat("#{Both   == 0}:", cnt3, "\n")
    
    ### tables to export and the score columns taken from each of them
    ### (all rows: Input/Output/pLog2FC; non-zero rows only: FC/Log2FC)
    lst_track = list(
        list(dat = dat_cov, scores = c("Input", "Output", "pLog2FC")),
        list(dat = dat_lfc, scores = c("FC", "Log2FC")))
        
    for (STRAND in STRANDS){
        ### set strand and show progress
        cat("   ", "++++++++++++++++++++++++++++\n")
        ### fail loudly on an unknown label instead of silently reusing the
        ### strand symbol left over from the previous iteration
        if (!(STRAND %in% names(STRAND_SYMBOLS))){
            stop("Unknown strand label: ", STRAND)
        }
        idn_strand = STRAND_SYMBOLS[[STRAND]]
        cat("   ",STRAND, ":", idn_strand, "\n"); flush.console()
    
        for (track in lst_track){
            ### keep the rows of the current strand and build the ranges
            ### NOTE(review): Start+1 presumably converts a 0-based start to
            ### the 1-based coordinates used by GRanges - confirm against
            ### the coverage tables
            dat = track$dat %>% dplyr::filter(Strand == STRAND)
            grg = GRanges(
                seqnames   = dat$Chrom,               
                ranges     = IRanges(
                    start  = dat$Start+1,
                    end    = dat$End), 
                strand     = idn_strand,
                seqlengths = seqlengths(GENOME))
            genome(grg) = "hg38"
            
            for (SCORE in track$scores) {
                ### show progress
                cat("   ", "   ", "Score:", SCORE, "\n"); flush.console()
                
                ### assign scores
                mcols(grg)$score = dat[[SCORE]]
                
                ### export track as bigwig
                fname = paste0(PREFIX, "_track", ".", REGION, ".", STRAND, ".", SCORE, ".bw")
                fpath = file.path(fdiry, fname)
                
                cat("   ", "   ", "Export:", fname, "\n\n")
                export(grg, fpath, format = "BigWig")
            } # end loop score
        } # end loop track
        
    } # end loop STRANDS
} # end loop REGIONS

GATA1 
#{Input  == 0}: 919231 
#{Output == 0}: 1270067 
#{Both   == 0}: 916666 
    ++++++++++++++++++++++++++++
    unstranded : * 
        Score: Input 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.unstranded.Input.bw 

        Score: Output 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.unstranded.Output.bw 

        Score: pLog2FC 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.unstranded.pLog2FC.bw 

        Score: FC 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.unstranded.FC.bw 

        Score: Log2FC 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.unstranded.Log2FC.bw 

    ++++++++++++++++++++++++++++
    stranded_pos : + 
        Score: Input 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.stranded_pos.Input.bw 

        Score: Output 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.stranded_pos.Output.bw 

        Score: pLog2FC 
        Export: KS91_K562_hg38_ASTARRseq_track.GATA1.stranded_pos.pLog2FC.bw 

        Score: FC 
   

## TMPRA

In [9]:
### settings for the TMPRA track export
FDIRY   = FDIRY_TMPRA
REGIONS = REGIONS_TMPRA
STRANDS = STRANDS_TMPRA
PREFIXS = PREFIXS_TMPRA
PCOUNT  = 1000   # pseudocount added to both Input and Output for pLog2FC
GENOME  = BSgenome.Hsapiens.UCSC.hg38

### strand label (as used in file names) -> strand symbol passed to GRanges
STRAND_SYMBOLS = c(unstranded = "*", stranded_pos = "+", stranded_neg = "-")

### the loop pairs table i with entry i of the metadata vectors, so all of
### them must have one entry per imported table
n_table = length(lst_dat_tmpra)
if (length(REGIONS) != n_table || length(STRANDS) != n_table || length(PREFIXS) != n_table){
    stop(
        "TMPRA metadata vectors must have one entry per imported table (", 
        n_table, ")")
}

### every track is written to the same folder; compute it once
fdiry = file.path(FD_RES, FDIRY, "coverage")

for (idx in seq_along(lst_dat_tmpra)){
    ### metadata of the current table
    REGION  = REGIONS[idx]
    STRAND  = STRANDS[idx]
    PREFIX  = PREFIXS[idx]
    
    ### fail loudly on an unknown label instead of silently reusing the
    ### strand symbol left over from the previous iteration
    if (!(STRAND %in% names(STRAND_SYMBOLS))){
        stop("Unknown strand label: ", STRAND)
    }
    idn_strand = STRAND_SYMBOLS[[STRAND]]
    
    ### preprocess: pseudocount log2 fold change, defined for every row
    dat_cov = lst_dat_tmpra[[idx]]
    dat_cov = dat_cov %>% 
        mutate(pLog2FC = log2((Output+PCOUNT) / (Input+PCOUNT)))

    ### plain fold change, only for rows where both counts are non-zero
    dat_lfc = dat_cov %>% 
        dplyr::filter(Input  != 0) %>% 
        dplyr::filter(Output != 0) %>% 
        mutate(
            FC     = Output / Input,
            Log2FC = log2(FC))
    
    ### show progress & stats
    cat("============================\n")
    cat(REGION, "->", STRAND, ":", idn_strand, "\n"); flush.console()
    cnt1 = sum(dat_cov$Input  == 0)
    cnt2 = sum(dat_cov$Output == 0)
    cnt3 = sum((dat_cov$Input == 0) & (dat_cov$Output == 0))
    cat("#{Input  == 0}:", cnt1, "\n")
    cat("#{Output == 0}:", cnt2, "\n")
    cat("#{Both   == 0}:", cnt3, "\n")
    
    ### tables to export and the score columns taken from each of them
    ### (all rows: Input/Output/pLog2FC; non-zero rows only: FC/Log2FC)
    lst_track = list(
        list(dat = dat_cov, scores = c("Input", "Output", "pLog2FC")),
        list(dat = dat_lfc, scores = c("FC", "Log2FC")))
    
    for (track in lst_track){
        ### build the ranges for this table
        ### NOTE(review): Start+1 presumably converts a 0-based start to the
        ### 1-based coordinates used by GRanges - confirm against the tables
        dat = track$dat
        grg = GRanges(
            seqnames   = dat$Chrom,               
            ranges     = IRanges(
                start  = dat$Start+1,
                end    = dat$End), 
            strand     = idn_strand,
            seqlengths = seqlengths(GENOME))
        genome(grg) = "hg38"

        for (SCORE in track$scores) {
            ### show progress
            cat("   ", "Score:", SCORE, "\n"); flush.console()

            ### assign scores
            mcols(grg)$score = dat[[SCORE]]

            ### export track as bigwig
            fname = paste0(PREFIX, "_track", ".", REGION, ".", STRAND, ".", SCORE, ".bw")
            fpath = file.path(fdiry, fname)
            
            cat("   ", "Export:", fname, "\n\n")
            export(grg, fpath, format = "BigWig")
        } # end loop score
    } # end loop track

} # end loop

FADS -> stranded_neg : - 
#{Input  == 0}: 0 
#{Output == 0}: 0 
#{Both   == 0}: 0 
    Score: Input 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_neg.Input.bw 

    Score: Output 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_neg.Output.bw 

    Score: pLog2FC 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_neg.pLog2FC.bw 

    Score: FC 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_neg.FC.bw 

    Score: Log2FC 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_neg.Log2FC.bw 

FADS -> stranded_pos : + 
#{Input  == 0}: 0 
#{Output == 0}: 5 
#{Both   == 0}: 0 
    Score: Input 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_pos.Input.bw 

    Score: Output 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_pos.Output.bw 

    Score: pLog2FC 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_pos.pLog2FC.bw 

    Score: FC 
    Export: Tile_K562_hg38_20200905_track.FADS.stranded_pos.FC.bw 

    Score: Log2FC 
    Export: 