**Set environment**

In [1]:
# Source the project configuration script (path is relative to this notebook);
# any messages and warnings emitted while sourcing are suppressed.
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
# Print the working environment and the FD_* project directories
# (show_env is presumably defined by the sourced config -- TODO confirm).
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



## Import data

**Import accessibility**

In [2]:
# Location of the per-region coverage summary (FPKM / TPM for input and output).
txt_fname = "result.coverage.TPM.FPKM.tsv"
txt_fdiry = file.path(
    FD_RES, "region_coverage_fcc", "fcc_astarr_macs_input_overlap",
    "STARR_ATAC_K562_Reddy_KS91", "overlap_score", "summary"
)
txt_fpath = file.path(txt_fdiry, txt_fname)

# Read the table straight into its stage-specific name; `dat` is kept as the
# generic working copy used throughout the notebook.
dat_region_astarr = read_tsv(txt_fpath, show_col_types = FALSE)
dat = dat_region_astarr

# Report the table size, then preview the first rows.
print(dim(dat_region_astarr))
head(dat_region_astarr)

[1] 150041      8


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,0.004164383,3.940038,0.0007357323,0.7181993
chr1,14282,14614,chr1:14282-14614,0.003003325,2.841707,0.0022620843,2.2104314
chr1,16025,16338,chr1:16025-16338,0.004048701,3.830812,0.0012867327,1.2597204
chr1,17288,17689,chr1:17288-17689,0.006551198,6.198372,0.0017372529,1.7059186
chr1,28934,29499,chr1:28934-29499,0.004295316,4.064322,0.0006561996,0.6447721
chr1,115429,115969,chr1:115429-115969,0.015954822,15.096518,0.0282829448,27.6549997


**Import GC content**

In [3]:
# Location of the per-region nucleotide composition table.
txt_fname = "K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz"
txt_fdiry = file.path(FD_RES, "region_nuc", "fcc_astarr_macs")
txt_fpath = file.path(txt_fdiry, txt_fname)

# Read the table, keep only the coordinate, GC fraction and sequence length
# columns (renamed on selection), and add the Region label produced by
# fun_gen_region from the three coordinate columns.
dat_region_pgc = read_tsv(txt_fpath, show_col_types = FALSE) %>%
    dplyr::select(
        Chrom      = `#1_usercol`,
        ChromStart = `2_usercol`,
        ChromEnd   = `3_usercol`,
        GC         = `5_pct_gc`,
        Length     = `12_seq_len`
    ) %>%
    dplyr::mutate(Region = fun_gen_region(Chrom, ChromStart, ChromEnd))
dat = dat_region_pgc

# Report the table size, then preview the first rows.
print(dim(dat_region_pgc))
head(dat_region_pgc)

[1] 150042      6


Chrom,ChromStart,ChromEnd,GC,Length,Region
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
chr1,10038,10405,0.523161,367,chr1:10038-10405
chr1,14282,14614,0.578313,332,chr1:14282-14614
chr1,16025,16338,0.587859,313,chr1:16025-16338
chr1,17288,17689,0.625935,401,chr1:17288-17689
chr1,28934,29499,0.771681,565,chr1:28934-29499
chr1,115429,115969,0.381481,540,chr1:115429-115969


**Import annotation matrix**

In [4]:
# Location of the region-by-annotation matrix (a Region column followed by
# one numeric column per annotation).
txt_fname = "matrix.annotation.prepare.full.chipseq_full.only.tsv"
txt_fdiry = file.path(
    FD_RES, "region_annotation", "fcc_astarr_macs_input_overlap", "summary"
)
txt_fpath = file.path(txt_fdiry, txt_fname)

# Read the matrix into its stage-specific name; `dat` is kept as the generic
# working copy used throughout the notebook.
mat_region_annot = read_tsv(txt_fpath, show_col_types = FALSE)
dat = mat_region_annot

# Report the matrix size, then preview the first rows.
print(dim(mat_region_annot))
head(mat_region_annot)

[1] 139130    526


Region,ADNP,AFF1,AGO1,ARHGAP35,ARID1B,ARID2,ARID3A,ARID3B,ARID4B,⋯,ZNF778,ZNF780A,ZNF785,ZNF79,ZNF83,ZNF830,ZNF84,ZSCAN29,ZSCAN32,ZZZ3
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:16025-16338,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:137748-138049,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:138321-139517,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:186343-187136,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:605104-605675,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:778233-779389,0,1,1,0,1,0,0,0,1,⋯,0,0,0,0,0,0,0,1,0,0


## Merge

In [7]:
# Columns contributed by the annotation matrix: everything except the join key.
vec_txt_annot = setdiff(colnames(mat_region_annot), "Region")

# 1. Inner-join GC/length with accessibility, so only regions present in both
#    tables are kept.
# 2. Derive the log10(x + 1) covariates and keep them next to the raw GC.
# 3. Left-join the annotation matrix so regions without any annotation row
#    are retained.
# 4. Fill the annotation columns of those unmatched regions with 0.
#    The fill is restricted to the annotation columns on purpose: a blanket
#    NA -> 0 over the whole table would also overwrite a missing GC, length
#    or signal value and hide an upstream problem.
dat = dplyr::inner_join(
        dat_region_pgc,
        dat_region_astarr,
        by = c("Chrom", "ChromStart", "ChromEnd", "Region")
    ) %>%
    dplyr::mutate(
        pLog10_Signal = log10(Input_TPM + 1),
        pLog10_Length = log10(Length + 1),
        pLog10_GC     = log10(GC + 1)
    ) %>%
    dplyr::select(
        Chrom, ChromStart, ChromEnd, Region,
        pLog10_Signal,
        pLog10_Length,
        pLog10_GC,
        GC
    ) %>%
    dplyr::left_join(mat_region_annot, by = "Region") %>%
    dplyr::mutate(
        dplyr::across(
            dplyr::all_of(vec_txt_annot),
            ~ replace(.x, is.na(.x), 0)
        )
    )

dat_region_merge = dat
print(dim(dat))
head(dat)

[1] 150041    533


Chrom,ChromStart,ChromEnd,Region,pLog10_Signal,pLog10_Length,pLog10_GC,GC,ADNP,AFF1,⋯,ZNF778,ZNF780A,ZNF785,ZNF79,ZNF83,ZNF830,ZNF84,ZSCAN29,ZSCAN32,ZZZ3
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,0.6937303,2.565848,0.1827458,0.523161,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1,14282,14614,chr1:14282-14614,0.5845243,2.522444,0.1981931,0.578313,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1,16025,16338,chr1:16025-16338,0.6840201,2.49693,0.2008119,0.587859,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1,17288,17689,chr1:17288-17689,0.8572343,2.604226,0.2111032,0.625935,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1,28934,29499,chr1:28934-29499,0.7045213,2.752816,0.2483855,0.771681,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1,115429,115969,chr1:115429-115969,1.2067319,2.733197,0.1403449,0.381481,0,1,⋯,0,0,0,0,0,0,0,1,0,0


## Save results

In [8]:
# Destination of the merged covariate + annotation matrix.
txt_fdiry = file.path(
    FD_RES, 
    "region_integration", 
    "fcc_astarr_macs_input_overlap"
)
txt_fname = "matrix.annotation.merge.full.chipseq_full.nuc.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

# Make sure the output directory exists before writing; write_tsv does not
# create missing parent directories. The call is a no-op when the directory
# is already there.
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

dat = dat_region_merge
write_tsv(dat, txt_fpath)