**Set environment**

In [1]:
### load the project configuration for the Singularity layout; the FD_* path
### variables and helper functions used below are expected to come from this
### file — TODO confirm
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### GenomicRanges provides the GRanges / IRanges / findOverlaps calls used below
suppressWarnings(suppressMessages(library("GenomicRanges")))
### print the resolved base, work, data and project directories
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



**Set global variables**

In [2]:
### assay identifier; used as the sub-folder name under results/assay_fcc
### when locating the input guide score file
TXT_ASSAY = "CRISPRi_FlowFISH_K562_Riley_JinWoo"

## Import data

In [3]:
### locate the guide-level Z-score file of the assay
txt_assay  = TXT_ASSAY
txt_folder = "guide_scores"
txt_fname  = "CRISPRi_HCRFF_K562.hg38.ZScore.unstranded.bed.gz"
txt_fdiry  = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
txt_fpath  = file.path(txt_fdiry, txt_fname)

### the file has no header line, so column names are supplied explicitly
vec_txt_cname = c("Chrom", "ChromStart", "ChromEnd", "Name", "Score", "Strand", "Target")
dat_guide_score_chcrff = read_tsv(txt_fpath, col_names = vec_txt_cname, show_col_types = FALSE)

### keep a working copy and preview the first rows
dat = dat_guide_score_chcrff
fun_display_table(head(dat))

Chrom,ChromStart,ChromEnd,Name,Score,Strand,Target
chr1,74582275,74582276,id-1,0.414014,.,CD164
chr1,74582275,74582276,id-1,-0.493091,.,MYC
chr1,74582275,74582276,id-1,0.024514,.,NMU
chr1,74582275,74582276,id-1,0.44089,.,PVT1
chr1,74582309,74582310,id-2,-0.181799,.,CD164
chr1,74582309,74582310,id-2,0.997649,.,MYC


## Get genomic ranges covered by the assay

**Create a GRanges object of the guides**

In [4]:
### build one range per guide-by-target record, carrying the target gene as metadata
### NOTE(review): BED ChromStart is conventionally 0-based while IRanges starts are
### 1-based; the coordinates are passed through unchanged here — confirm this is intended
dat = dat_guide_score_chcrff
grg_guide_total = GRanges(
    seqnames = dat$Chrom,
    ranges   = IRanges(start = dat$ChromStart, end = dat$ChromEnd),
    Target   = dat$Target
)

### keep a working copy and show
grg = grg_guide_total
print(grg)

GRanges object with 656178 ranges and 1 metadata column:
           seqnames              ranges strand |      Target
              <Rle>           <IRanges>  <Rle> | <character>
       [1]     chr1   74582275-74582276      * |       CD164
       [2]     chr1   74582275-74582276      * |         MYC
       [3]     chr1   74582275-74582276      * |         NMU
       [4]     chr1   74582275-74582276      * |        PVT1
       [5]     chr1   74582309-74582310      * |       CD164
       ...      ...                 ...    ... .         ...
  [656174]     chrX 151922454-151922455      * |        PVT1
  [656175]     chrX 151922521-151922522      * |       CD164
  [656176]     chrX 151922521-151922522      * |         MYC
  [656177]     chrX 151922521-151922522      * |         NMU
  [656178]     chrX 151922521-151922522      * |        PVT1
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths


**Merge overlapping guides**

In [5]:
### collapse the guide ranges into a set of merged, non-redundant intervals
### (the Target metadata column is not carried over by reduce)
grg_guide_reduced = reduce(grg_guide_total)

### keep a working copy and show
grg = grg_guide_reduced
print(grg)

GRanges object with 271695 ranges and 0 metadata columns:
           seqnames              ranges strand
              <Rle>           <IRanges>  <Rle>
       [1]     chr1   74582275-74582276      *
       [2]     chr1   74582309-74582310      *
       [3]     chr1   80526338-80526339      *
       [4]     chr1 102025739-102025740      *
       [5]     chr1 102026463-102026464      *
       ...      ...                 ...    ...
  [271691]     chrX 149651713-149651714      *
  [271692]     chrX 149652575-149652576      *
  [271693]     chrX 151921813-151921814      *
  [271694]     chrX 151922454-151922455      *
  [271695]     chrX 151922521-151922522      *
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths


**Get the ranges for each gene**

In [6]:
### several target genes are screened over the same regions, so the covered
### span is computed separately for each target gene
lst = split(grg_guide_total, grg_guide_total$Target)

### per target: merge its guides, then take the overall span per chromosome
lst = lapply(lst, function(grg_gene) range(reduce(grg_gene)))

### flatten back into a single GRanges; element names are the target genes
grg_guide_range_reduced_by_gene = unlist(as(lst, "GRangesList"))

### keep a working copy and show
grg = grg_guide_range_reduced_by_gene
print(grg)

GRanges object with 187 ranges and 0 metadata columns:
          seqnames              ranges strand
             <Rle>           <IRanges>  <Rle>
  CAPRIN1    chr10 100694991-100851987      *
  CAPRIN1    chr11   33064196-61797942      *
  CAPRIN1     chr5   35617439-35670158      *
      CAT    chr10 100694991-100851987      *
      CAT    chr11   33064196-61797942      *
      ...      ...                 ...    ...
     PVT1     chr6  47776773-142078146      *
     PVT1     chr7   9707775-156331016      *
     PVT1     chr8   4306951-139535919      *
     PVT1     chr9  24548731-103913441      *
     PVT1     chrX  14261425-151922522      *
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths


**Get the ranges of total guides**

In [8]:
### overall span covered by all guides, one range per chromosome
grg_guide_range_reduced = range(reduce(grg_guide_total))

### keep a working copy and show
grg = grg_guide_range_reduced
print(grg)

GRanges object with 21 ranges and 0 metadata columns:
       seqnames             ranges strand
          <Rle>          <IRanges>  <Rle>
   [1]     chr1 74582275-248233868      *
   [2]    chr10  2350909-130456455      *
   [3]    chr11  4091884-124027721      *
   [4]    chr12 11520058-128194750      *
   [5]    chr13 19040910-105391550      *
   ...      ...                ...    ...
  [17]     chr6 47776773-142078146      *
  [18]     chr7  9707775-156331016      *
  [19]     chr8  4306951-139535919      *
  [20]     chr9 24548731-103913441      *
  [21]     chrX 14261425-151922522      *
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths


## Label the screened regions by targets

In [10]:
### init
grg_query   = grg_guide_range_reduced
grg_subject = grg_guide_range_reduced_by_gene

### find which per-gene ranges overlap each chromosome-level range and
### collapse the overlapping target names into one comma-separated label
res = GenomicRanges::findOverlaps(grg_query, grg_subject)
lst = split(subjectHits(res), queryHits(res))
lst = lapply(lst, function(idx){
    txt = sort(unique(names(grg_subject)[idx]))
    paste(txt, collapse = ",")
})

### assign labels by query index rather than by position:
### split() only keeps queries with at least one hit, so a positional
### assignment would misalign (or fail) if any query range had no overlap;
### such ranges are labelled NA instead
vec_txt_label = rep(NA_character_, length(grg_query))
vec_txt_label[as.integer(names(lst))] = as.character(unlist(lst, use.names = FALSE))
grg_query$Name = vec_txt_label

grg_guide_range_label = grg_query
grg_query

GRanges object with 21 ranges and 1 metadata column:
       seqnames             ranges strand |                   Name
          <Rle>          <IRanges>  <Rle> |            <character>
   [1]     chr1 74582275-248233868      * | CD164,HBE1,HBG1,HBG2..
   [2]    chr10  2350909-130456455      * | CAPRIN1,CAT,CD164,ER..
   [3]    chr11  4091884-124027721      * | CAPRIN1,CAT,CD164,ER..
   [4]    chr12 11520058-128194750      * | CD164,ERP29,GATA1,HB..
   [5]    chr13 19040910-105391550      * | CD164,HBE1,HBG1,HBG2..
   ...      ...                ...    ... .                    ...
  [17]     chr6 47776773-142078146      * | CD164,HBE1,HBG1,HBG2..
  [18]     chr7  9707775-156331016      * |     CD164,MYC,NMU,PVT1
  [19]     chr8  4306951-139535919      * |     CD164,MYC,NMU,PVT1
  [20]     chr9 24548731-103913441      * |     CD164,MYC,NMU,PVT1
  [21]     chrX 14261425-151922522      * | CD164,GATA1,HDAC6,MY..
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths

**Convert to dataframe**

In [11]:
### flatten the labelled ranges into a plain table, ordered by chromosome name
### (character sort, so chr10 follows chr1) and then by position
grg = grg_guide_range_label
dat = dplyr::arrange(
    data.frame(
        Chrom      = as.character(seqnames(grg)),
        ChromStart = start(grg),
        ChromEnd   = end(grg),
        Target     = mcols(grg)$Name
    ),
    Chrom, ChromStart, ChromEnd
)

### assign and show
dat_guide_range_label = dat
print(dim(dat))
fun_display_table(dat)

[1] 21  4


Chrom,ChromStart,ChromEnd,Target
chr1,74582275,248233868,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"
chr10,2350909,130456455,"CAPRIN1,CAT,CD164,ERP29,HBE1,HBG1,HBG2,HBS1L,LMO2,MEF2C,MYB,MYC,NMU,PVT1"
chr11,4091884,124027721,"CAPRIN1,CAT,CD164,ERP29,FADS1,FADS2,FADS3,FEN1,HBE1,HBG1,HBG2,HBS1L,LMO2,MEF2C,MYB,MYC,NMU,PVT1"
chr12,11520058,128194750,"CD164,ERP29,GATA1,HBE1,HBG1,HBG2,HBS1L,HDAC6,MYB,MYC,NMU,PVT1"
chr13,19040910,105391550,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"
chr14,27400449,97170083,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"
chr15,19960785,87166251,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"
chr16,46501260,63965316,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"
chr17,22603780,71592262,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"
chr18,29747998,78327940,"CD164,HBE1,HBG1,HBG2,HBS1L,MYB,MYC,NMU,PVT1"


## Save results

In [12]:
### set directory
fdiry = file.path(FD_RES, "region", "fcc_coverage")
fname = "region_screened.crispri_hcrff.bed.gz"
fpath = file.path(fdiry, fname)
print(fpath)

### make sure the output folder exists; write_tsv does not create it
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### write table without a header line (BED-style output)
dat = dat_guide_range_label
write_tsv(dat, fpath, col_names = FALSE)

[1] "/mount/repo/Proj_ENCODE_FCC/results/region/fcc_coverage/region_screened.crispri_hcrff.bed.gz"


In [13]:
### set directory
fdiry = file.path(FD_RES, "region", "fcc_coverage")
fname = "region_screened.crispri_hcrff.tsv"
fpath = file.path(fdiry, fname)
print(fpath)

### make sure the output folder exists; write_tsv does not create it
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### write table with a header line
dat = dat_guide_range_label
write_tsv(dat, fpath, col_names = TRUE)

[1] "/mount/repo/Proj_ENCODE_FCC/results/region/fcc_coverage/region_screened.crispri_hcrff.tsv"


In [20]:
### inspect the slots of the per-gene range object
### (uses the name defined earlier in this notebook; the previous
### `grg_range_reduced_by_gene` is not defined anywhere in the notebook)
grg = grg_guide_range_reduced_by_gene
slotNames(grg)

In [28]:
### chromosome name of every range in `grg`, as a plain character vector
as.character(seqnames(grg))

In [39]:
### tabulate the chromosome-level ranges
### (uses the name defined earlier in this notebook; the previous
### `grg_range_reduced` is not defined anywhere in the notebook)
grg = grg_guide_range_reduced
dat = as.data.frame(ranges(grg))
dat = dat %>% 
    dplyr::mutate(
        Chrom      = as.character(seqnames(grg)),
        ChromStart = `start`,
        ChromEnd   = `end`) %>%
    dplyr::select(Chrom, ChromStart, ChromEnd)
dat

Chrom,ChromStart,ChromEnd
<chr>,<int>,<int>
chr1,74582275,248233868
chr10,2350909,130456455
chr11,4091884,124027721
chr12,11520058,128194750
chr13,19040910,105391550
chr14,27400449,97170083
chr15,19960785,87166251
chr16,46501260,63965316
chr17,22603780,71592262
chr18,29747998,78327940


In [35]:
### tabulate the per-gene ranges; the range names hold the target gene
### (uses the name defined earlier in this notebook; the previous
### `grg_range_reduced_by_gene` is not defined anywhere in the notebook)
grg = grg_guide_range_reduced_by_gene
dat = as.data.frame(ranges(grg))
dat = dat %>% 
    dplyr::mutate(
        Chrom      = as.character(seqnames(grg)),
        ChromStart = `start`,
        ChromEnd   = `end`,
        Target     = `names`) %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, Target)
dat

Chrom,ChromStart,ChromEnd,Target
<chr>,<int>,<int>,<chr>
chr10,100694991,100851987,CAPRIN1
chr11,33064196,61797942,CAPRIN1
chr5,35617439,35670158,CAPRIN1
chr10,100694991,100851987,CAT
chr11,33064196,61797942,CAT
chr5,35617439,35670158,CAT
chr1,74582275,248233868,CD164
chr10,2350909,130456455,CD164
chr11,5996385,124027720,CD164
chr12,11520058,128194749,CD164


In [None]:
### NOTE(review): the rows below are plain tab-separated text, not R code;
### they are kept as comments so this cell parses when the notebook is run.
### They look like screened regions with their target genes
### (chromosome, start, end, targets) — confirm their source and intended use.
# chr11	4091884	6419310	HBE1,HBG1,HBG2,HBS1L,MYB
# chr11	33064196	34664117	CAPRIN1,CAT,LMO2
# chr11	61788524	61897153	FADS1,FADS2,FADS3,FEN1
# chr12	54300766	54301042	GATA1,HDAC6
# chr12	110726151	112325737	ERP29
# chr19	12887110	12887237	GATA1,HDAC6
# chr3	128487571	128487937	GATA1,HDAC6
# chr4	55237590	56068531	NMU
# chr5	87611207	89911163	MEF2C
# chr6	108840570	109828800	CD164
# chr6	134253831	136927585	HBE1,HBG1,HBG2,HBS1L,MYB
# chr8	126736094	128735225	MYC,PVT1
# chrX	48397930	49656988	GATA1,HDAC6