**Set environment**

In [1]:
### Source the project configuration quietly: messages and warnings raised
### while sourcing are suppressed.
### NOTE(review): the packages used below (dplyr, tidyr, stringr, readr, tibble)
### and the FD_* path variables are presumably set up by this script -- confirm.
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
### Print the environment summary (show_env is not defined in this notebook;
### presumably it comes from the sourced config)
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Import TMPRA data
```
OL13 (FADS)
OL43 (GATA/MYC)
OL45 (HBE1/LMO2/RBM38/HBA2/BCL11A)
```

**Check data file paths**

In [2]:
### folder with the tiling MPRA tables under the results directory; show its content
fdiry = file.path(FD_RES, "source", "MPRA", "Tewhey_TMPRA", "tiling_counts")
print(list.files(fdiry))

 [1] "FADS_tile_snp.20190214.attributes"       
 [2] "OL13_20220512_counts.out"                
 [3] "OL13_20220512_normalized_counts.out"     
 [4] "OL43_20211228_counts.out"                
 [5] "OL43_20211228_normalized_counts.out"     
 [6] "OL43_20221003_counts.out"                
 [7] "OL43_20221003_K562_normalized_counts.out"
 [8] "OL43_K562.bed"                           
 [9] "OL43.attributes"                         
[10] "OL45_20220927_counts.out"                
[11] "OL45_20220927_K562_normalized_counts.out"
[12] "OL45_K562.bed"                           
[13] "OL45.attributes"                         


In [3]:
### keep only the file names containing "out", i.e. the count tables
fdiry  = file.path(FD_RES, "source", "MPRA", "Tewhey_TMPRA", "tiling_counts")
fpaths = list.files(fdiry)
print(fpaths[grepl("out", fpaths)])

[1] "OL13_20220512_counts.out"                
[2] "OL13_20220512_normalized_counts.out"     
[3] "OL43_20211228_counts.out"                
[4] "OL43_20211228_normalized_counts.out"     
[5] "OL43_20221003_counts.out"                
[6] "OL43_20221003_K562_normalized_counts.out"
[7] "OL45_20220927_counts.out"                
[8] "OL45_20220927_K562_normalized_counts.out"


**Setup metadata**

In [4]:
### Metadata of the tables to import: one row per (dataset, processing) pair.
### Every dataset contributes a raw and a normalized table, so dataset-level
### values are repeated twice (each=2) while raw/norm alternates (times=3).
dat_meta = data.frame(
    Dataset      = rep(c("OL13_20220512", "OL43_20221003", "OL45_20220927"), each=2),
    Process      = rep(c("raw", "norm"), times=3),
    Genome       = rep(c("hg19", "hg38", "hg38"), each=2),
    ### number of input / output replicate columns in each table
    N_Rep_Input  = rep(c(4, 6, 4), each=2),
    N_Rep_Output = rep(c(4, 5, 4), each=2),
    ### file names are spelled out since the normalized tables are not named uniformly
    FName        = c(
        "OL13_20220512_counts.out",
        "OL13_20220512_normalized_counts.out",
        "OL43_20221003_counts.out",
        "OL43_20221003_K562_normalized_counts.out",
        "OL45_20220927_counts.out",
        "OL45_20220927_K562_normalized_counts.out")
)
dat_meta

Dataset,Process,Genome,N_Rep_Input,N_Rep_Output,FName
<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
OL13_20220512,raw,hg19,4,4,OL13_20220512_counts.out
OL13_20220512,norm,hg19,4,4,OL13_20220512_normalized_counts.out
OL43_20221003,raw,hg38,6,5,OL43_20221003_counts.out
OL43_20221003,norm,hg38,6,5,OL43_20221003_K562_normalized_counts.out
OL45_20220927,raw,hg38,4,4,OL45_20220927_counts.out
OL45_20220927,norm,hg38,4,4,OL45_20220927_K562_normalized_counts.out


**Read data**

In [5]:
### Read every table listed in dat_meta into a list of dataframes.
### Each dataframe gets the columns: Name, Input.rep1..N, Output.rep1..M
### (seq_len is used so that an empty dat_meta yields an empty list, not rows 1 and 0)
lst_dat_read = lapply(seq_len(nrow(dat_meta)), function(idx){

    ### Extract the metadata of the current table
    xrow  = dat_meta[idx,]
    fname = xrow$FName
    n_rep_input  = xrow$N_Rep_Input
    n_rep_output = xrow$N_Rep_Output
    
    ### set file directory
    fdiry = file.path(FD_RES, "source", "MPRA", "Tewhey_TMPRA", "tiling_counts")
    fpath = file.path(fdiry, fname)
          
    ### read data; the first column holds the oligo names and is moved
    ### from the row names into a regular column called "Name"
    dat = read.table(fpath, row.names=1)
    dat = dat %>% rownames_to_column(var = "Name")

    ### rename columns & assign
    ### NOTE(review): assumes the file lists all input replicates first,
    ### followed by the output replicates -- confirm against the file header
    cnames = c(
        "Name", 
        paste0("Input.rep",  seq_len(n_rep_input)), 
        paste0("Output.rep", seq_len(n_rep_output))
    )
    
    ### guard: assigning fewer names than columns would silently create NA names
    if (ncol(dat) != length(cnames)){
        stop(
            "Unexpected number of columns in ", fname, ": found ", ncol(dat), 
            " but the metadata implies ", length(cnames))
    }
    colnames(dat) = cnames
    
    ### show progress and return
    cat("\n=======================\n")
    cat(fname, "\n")
    cat("Shape:", dim(dat), "\n")
    print(head(dat, 3))
    flush.console()
    return(dat)
})

### label each table as <Dataset>.<Genome>.<Process>
names(lst_dat_read) = paste(dat_meta$Dataset, dat_meta$Genome, dat_meta$Process, sep=".")


OL13_20220512_counts.out 
Shape: 55229 9 
                                               Name Input.rep1 Input.rep2
1       (11:61555216-61555415;11:61555315:T:C_A_wC)       1609       1221
2 (11:61555231-61555430_RC;11:61555330:T:C_A_wC_RC)       1179        582
3 (11:61555315:T:C_A_wC_RC;11:61555216-61555415_RC)       1066        643
  Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
1       1396        798         845         283         574        1223
2       1225        721         476         416         912         956
3       1206        681         470         846         540         847

OL13_20220512_normalized_counts.out 
Shape: 55229 9 
                                               Name Input.rep1 Input.rep2
1       (11:61555216-61555415;11:61555315:T:C_A_wC)   881.8510  1196.3976
2 (11:61555231-61555430_RC;11:61555330:T:C_A_wC_RC)   646.1792   570.2730
3 (11:61555315:T:C_A_wC_RC;11:61555216-61555415_RC)   584.2468   630.0439
  Input.rep3 Input.rep4

**Debug**

In [6]:
### Debug: names and number of the imported tables
lst = lst_dat_read
print(names(lst))
length(lst)

[1] "OL13_20220512.hg19.raw"  "OL13_20220512.hg19.norm"
[3] "OL43_20221003.hg38.raw"  "OL43_20221003.hg38.norm"
[5] "OL45_20220927.hg38.raw"  "OL45_20220927.hg38.norm"


In [7]:
### Debug: expand the OL43 raw table so that every oligo name has its own row
dat = lst[["OL43_20221003.hg38.raw"]]

### strip the parentheses wrapped around the names
dat = dplyr::mutate(dat, Name = gsub("\\(|\\)", "", Name))
### names sharing one row of counts are joined by ";" -> one row per name
dat = tidyr::separate_rows(dat, Name, sep=";")
### drop the oligos whose name contains "Nadav"
dat = dplyr::filter(dat, !grepl(pattern = "Nadav", Name))

head(dat)

Name,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4,Output.rep5
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
8:127381651-127381850,1318,1810,1477,1505,738,687,620,636,781,520,538
8:127320401-127320600,1318,1810,1477,1505,738,687,620,636,781,520,538
8:127742001-127742200,634,730,698,761,316,343,158,194,268,141,251
8:128045001-128045200,416,432,418,409,181,211,11164,6818,12118,9512,10450
8:128176951-128177150,1150,1271,1325,1255,597,580,14919,9299,28832,15790,19501
X:48782961-48783160,345,370,389,373,211,187,17116,10370,22126,16725,18361


In [8]:
### Debug: expand the OL45 raw table so that every oligo name has its own row
dat = lst[["OL45_20220927.hg38.raw"]]

### strip the parentheses wrapped around the names
dat = dplyr::mutate(dat, Name = gsub("\\(|\\)", "", Name))
### names sharing one row of counts are joined by ";" -> one row per name
dat = tidyr::separate_rows(dat, Name, sep=";")
### drop the oligos whose name contains "Nadav" or "ORF"
dat = dplyr::filter(dat, !grepl(pattern = "Nadav|ORF", Name))

In [9]:
### first rows of the expanded OL45 raw table
head(dat)

Name,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1:10437778:C:T:R:wC,645,437,491,493,2239,1698,1202,2532
1:10451799:C:T:R:wC,603,584,512,429,336,280,321,333
1:110198727:C:G:R:wC,681,589,628,550,46046,37138,33992,53823
1:110881742:NA:NA,930,795,874,765,24701,18211,17386,29053
1:111838694:C:T:R:wC,834,579,755,763,34228,34188,32379,37480
1:113047064:NA:NA,635,500,560,550,390,326,243,574


In [10]:
### last rows of the expanded OL45 raw table
tail(dat)

Name,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
X:24819107:NA:NA,667,588,585,534,421,247,320,418
X:24893812:NA:NA,809,716,698,723,485,582,341,553
X:39110157:NA:NA,509,425,551,474,428,237,310,372
X:49154578:NA:NA,868,776,830,720,535,373,435,648
X:64815540:NA:NA,603,522,498,525,10737,9883,9705,13399
X:69674065:NA:NA,1461,1287,1267,1226,1172,1016,954,1365


In [11]:
### Names of tiling oligos start with "<chrom>:<start>-<end>", where the
### chromosome is a number, X or Y with an optional "chr" prefix.
### A group with alternation is required here: "[chr|0-9|X]" is a character
### class (any of c, h, r, |, digits, X), not "chr or digits or X".
pattern = "^(chr)?([0-9]+|[XY]):[0-9]+-[0-9]+"
res = dat %>% dplyr::filter(str_detect(string=Name, pattern=pattern))
print(dim(res))

[1] 91110     9


In [12]:
### first rows of the oligos whose name matches the tiling pattern
head(res)

Name,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr11:32869701-32869900,298,270,296,317,533,442,378,894
chr11:32869801-32870000,454,339,430,426,361,252,262,356
chr11:32869901-32870100,359,282,259,307,129,74,95,85
chr11:32870001-32870200,447,366,485,371,266,227,177,226
chr11:32870101-32870300,397,352,358,306,2057,1104,879,1983
chr11:32870201-32870400,296,227,294,214,349,244,222,338


In [13]:
### last rows of the oligos whose name matches the tiling pattern
tail(res)

Name,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr20:58391001-58391200,1138,862,1125,1080,1122,752,760,1095
chr20:58391101-58391300,947,792,1005,925,1395,1118,1281,1631
chr20:58391201-58391400,684,619,659,624,455,326,380,577
chr20:58391301-58391500,375,367,341,315,467,416,257,423
chr20:58391401-58391600,532,464,456,510,402,435,398,600
chr20:58391501-58391700,850,752,725,694,841,589,549,840


## Preprocess the table: Extract only tiling regions

In [14]:
### get_strand: "+" where x is NA, "-" otherwise.
### x is the optional 4th piece of an oligo name (e.g. "RC"); it is NA when
### the name carries no suffix after the coordinates.
get_strand = function(x){ifelse(is.na(x), "+", "-")}

### add_chrom: prepend "chr" unless the string already STARTS with "chr".
### (Testing for "chr" anywhere in the string would skip the prefix for a
### name that merely contains "chr" further down.) NA stays NA.
add_chrom  = function(x){
    x = as.character(x)
    ifelse(startsWith(x, "chr"), x, paste0("chr", x))
}

fun_extract   = function(dat) {
    ### Input:  dataframe with a `Name` column followed by the count columns
    ### Output: dataframe with the added columns Chrom, Start, End, Strand
    ### Description:
    ###     Preprocess the raw dataframe
    ###     - expand duplicates: names sharing one row of counts,
    ###       written as "(name1;name2)", get one row each
    ###     - filter and extract only tiling oligos, i.e. names starting
    ###       with "<chrom>:<start>-<end>"
    ###     - get location information from the name
    ### ===========================================
    ### init regex pattern: optional "chr" prefix, chromosome (number, X or Y),
    ### then "start-end". A group with alternation is required here:
    ### "[chr|0-9|X]" would be a character class (any of c, h, r, |, digits, X).
    pattern = "^(chr)?([0-9]+|[XY]):[0-9]+-[0-9]+"
    
    ### preprocess
    res = dat %>% 
        ### strip the parentheses and give every ";"-joined name its own row
        dplyr::mutate(Name = gsub("\\(|\\)", "", Name)) %>% 
        tidyr::separate_rows(Name, sep=";") %>%
        ### keep only the tiling oligos
        dplyr::filter(!grepl(pattern = "Nadav", Name)) %>%
        dplyr::filter(str_detect(string=Name, pattern=pattern)) %>%
        ### split the name on non-alphanumeric characters (default separator);
        ### names without a suffix after the coordinates get Strand = NA
        tidyr::separate(Name, c("Chrom", "Start", "End", "Strand"), remove=FALSE, fill="right") %>%
        dplyr::mutate(
            Chrom  = add_chrom(Chrom),
            Start  = as.integer(Start),
            End    = as.integer(End),
            Name   = add_chrom(Name),
            ### any suffix (e.g. "_RC") marks the minus strand
            Strand = get_strand(Strand))
    return(res)
}

In [15]:
### shapes of the tables as imported
print(lapply(lst_dat_read, dim))
cat("++++++++++++++++++++++++++++++++++++++++\n")

### keep only the tiling oligos and parse their locations
lst_dat_extract = lapply(lst_dat_read, fun_extract)
lst = lst_dat_extract

### shapes of the tables after preprocessing
print(lapply(lst_dat_extract, dim))

$OL13_20220512.hg19.raw
[1] 55229     9

$OL13_20220512.hg19.norm
[1] 55229     9

$OL43_20221003.hg38.raw
[1] 99307    12

$OL43_20221003.hg38.norm
[1] 99307    12

$OL45_20220927.hg38.raw
[1] 94381     9

$OL45_20220927.hg38.norm
[1] 94381     9

++++++++++++++++++++++++++++++++++++++++
$OL13_20220512.hg19.raw
[1] 43907    13

$OL13_20220512.hg19.norm
[1] 43907    13

$OL43_20221003.hg38.raw
[1] 96108    16

$OL43_20221003.hg38.norm
[1] 96108    16

$OL45_20220927.hg38.raw
[1] 91110    13

$OL45_20220927.hg38.norm
[1] 91110    13



**Show results**

In [16]:
### show three random rows of every preprocessed table (seed fixed for a stable display)
set.seed(123)
lst = lst_dat_extract
lapply(lst, function(tbl){slice_sample(tbl, n=3)})

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr11:61559376-61559575_RC,chr11,61559376,61559575,-,363,256,368,252,0,32,0,37
chr11:61628981-61629180_RC,chr11,61628981,61629180,-,823,497,638,344,78,300,263,298
chr11:61628421-61628620,chr11,61628421,61628620,+,400,378,637,250,138,165,295,236

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<chr>,<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11:61648876-61649075_RC,chr11,61648876,61649075,-,303.0849,283.1768,304.58401,238.8869,179.7249,181.1592,0.0,363.8974
chr11:61558796-61558995,chr11,61558796,61558995,+,488.8819,550.676,591.3955,539.9232,842.9954,215.3781,674.4801,461.588
chr11:61652526-61652725,chr11,61652526,61652725,+,118.932,117.5821,86.41116,140.8073,0.0,255.6357,442.8662,124.5555

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4,Output.rep5
<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr8:127212351-127212550,chr8,127212351,127212550,+,498,576,544,596,231,270,228,201,369,182,251
chrX:48539581-48539780,chrX,48539581,48539780,+,271,290,221,253,108,139,15877,10329,17175,14124,14530
chrX:48294251-48294450,chrX,48294251,48294450,+,181,221,277,253,96,106,38,73,128,74,105

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4,Output.rep5
<chr>,<chr>,<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr8:127441201-127441400,chr8,127441201,127441400,+,233.6449,222.3104,171.1604,221.1921,215.1536,252.4283,228.0332,121.5305,212.6315,184.5061,156.8328
chr8:127491151-127491350,chr8,127491151,127491350,+,438.3367,409.4564,412.1954,390.8565,355.9815,352.6189,642.2158,616.5884,457.6434,496.6147,508.6468
chr8:128028501-128028700,chr8,128028501,128028700,+,1120.4181,1169.3644,1135.3002,1182.6237,1090.1117,1122.9157,1002.1049,886.4576,943.3499,1046.6844,916.9771

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr20:58250701-58250900,chr20,58250701,58250900,+,225,216,217,192,120,105,121,98
chr11:33783701-33783900,chr11,33783701,33783900,+,786,596,647,773,2050,1933,1917,2263
chr11:5570301-5570500,chr11,5570301,5570500,+,294,254,226,187,158,137,110,250

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<chr>,<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr2:60047901-60048100,chr2,60047901,60048100,+,635.3226,677.6804,700.1009,686.1342,446.9841,381.6018,419.6503,503.62848
chr11:33672501-33672700,chr11,33672501,33672700,+,711.149,671.5987,617.9851,668.9163,507.3208,438.6918,417.9581,714.68343
chr11:34229601-34229800,chr11,34229601,34229800,+,128.8313,118.1597,126.1367,141.187,155.1515,174.2748,135.3711,92.68551


In [17]:
### number of oligos per chromosome in every preprocessed table
lst = lst_dat_extract
lapply(lst, function(tbl){
    table(tbl$Chrom)
})

$OL13_20220512.hg19.raw

chr11 
43907 

$OL13_20220512.hg19.norm

chr11 
43907 

$OL43_20221003.hg38.raw

 chr8  chrX 
41905 54203 

$OL43_20221003.hg38.norm

 chr8  chrX 
41905 54203 

$OL45_20220927.hg38.raw

chr11 chr16  chr2 chr20 
39857 11439 19917 19897 

$OL45_20220927.hg38.norm

chr11 chr16  chr2 chr20 
39857 11439 19917 19897 


## Filter out the fragments (Oligos) where mean input is zero

**Helper function**

In [18]:
fun_filter = function(dat){
    ### Input:  dataframe (Name, Chrom, Start, End, Strand + sample columns
    ###         named "<Group>.<Rep>", e.g. Input.rep1)
    ### Output: dataframe with the same columns, restricted to the kept fragments
    ### Description:
    ###     drop the fragments whose mean over the Input columns is zero
    ###     - reshape the sample columns from wide to long
    ###     - average the Input values of each fragment
    ###     - keep the rows of `dat` whose Name has a positive Input mean
    
    ### names of the fragments with a positive mean of input
    names_kept = dat %>% 
        tidyr::gather(Sample, Value, -Chrom, -Start, -End, -Name, -Strand) %>% 
        tidyr::separate(Sample, c("Group", "Rep"), sep="\\.", remove=FALSE) %>%
        dplyr::filter(Group == "Input") %>%
        dplyr::group_by(Chrom, Start, End, Name, Strand, Group) %>%
        dplyr::summarise(Value = mean(Value), .groups="drop") %>%
        dplyr::filter(Value > 0) %>%
        dplyr::pull(Name) %>%
        unique()
    
    ### subset the original (wide) table
    dat_kept = dplyr::filter(dat, Name %in% names_kept)
    return(dat_kept)
}

**Filter**

In [19]:
### shapes before filtering
print(lapply(lst_dat_extract, dim))
cat("++++++++++++++++++++++++++++++++++++++++\n")

### drop the fragments whose mean input is zero
lst_dat_filter = lapply(lst_dat_extract, fun_filter)
lst = lst_dat_filter

### shapes after filtering
print(lapply(lst_dat_filter, dim))

$OL13_20220512.hg19.raw
[1] 43907    13

$OL13_20220512.hg19.norm
[1] 43907    13

$OL43_20221003.hg38.raw
[1] 96108    16

$OL43_20221003.hg38.norm
[1] 96108    16

$OL45_20220927.hg38.raw
[1] 91110    13

$OL45_20220927.hg38.norm
[1] 91110    13

++++++++++++++++++++++++++++++++++++++++
$OL13_20220512.hg19.raw
[1] 43871    13

$OL13_20220512.hg19.norm
[1] 43871    13

$OL43_20221003.hg38.raw
[1] 96087    16

$OL43_20221003.hg38.norm
[1] 96087    16

$OL45_20220927.hg38.raw
[1] 91092    13

$OL45_20220927.hg38.norm
[1] 91092    13



**Show results**

In [20]:
### first rows of the first filtered table
lst = lst_dat_filter
dat = lst[[1]]
head(dat)

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr11:61555216-61555415,chr11,61555216,61555415,+,1609,1221,1396,798,845,283,574,1223
chr11:61555231-61555430_RC,chr11,61555231,61555430,-,1179,582,1225,721,476,416,912,956
chr11:61555216-61555415_RC,chr11,61555216,61555415,-,1066,643,1206,681,470,846,540,847
chr11:61555231-61555430,chr11,61555231,61555430,+,1063,701,786,416,357,568,599,518
chr11:61555366-61555565_RC,chr11,61555366,61555565,-,1239,722,1341,682,311,421,604,556
chr11:61555366-61555565,chr11,61555366,61555565,+,1362,885,1213,856,407,49,519,344


## Calculate mean for normalized count table

**Helper function**

In [21]:
fun_add_mean_column = function(dat, pseudocount = 0){
    ### Input:  dataframe with replicate columns whose names start with
    ###         "Input" and "Output" (e.g. Input.rep1, Output.rep1)
    ###         pseudocount: value added to both means before taking log2;
    ###         the default 0 keeps the plain log2 ratio
    ### Output: the same dataframe with three added columns
    ###         Input.mean, Output.mean, Log2FC.mean
    ### Description:
    ###     average the replicates per fragment and calculate
    ###     log2(Output.mean) - log2(Input.mean)
    ###     Note: with pseudocount = 0 a fragment whose Output.mean is 0
    ###     gets Log2FC.mean = -Inf
    
    ### replicate columns (case-insensitive prefix); the derived mean columns
    ### are excluded so that re-running on a processed table uses replicates only
    cnames   = colnames(dat)
    is_inp   = grepl("^Input",  cnames, ignore.case=TRUE) & cnames != "Input.mean"
    is_out   = grepl("^Output", cnames, ignore.case=TRUE) & cnames != "Output.mean"
    
    ### get input and output to calculate log2FC
    x_inp = rowMeans(as.matrix(dat[, cnames[is_inp], drop=FALSE]))
    x_out = rowMeans(as.matrix(dat[, cnames[is_out], drop=FALSE]))
    x_lfc = log2(x_out + pseudocount) - log2(x_inp + pseudocount)
    
    ### add columns
    dat$Input.mean  = x_inp
    dat$Output.mean = x_out
    dat$Log2FC.mean = x_lfc
 
    return(dat)
}

**Calculate**

In [22]:
### list all table names, then the subset holding normalized counts
lst = lst_dat_filter
print(names(lst))
cat("=======================\n")
sid = names(lst)[grepl("norm", names(lst))]
print(sid)

[1] "OL13_20220512.hg19.raw"  "OL13_20220512.hg19.norm"
[3] "OL43_20221003.hg38.raw"  "OL43_20221003.hg38.norm"
[5] "OL45_20220927.hg38.raw"  "OL45_20220927.hg38.norm"
[1] "OL13_20220512.hg19.norm" "OL43_20221003.hg38.norm"
[3] "OL45_20220927.hg38.norm"


In [23]:
### start from the filtered tables and pick the normalized ones by name
lst  = lst_dat_filter
idxs = names(lst)[grepl("norm", names(lst))]

### the mean and log2FC columns are added to the normalized tables only;
### the raw tables are carried over unchanged
for (idx in idxs){
    cat(idx, "\n")
    lst[[idx]] = fun_add_mean_column(lst[[idx]])
}

### keep the result under its own name
lst_dat_prep = lst

OL13_20220512.hg19.norm 
OL43_20221003.hg38.norm 
OL45_20220927.hg38.norm 


**Show results**

In [24]:
### name and first rows of the 1st prepared table
lst = lst_dat_prep
idx = names(lst)[1]
dat = lst[[idx]]

cat(idx)
head(dat, 3)

OL13_20220512.hg19.raw

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr11:61555216-61555415,chr11,61555216,61555415,+,1609,1221,1396,798,845,283,574,1223
chr11:61555231-61555430_RC,chr11,61555231,61555430,-,1179,582,1225,721,476,416,912,956
chr11:61555216-61555415_RC,chr11,61555216,61555415,-,1066,643,1206,681,470,846,540,847


In [25]:
### name and first rows of the 2nd prepared table
lst = lst_dat_prep
idx = names(lst)[2]
dat = lst[[idx]]

cat(idx)
head(dat, 3)

OL13_20220512.hg19.norm

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4,Input.mean,Output.mean,Log2FC.mean
<chr>,<chr>,<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11:61555216-61555415,chr11,61555216,61555415,+,881.851,1196.3976,855.5317,774.9257,1807.947,569.6449,730.4747,1493.445,927.1765,1150.378,0.3111919
chr11:61555231-61555430_RC,chr11,61555231,61555430,-,646.1792,570.273,750.7352,700.1522,1018.441,837.3579,1160.6148,1167.403,666.8349,1045.954,0.649418
chr11:61555216-61555415_RC,chr11,61555216,61555415,-,584.2468,630.0439,739.0912,661.3088,1005.604,1702.8961,687.2061,1034.299,653.6727,1107.501,0.760668


In [26]:
### name and first rows of the 3rd prepared table
lst = lst_dat_prep
idx = names(lst)[3]
dat = lst[[idx]]

cat(idx)
head(dat, 3)

OL43_20221003.hg38.raw

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4,Output.rep5
<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr8:127381651-127381850,chr8,127381651,127381850,+,1318,1810,1477,1505,738,687,620,636,781,520,538
chr8:127320401-127320600,chr8,127320401,127320600,+,1318,1810,1477,1505,738,687,620,636,781,520,538
chr8:127742001-127742200,chr8,127742001,127742200,+,634,730,698,761,316,343,158,194,268,141,251


In [27]:
### name and first rows of the 4th prepared table
lst = lst_dat_prep
idx = names(lst)[4]
dat = lst[[idx]]

cat(idx)
head(dat, 3)

OL43_20221003.hg38.norm

Name,Chrom,Start,End,Strand,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4,Output.rep5,Input.mean,Output.mean,Log2FC.mean
<chr>,<chr>,<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr8:127381651-127381850,chr8,127381651,127381850,+,887.4466,1078.7715,946.8314,945.7219,962.3235,893.9085,961.7725,1136.6673,842.9706,896.6654,760.1443,952.5006,919.644,-0.0506444
chr8:127320401-127320600,chr8,127320401,127320600,+,887.4466,1078.7715,946.8314,945.7219,962.3235,893.9085,961.7725,1136.6673,842.9706,896.6654,760.1443,952.5006,919.644,-0.0506444
chr8:127742001-127742200,chr8,127742001,127742200,+,426.8901,435.0846,447.4531,478.2022,412.0518,446.3037,245.0969,346.7193,289.2652,243.1343,354.6398,440.9976,295.7711,-0.5762898


## Store the fragments and counts
```
* OL13 (FADS)
* OL43 (GATA/MYC)
* OL45 (HBE1/LMO2/RBM38/HBA2/BCL11A)

- OL13_20220512_counts.out
- OL13_20220512_normalized_counts.out
- OL43_20211228_counts.out      
- OL43_20211228_normalized_counts.out
- OL43_20221003_counts.out 
- OL43_20221003_K562_normalized_counts.out
- OL45_20220927_counts.out 
- OL45_20220927_K562_normalized_counts.out

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Fragments

OL13_20220512.hg19.stranded_pos.bed
OL43_20221003.hg38.stranded_pos.bed
OL45_20220927.hg38.stranded_pos.bed

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Counts: OL13_20220512

OL13_20220512.hg19.raw.Input .rep[1-4].stranded_pos.bed
OL13_20220512.hg19.raw.Output.rep[1-4].stranded_pos.bed

OL13_20220512.hg19.norm.Input .mean.stranded_pos.bed
OL13_20220512.hg19.norm.Output.mean.stranded_pos.bed
OL13_20220512.hg19.norm.Log2FC.mean.stranded_pos.bed

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Counts: OL43_20221003

OL43_20221003.hg38.raw.Input .rep[1-6].stranded_pos.bed
OL43_20221003.hg38.raw.Output.rep[1-5].stranded_pos.bed

OL43_20221003.hg38.norm.Input .rep[1-6].stranded_pos.bed
OL43_20221003.hg38.norm.Output.rep[1-5].stranded_pos.bed

OL43_20221003.hg38.norm.Input .mean.stranded_pos.bed
OL43_20221003.hg38.norm.Output.mean.stranded_pos.bed
OL43_20221003.hg38.norm.Log2FC.mean.stranded_pos.bed

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Counts: OL45_20220927

OL45_20220927.hg38.raw.Input .rep[1-4].stranded_pos.bed
OL45_20220927.hg38.raw.Output.rep[1-4].stranded_pos.bed

OL45_20220927.hg38.norm.Input .rep[1-4].stranded_pos.bed
OL45_20220927.hg38.norm.Output.rep[1-4].stranded_pos.bed

OL45_20220927.hg38.norm.Input .mean.stranded_pos.bed
OL45_20220927.hg38.norm.Output.mean.stranded_pos.bed
OL45_20220927.hg38.norm.Log2FC.mean.stranded_pos.bed
```

## Store fragment counts

In [28]:
#CHROMS = paste0("chr", c(1:22, "X", "Y"))
#CHROMS

**Store fragments**

In [29]:
### names of the tables holding raw counts
lst  = lst_dat_prep
idxs = names(lst)[grepl("raw", names(lst))]
idxs

In [30]:
### Write the plus-strand fragments of every raw table as a 6-column,
### header-less, tab-separated file named <table name>.stranded_pos.bed
assay  = "Tewhey_K562_TileMPRA"
folder = "fragment"
strand = "stranded_pos"

lst  = lst_dat_prep
idxs = grep("raw", names(lst), value=TRUE)

### the output directory is the same for every sample: build it once and
### create it when missing so that writing does not fail on a fresh setup
fdiry = file.path(FD_RES, "results", assay, folder)
dir.create(fdiry, recursive=TRUE, showWarnings=FALSE)

for (idx in idxs){
    ### show progress
    cat("\n=======================")
    cat("\nSample:", idx, "\n")
    
    ### extract positive strands, order by positions, and
    ### arrange the columns into bed file format (Score is a "." placeholder)
    ### NOTE(review): Start is written exactly as parsed from the oligo name;
    ### BED expects 0-based starts -- confirm whether the names are already
    ### 0-based or whether Start - 1 is needed downstream
    dat = lst[[idx]] %>% 
        dplyr::filter(Strand == "+") %>% 
        dplyr::arrange(Chrom, Start, End) %>%
        dplyr::mutate(Score = ".") %>%
        dplyr::select(Chrom, Start, End, Name, Score, Strand)
    
    ### save table
    fname = paste(idx, strand, "bed", sep=".")
    fpath = file.path(fdiry, fname)
    write_tsv(dat, fpath, col_names=FALSE)

    ### show progress
    print(head(dat))
    cat("\nSaved Table:", fname, "\n")
}


Sample: OL13_20220512.hg19.raw 
[90m# A tibble: 6 × 6[39m
  Chrom    Start      End Name                    Score Strand
  [3m[90m<chr>[39m[23m    [3m[90m<int>[39m[23m    [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m                   [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m 
[90m1[39m chr11 61[4m5[24m[4m5[24m[4m4[24m801 61[4m5[24m[4m5[24m[4m5[24m000 chr11:61554801-61555000 .     +     
[90m2[39m chr11 61[4m5[24m[4m5[24m[4m4[24m806 61[4m5[24m[4m5[24m[4m5[24m005 chr11:61554806-61555005 .     +     
[90m3[39m chr11 61[4m5[24m[4m5[24m[4m4[24m811 61[4m5[24m[4m5[24m[4m5[24m010 chr11:61554811-61555010 .     +     
[90m4[39m chr11 61[4m5[24m[4m5[24m[4m4[24m816 61[4m5[24m[4m5[24m[4m5[24m015 chr11:61554816-61555015 .     +     
[90m5[39m chr11 61[4m5[24m[4m5[24m[4m4[24m821 61[4m5[24m[4m5[24m[4m5[24m020 chr11:61554821-61555020 .     +     
[90m6[39m chr11 61[4m5[24m[4m5[24m[4m4[24m826 61[4m5[2

**Store table of counts**

In [31]:
### Write the plus-strand rows of every prepared table (raw and normalized)
### as a tab-separated file with header named <table name>.stranded_pos.tsv
assay  = "Tewhey_K562_TileMPRA"
folder = "fragment_count"
strand = "stranded_pos"

lst  = lst_dat_prep
idxs = names(lst)

### the output directory is the same for every sample: build it once and
### create it when missing so that writing does not fail on a fresh setup
fdiry = file.path(FD_RES, "results", assay, folder, "summary")
dir.create(fdiry, recursive=TRUE, showWarnings=FALSE)
cat("Saved directory:", "\n")
cat(fdiry, "\n")

for (idx in idxs){
    ### show progress
    cat("\n=======================")
    cat("\nSample:", idx, "\n")
    
    ### extract positive strands and order by positions
    dat = lst[[idx]] %>% 
        dplyr::filter(Strand == "+") %>% 
        dplyr::arrange(Chrom, Start, End)
    
    ### save table
    fname = paste(idx, strand, "tsv", sep=".")
    fpath = file.path(fdiry, fname)
    write_tsv(dat, fpath)

    ### show progress
    print(head(dat))
    cat("\nSaved Table:", fname, "\n")
}

Saved directory: 
/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/fragment_count/summary 

Sample: OL13_20220512.hg19.raw 
[90m# A tibble: 6 × 13[39m
  Name        Chrom  Start    End Strand Input…¹ Input…² Input…³ Input…⁴ Outpu…⁵
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m  [3m[90m<int>[39m[23m  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m    [3m[90m<int>[39m[23m   [3m[90m<int>[39m[23m   [3m[90m<int>[39m[23m   [3m[90m<int>[39m[23m   [3m[90m<int>[39m[23m
[90m1[39m chr11:6155… chr11 6.16[90me[39m7 6.16[90me[39m7 +          971     542     785     640     112
[90m2[39m chr11:6155… chr11 6.16[90me[39m7 6.16[90me[39m7 +         [4m1[24m267     562    [4m1[24m059     564     211
[90m3[39m chr11:6155… chr11 6.16[90me[39m7 6.16[90me[39m7 +         [4m1[24m183     641    [4m1[24m118     655     261
[90m4[39m chr11:6155… chr11 6.16[90me[39m7 6.16[90me[39m7 +         [4m1[24m020     476