**Set environment**

In [1]:
### load the project configuration script;
### messages and warnings raised while sourcing it are muted
suppressMessages(
    suppressWarnings(
        source("../run_config_project_sing.R")
    )
)

### print the environment summary (base, work, data and project directories)
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



## Import data

**Import library design**

In [2]:
### build the path of the library design table
### (one row per oligo: name, category, hg38 coordinates, strand, sequence)
txt_assay = "MPRA_Lenti_K562_Nadav_Vikram_230621"
txt_fname = "LentiMPRA.K562.Library_Design.csv"
txt_fdiry = file.path(FD_DAT, "processed", txt_assay)
txt_fpath = file.path(txt_fdiry, txt_fname)

### import the CSV; the column type summary is silenced
dat_lmpra_library = read_csv(txt_fpath, show_col_types = FALSE)

### keep the generic handle `dat` on the imported table,
### then report its dimensions and first rows
dat = dat_lmpra_library
print(dim(dat_lmpra_library))
fun_display_table(head(dat_lmpra_library))

[1] 243780      7


name,category,chr.hg38,start.hg38,stop.hg38,str.hg38,230nt sequence (15nt 5' adaptor - 200nt element - 15nt 3' adaptor)
peak1,potential enhancer,chr1,115631,115831,+,AGGACCGGATCAACTGAGCCGGGTCATGAAAAAGGGGATCTTGTGTGTCTGTCCACGATAAGCACTATCACAAGGACTTTCTATAAACTCACAAGAAATTTCTGCCCACCCAGCACACAGTTTGTCCAGCTCATCCTGTAGGTGTCTCTATAATAGGACCTATCATAAAAAATTCCTCAAGACTGCAGCATTTCAGATAAGCCACCCTCACAAGACATTGCGTGAACCGA
peak1_Reversed:,potential enhancer,chr1,115631,115831,-,AGGACCGGATCAACTTCTTGTGAGGGTGGCTTATCTGAAATGCTGCAGTCTTGAGGAATTTTTTATGATAGGTCCTATTATAGAGACACCTACAGGATGAGCTGGACAAACTGTGTGCTGGGTGGGCAGAAATTTCTTGTGAGTTTATAGAAAGTCCTTGTGATAGTGCTTATCGTGGACAGACACACAAGATCCCCTTTTTCATGACCCGGCTCCATTGCGTGAACCGA
peak10,potential enhancer,chr1,634309,634509,+,AGGACCGGATCAACTTGTCGCCTTAATCCAAGCCTACGTTTTTACACTTCTAGTAAGCCTCTACCTGCACGACAACACATAATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCCATGGCCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATTTCACTTCCACTCCACAACCCTCCTCATACTCATTGCGTGAACCGA
peak10_Reversed:,potential enhancer,chr1,634309,634509,-,AGGACCGGATCAACTAGTATGAGGAGGGTTGTGGAGTGGAAGTGAAATCACATGGCTAGGCCGGAGGTCATTAGGAGGGCTGAGAGGGCCCCTGTTAGGGGCCATGGGCTGGGTTTTACTATATGATAGGCATGTGATTGGTGGGTCATTATGTGTTGTCGTGCAGGTAGAGGCTTACTAGAAGTGTAAAAACGTAGGCTTGGATTAAGGCGACACATTGCGTGAACCGA
peak100,potential enhancer,chr1,2133494,2133694,+,AGGACCGGATCAACTCCCCTCCCCAGCTGTGCGCCCGCCCCTTGGTTCCACCCCCCCCAGCTGTGCATCCGTCCCTTGGCTCCGCCCCGCACTGTGCGTCCATTTTTGACTCCGCCCCCGGCTGTGCGCTCATCCCTCGGCTCCGCCCCCGGCTGTGCGTCCGTCCCTCGGTTCCGCCCCCGGCTGCGCGTCTGTCCCTCGACTCGGCCCCTCAGCATTGCGTGAACCGA
peak100_Reversed:,potential enhancer,chr1,2133494,2133694,-,AGGACCGGATCAACTCTGAGGGGCCGAGTCGAGGGACAGACGCGCAGCCGGGGGCGGAACCGAGGGACGGACGCACAGCCGGGGGCGGAGCCGAGGGATGAGCGCACAGCCGGGGGCGGAGTCAAAAATGGACGCACAGTGCGGGGCGGAGCCAAGGGACGGATGCACAGCTGGGGGGGGTGGAACCAAGGGGCGGGCGCACAGCTGGGGAGGGGCATTGCGTGAACCGA


**Import effect size**

In [3]:
### build the path of the Log2FC matrix
### (one row per oligo: log2(rna/dna) of each replicate and their mean)
txt_assay = "MPRA_Lenti_K562_Nadav_Vikram_230621"
txt_fname = "LentiMPRA.K562.Log2FC.matrix.csv"
txt_fdiry = file.path(FD_DAT, "processed", txt_assay)
txt_fpath = file.path(txt_fdiry, txt_fname)

### import the CSV; the column type summary is silenced
dat_lmpra_score_matrix = read_csv(txt_fpath, show_col_types = FALSE)

### keep the generic handle `dat` on the imported table,
### then report its dimensions and first rows
dat = dat_lmpra_score_matrix
print(dim(dat_lmpra_score_matrix))
fun_display_table(head(dat_lmpra_score_matrix))

[1] 226254      5


name,replicate 1 [log2(rna/dna)],replicate 2 [log2(rna/dna)],replicate 3 [log2(rna/dna)],mean
peak29573_Reversed:,-0.19,0.225,0.201,0.079
peak71109_Reversed:,0.65,0.454,0.758,0.62
ENSG00000123569,-0.762,-0.578,-0.228,-0.523
peak25742_Reversed:,-0.511,-0.644,-0.154,-0.436
peak83398,-0.154,-0.298,0.103,-0.116
peak1037_Reversed:,-0.056,0.275,0.008,0.076


## Arrange data and Z score transformation

**Join library design and score matrix**

In [4]:
### annotate every scored oligo with its library design record (matched on the
### oligo name); the left join keeps all rows of the score matrix.
### The BED-like columns are then picked and renamed in a single select call,
### using the mean across replicates as the score.
dat_lmpra_score_arrange = dat_lmpra_score_matrix %>%
    dplyr::left_join(dat_lmpra_library, by = "name") %>%
    dplyr::select(
        Chrom      = chr.hg38,
        ChromStart = start.hg38,
        ChromEnd   = stop.hg38,
        Name       = name,
        Score      = mean,
        Strand     = str.hg38,
        Category   = category
    )

### keep the generic handle `dat` on the arranged table and show it
dat = dat_lmpra_score_arrange
print(dim(dat_lmpra_score_arrange))
fun_display_table(head(dat_lmpra_score_arrange))

[1] 226254      7


Chrom,ChromStart,ChromEnd,Name,Score,Strand,Category
chr16,3163448,3163648,peak29573_Reversed:,0.079,-,potential enhancer
chr6,125083195,125083395,peak71109_Reversed:,0.62,-,potential enhancer
chrX,104013573,104013773,ENSG00000123569,-0.523,-,promoter
chr14,76947662,76947862,peak25742_Reversed:,-0.436,-,potential enhancer
chr9,98271532,98271732,peak83398,-0.116,+,potential enhancer
chr1,17059727,17059927,peak1037_Reversed:,0.076,-,potential enhancer


**Extract Log2FC of negative control (Shuffled sequences)**

In [5]:
### extract negative control (shuffled sequences)
### rows whose Category is missing never match and are therefore excluded
txt_category = "negative control, shuffled"
dat = dat_lmpra_score_arrange %>%
    dplyr::filter(Category == txt_category)

### fail loudly if the category label is absent; otherwise the mean/sd below
### would be NA and every downstream z score would silently become NA
if (nrow(dat) == 0) {
    stop("No oligo found with Category == '", txt_category, "'")
}

### calculate mean and sd of the null distribution (missing scores are ignored)
vec = dat$Score
num_valid = sum(!is.na(vec))
if (num_valid < 2) {
    stop("At least two non-missing negative control scores are required; found ", num_valid)
}
if (num_valid < length(vec)) {
    warning(length(vec) - num_valid, " negative control score(s) are missing and were ignored")
}
num_mu = mean(vec, na.rm = TRUE)
num_sd = sd(vec,   na.rm = TRUE)

### a zero spread would make the z score transformation divide by zero
if (!is.finite(num_sd) || num_sd <= 0) {
    stop("Standard deviation of the negative controls is not positive: ", num_sd)
}

### assign and show
dat_lmpra_score_negative_shuffled = dat
print(dim(dat))
cat("Mean =",  num_mu, "\n")
cat("Std  = ", num_sd, "\n")
fun_display_table(head(dat))

[1] 247   7
Mean = -0.5249433 
Std  =  0.2754342 


Chrom,ChromStart,ChromEnd,Name,Score,Strand,Category
,,,seq5206_shuffled_0,-0.453,,"negative control, shuffled"
,,,seq31328_shuffled_0,-0.768,,"negative control, shuffled"
,,,seq12212_shuffled_0,-0.6,,"negative control, shuffled"
,,,seq9803_shuffled_0,-0.933,,"negative control, shuffled"
,,,seq2756_shuffled_0,-0.545,,"negative control, shuffled"
,,,seq13776_shuffled_0,-0.837,,"negative control, shuffled"


**Z Score transformation**

In [6]:
### the transformation needs a finite centre and a positive spread
### (num_mu / num_sd are computed from the shuffled negative controls)
stopifnot(is.finite(num_mu), is.finite(num_sd), num_sd > 0)

### z score transformation: centre and scale each Log2FC by the negative
### control distribution, then sort by genomic position
dat = dat_lmpra_score_arrange
dat = dat %>% 
    dplyr::mutate(ZScore = (Score - num_mu) / num_sd) %>% 
    dplyr::arrange(Chrom, ChromStart, ChromEnd)

### full table: every oligo, including those without genomic coordinates
dat_lmpra_score_full = dat

### BED tables: keep only oligos that have genomic coordinates.
### Oligos without coordinates (e.g. the shuffled negative controls) would be
### written as "NA NA NA" records, which are not valid BED intervals.
### Coordinates are cast to integer because they are parsed as doubles and
### readr may serialise integer-valued doubles in scientific notation
### (e.g. 1e5), which BED parsers do not accept.
dat_bed = dat %>%
    dplyr::filter(!is.na(Chrom), !is.na(ChromStart), !is.na(ChromEnd)) %>%
    dplyr::mutate(
        ChromStart = as.integer(ChromStart),
        ChromEnd   = as.integer(ChromEnd)
    )
cat("Oligos without coordinates excluded from BED tables:", nrow(dat) - nrow(dat_bed), "\n")

### assign and show
dat_lmpra_score_log2fc = dat_bed %>% dplyr::select(Chrom, ChromStart, ChromEnd, Name,  Score, Strand)
dat_lmpra_score_zscore = dat_bed %>% dplyr::select(Chrom, ChromStart, ChromEnd, Name, ZScore, Strand)
fun_display_table(head(dat))

Chrom,ChromStart,ChromEnd,Name,Score,Strand,Category,ZScore
chr1,115631,115831,peak1_Reversed:,0.04,-,potential enhancer,2.0511008
chr1,138940,139140,peak2_Reversed:,-0.101,-,potential enhancer,1.5391818
chr1,138940,139140,peak2,-0.543,+,potential enhancer,-0.0655571
chr1,203225,203425,peak3,-0.186,+,potential enhancer,1.2305781
chr1,203225,203425,peak3_Reversed:,-0.281,-,potential enhancer,0.8856682
chr1,267911,268111,peak4_Reversed:,-0.967,-,potential enhancer,-1.6049447


## Save results

**Write table**

In [7]:
### set directory
txt_assay  = "MPRA_Lenti_K562_Nadav_Vikram"
txt_folder = "fragment_counts"
txt_fdiry  = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
txt_fname  = "LMPRA_K562.hg38.Log2FC.stranded.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### init: create the output directory (and any missing parents) from R itself;
### unlike a pasted "mkdir -p" shell command this is safe for paths containing
### spaces or shell metacharacters, and it is silent if the directory exists
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

### write table: Log2FC scores as a headerless, tab-separated file
### (compressed according to the .gz extension of the path)
dat = dat_lmpra_score_log2fc
write_tsv(dat, txt_fpath, col_names=FALSE)

In [8]:
### set directory
txt_assay  = "MPRA_Lenti_K562_Nadav_Vikram"
txt_folder = "fragment_counts"
txt_fdiry  = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
txt_fname  = "LMPRA_K562.hg38.ZScore.stranded.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### init: create the output directory (and any missing parents) from R itself;
### unlike a pasted "mkdir -p" shell command this is safe for paths containing
### spaces or shell metacharacters, and it is silent if the directory exists
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

### write table: z scores as a headerless, tab-separated file
### (compressed according to the .gz extension of the path)
dat = dat_lmpra_score_zscore
write_tsv(dat, txt_fpath, col_names=FALSE)

**Check results**

In [17]:
### set directory
txt_assay  = "MPRA_Lenti_K562_Nadav_Vikram"
txt_folder = "fragment_counts"
txt_fdiry  = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
txt_fname  = "LMPRA_K562.hg38.Log2FC.stranded.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### read the first ten records straight from the gzip file;
### this avoids building an unquoted "zcat ... | head" shell pipeline, which
### breaks on paths with spaces and depends on an external zcat binary
con = gzfile(txt_fpath, open = "rt")
vec = tryCatch(readLines(con, n = 10), finally = close(con))

### print one record per line
for(txt in vec){cat(txt, "\n")}

chr1	115631	115831	peak1_Reversed:	0.04	- 
chr1	138940	139140	peak2_Reversed:	-0.101	- 
chr1	138940	139140	peak2	-0.543	+ 
chr1	203225	203425	peak3	-0.186	+ 
chr1	203225	203425	peak3_Reversed:	-0.281	- 
chr1	267911	268111	peak4_Reversed:	-0.967	- 
chr1	267911	268111	peak4	0.225	+ 
chr1	586093	586293	peak5	-0.441	+ 
chr1	586093	586293	peak5_Reversed:	-0.108	- 
chr1	629847	630047	peak6_Reversed:	-0.923	- 


In [18]:
### set directory
txt_assay  = "MPRA_Lenti_K562_Nadav_Vikram"
txt_folder = "fragment_counts"
txt_fdiry  = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
txt_fname  = "LMPRA_K562.hg38.ZScore.stranded.bed.gz"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### read the first ten records straight from the gzip file;
### this avoids building an unquoted "zcat ... | head" shell pipeline, which
### breaks on paths with spaces and depends on an external zcat binary
con = gzfile(txt_fpath, open = "rt")
vec = tryCatch(readLines(con, n = 10), finally = close(con))

### print one record per line
for(txt in vec){cat(txt, "\n")}

chr1	115631	115831	peak1_Reversed:	2.0511007569226227	- 
chr1	138940	139140	peak2_Reversed:	1.5391817792648435	- 
chr1	138940	139140	peak2	-0.06555714360564233	+ 
chr1	203225	203425	peak3	1.2305781402512883	+ 
chr1	203225	203425	peak3_Reversed:	0.8856681907655505	- 
chr1	267911	268111	peak4_Reversed:	-1.6049447076261982	- 
chr1	267911	268111	peak4	2.7227675006580068	+ 
chr1	586093	586293	peak5	0.30476722321062366	+ 
chr1	586093	586293	peak5_Reversed:	1.5137673619343153	- 
chr1	629847	630047	peak6_Reversed:	-1.4451969415485937	- 
