In [1]:
%%writefile /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/scripts/process_STARR.r
#!/usr/bin/env Rscript

suppressPackageStartupMessages({
    library(Rsamtools)
    library(GenomicAlignments)
    library(GenomicFiles)
    library(optparse)
    library(data.table)
    library(optparse)
    library(rtracklayer)
    library(GenomicRanges)
    library(RColorBrewer)
    library(doParallel)
    library(argparse)
});

# Command-line interface for the BAM -> per-fragment count conversion.
#   -i/--input    BAM file with the reads (mandatory)
#   -p/--cores    worker count handed to registerDoParallel(); parsed as an
#                 integer so a value typed on the command line is not passed
#                 along as a character string
#   --UMI         switch; when set, reads flagged as duplicates are dropped
#   -o/--outfile  path of the bed file to write (mandatory)
parser <- ArgumentParser()

parser$add_argument("-i", "--input", nargs="+", required=TRUE, help="BAM files with reads")
parser$add_argument("-p", "--cores", type="integer", default=4, help="Number of cores to use in multicore processing")
parser$add_argument("--UMI", required=FALSE, action="store_true", default=FALSE, help="Specify if UMI/barcode present for fragments")
# The output path is consumed unconditionally at the end of the script, so
# require it here instead of failing after all reads have been processed.
parser$add_argument("-o", "--outfile", required=TRUE, help="Output filename to create bed file.")

args <- parser$parse_args()


registerDoParallel(cores=args$cores)

# Format a number for log output, inserting "," as the thousands separator.
pn <- function(value) {
    formatted <- prettyNum(value, big.mark = ",")
    formatted
}

# Paste all arguments together and write the result to standard output.
msgout <- function(...) {
    line <- paste(...)
    write(line, file = stdout())
}

# Read filter applied when loading alignments: a mapping-quality filter of 10
# and no secondary alignments. With --UMI set, reads flagged as duplicates are
# excluded as well.
bfilters <- if (args$UMI) {
    ScanBamParam(
        mapqFilter = 10,
        flag = scanBamFlag(isSecondaryAlignment = FALSE, isDuplicate = FALSE)
    )
} else {
    ScanBamParam(
        mapqFilter = 10,
        flag = scanBamFlag(isSecondaryAlignment = FALSE)
    )
}

# Collapse identical ranges and record how often each one occurred.
#
# reads: ranges, possibly containing exact duplicates
# Returns the unique ranges with a `count` metadata column holding the number
# of entries in `reads` that are equal to each unique range.
count_reads <- function(reads) {
    distinct <- unique(reads)
    # type="equal" only matches ranges with identical start and end
    occurrences <- countOverlaps(distinct, reads, type = "equal")
    distinct$count <- occurrences
    distinct
}

# YIELD step for reduceByYield(): read the next chunk of paired alignments
# from X using the global `bfilters` and convert the pairs to GRanges.
yield.bam <- function(X) {
    pairs <- readGAlignmentPairs(X, use.names = FALSE, param = bfilters)
    GRanges(pairs)
}

# MAP step for reduceByYield(): chunks are passed through unchanged.
map.bam <- function(X) {
    X
}

# REDUCE step for reduceByYield(): append chunk y to the accumulated ranges x
# and log the running total.
reduce.bam <- function(x, y) {
    combined <- append(x, y)
    # progress report: number of ranges accumulated so far
    msgout(pn(length(combined)), 'mapped human reads')
    combined
}

# Merge the counts of `y` into the accumulator `x`.
#
# x:    GRanges accumulator; its metadata columns are the count columns
# y:    GRanges carrying a metadata column called `name`
# name: name of the metadata column being merged
#
# Ranges of `y` that are equal to a range of `x` have their `name` value added
# to the matching row of `x`; the remaining ranges of `y` are appended with 0
# in every column of `x` they do not have. When `x` is empty, `y` is appended
# unchanged.
merge_counts = function( x, y, name ) {
    if (length(x) > 0) {
        hits = findOverlaps(x, y, type="equal")
        # Only touch x/y when something matched: with zero hits, y[-integer(0)]
        # would select nothing and silently discard every range of y.
        if (length(hits) > 0) {
            x_idx = queryHits(hits)
            y_idx = subjectHits(hits)
            mcols(x)[x_idx, name] = mcols(x)[x_idx, name] + mcols(y)[y_idx, name]
            y = y[-y_idx]
        }
        # give y every count column x has, filled with 0
        for (cn in colnames(mcols(x))) {
            if (!cn %in% colnames(mcols(y))) {
                mcols(y)[, cn] = 0
            }
        }
        # drop=FALSE keeps a single selected column as a table (names intact)
        mcols(y) = mcols(y)[, colnames(mcols(x)), drop=FALSE]
    }
    return(append(x, y))
}


ctFile = args$input
# Fail before the expensive BAM pass rather than at write time: this script
# opens one BAM and writes one output file.
if (length(ctFile) != 1) {
    stop("Expected exactly one BAM file for --input, got ", length(ctFile), call.=FALSE)
}
if (is.null(args$outfile)) {
    stop("--outfile is required", call.=FALSE)
}
msgout( "Processing ", ctFile )
infile = BamFile(ctFile, yieldSize=1 * 10^6, asMates=TRUE)
aligned = reduceByYield(infile, yield.bam, map.bam, reduce.bam, parallel=FALSE)

#msgout(pn(length(aligned)), 'mapped reads');

# compute coverage from identical reads => 'count' column
seqlib = count_reads(aligned)
print(seqlib)

# convert the GRanges object to dataframe and add columns corresponding to name, score and barcode information
# as per the common file format needed

seqlib = as.data.frame(seqlib)
cols_to_add = data.frame(name=paste0(seqlib$seqnames,"_", seqlib$start,"_",seqlib$end),
                         score = pmin(seqlib$count,1000),   # score is capped at 1000
                         barcode = '.')

seqlib_df = cbind(seqlib, cols_to_add)
col_order = c('seqnames','start','end','name','score','strand','count','barcode')
seqlib_df = seqlib_df[, col_order]

# write the dataframe into a bed file
# Row names and the header are suppressed: with write.table's defaults the
# first field of every record would be the row index and the file would start
# with a header line that has one field fewer than the records.
# NOTE(review): `start` is written exactly as as.data.frame() reports it; if
# the consumer expects 0-based BED starts this needs converting -- confirm.

write.table(seqlib_df, file=paste0(args$outfile), quote=FALSE, sep='\t',
            row.names=FALSE, col.names=FALSE)

Overwriting /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/scripts/process_STARR.r


In [2]:
%%bash
source /data/reddylab/software/miniconda3/bin/activate alex_py3
mkdir -p /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/count
# the directory named in --output must exist before the job starts
mkdir -p /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/logs
# One array task per BAM file; the task id indexes into the sorted file list.
sbatch -pnew,all \
    --array=0-6 \
    --output=/data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/logs/process_STARR.%a.out \
    --cpus-per-task 8 \
    --mem 64G \
    <<'EOF'
#!/bin/bash
# Expand the glob directly. The previous `ls -1 /bin/ls -1 <glob>` also listed
# /bin/ls itself, which put a non-BAM entry into FILES and shifted every index.
FILES=(/data/reddylab/Alex/encode4_duke/data/data_distribution/bam/*bam)
INFILE="${FILES[${SLURM_ARRAY_TASK_ID}]}"
INFILE_ROOTNAME="$(basename "${INFILE}")"
OUTFILE="/data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/count/${INFILE_ROOTNAME/.bam/.bed}"
echo "Processing ${INFILE}: saving to ${OUTFILE}"
Rscript /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/scripts/process_STARR.r \
    -i "${INFILE}" \
    --UMI \
    -o "${OUTFILE}"
EOF

Submitted batch job 24998610


In [4]:
%%writefile /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/scripts/import_STARR.r

#!/usr/bin/env Rscript

suppressPackageStartupMessages({
    library(Rsamtools)
    library(GenomicAlignments)
    library(GenomicFiles)
    library(data.table)
    library(optparse)
    library(rtracklayer)
    library(GenomicRanges)
    library(RColorBrewer)
    library(doParallel)
    library(argparse)
});
registerDoParallel(cores=7);

## create parser object
parser <- ArgumentParser()

## add arguments for the ArgumentParser
## All three options are consumed unconditionally further down, so they are
## declared required: a missing one is reported at parse time instead of
## surfacing later as an unrelated error.

parser$add_argument("-i", "--input", nargs="+", required=TRUE, help="Input .RData files")
parser$add_argument("-col", "--col-names", nargs="+", required=TRUE, help="Column names for the files")
#parser$add_argument("-p", "--cores", default=4, help="Number of cores to use in multicore processing")
parser$add_argument("-o", "--outfile", nargs="+", required=TRUE, help="Output file name to create the .txt file")
args <- parser$parse_args()

#registerDoParallel(cores=args$cores);

# Merge the counts of `y` into the accumulator `x`.
#
# x:    GRanges accumulator; its metadata columns are the count columns
# y:    GRanges carrying a metadata column called `name`
# name: name of the metadata column being merged
#
# Ranges of `y` that are equal to a range of `x` have their `name` value added
# to the matching row of `x`; the remaining ranges of `y` are appended with 0
# in every column of `x` they do not have. When `x` is empty, `y` is appended
# unchanged.
merge_counts = function( x, y, name ) {
    if (length(x) > 0) {
        hits = findOverlaps(x, y, type="equal")
        # Only touch x/y when something matched: with zero hits, y[-integer(0)]
        # would select nothing and silently discard every range of y.
        if (length(hits) > 0) {
            x_idx = queryHits(hits)
            y_idx = subjectHits(hits)
            mcols(x)[x_idx, name] = mcols(x)[x_idx, name] + mcols(y)[y_idx, name]
            y = y[-y_idx]
        }
        # give y every count column x has, filled with 0
        for (cn in colnames(mcols(x))) {
            if (!cn %in% colnames(mcols(y))) {
                mcols(y)[, cn] = 0
            }
        }
        # drop=FALSE keeps a single selected column as a table (names intact)
        mcols(y) = mcols(y)[, colnames(mcols(x)), drop=FALSE]
    }
    return(append(x, y))
}

# Accumulator: one row per distinct range, one count column per sample.
starr_count = GRanges()

Samples = args$input
# use the full argument name rather than relying on `$` partial matching
colName = args$col_names

if (length(Samples) != length(colName)) {
    stop("Got ", length(Samples), " input files but ", length(colName),
         " column names; they must pair up one-to-one", call.=FALSE)
}

for (i in seq_along(Samples)) {
    bcn = colName[i]
    message(bcn)
    # Load into a private environment so a file that lacks `seqlib` cannot
    # silently reuse the object left behind by the previous iteration.
    sample_env = new.env()
    load(Samples[i], envir=sample_env) ## path to each file
    if (!exists("seqlib", envir=sample_env, inherits=FALSE)) {
        stop("No object named 'seqlib' in ", Samples[i], call.=FALSE)
    }
    x = get("seqlib", envir=sample_env, inherits=FALSE)

    # rename the sample's metadata column to the sample name
    colnames(mcols(x)) = c(bcn)
    # add a zero-filled column for this sample to the accumulator before merging
    mcols(starr_count)[,bcn] = 0
    starr_count = merge_counts(starr_count, x, bcn)
    print(starr_count)
}

write.table(starr_count, file=paste0(args$outfile), quote=FALSE, sep='\t')

Overwriting /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/scripts/import_STARR.r


In [5]:
%%bash
source /data/reddylab/software/miniconda3/bin/activate alex_py3
mkdir -p /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/logs
mkdir -p /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/tmp
# Single (non-array) job: all Rdata files are merged by one Rscript call.
sbatch -pnew,all \
    --output=/data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/logs/import_STARR.junke.out \
    --cpus-per-task 8 \
    --mem 50G \
    <<'EOF'
#!/bin/bash
FILES=(/data/reddylab/Alex/encode4_duke/ipynbs/jamborees/20210222_MPRA_STARR_Jamboree/tmp/*Rdata)
# No --array is requested above, so SLURM_ARRAY_TASK_ID is unset here; fall
# back to the first file when deriving the output name.
INFILE="${FILES[${SLURM_ARRAY_TASK_ID:-0}]}"
INFILE_ROOTNAME="$(basename "${INFILE}" | cut -d"." -f1,2)"
OUTFILE="/data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/tmp/${INFILE_ROOTNAME}.counts.txt"
echo "Merging ${#FILES[@]} files: saving to ${OUTFILE}"
Rscript /data/reddylab/Revathy/dev/Jamborees/MPRA_STARRseq/scripts/import_STARR.r \
    -i "${FILES[@]}" \
    -col "inputRep1" "inputRep2" "inputRep3" "outputRep1" "outputRep2" "outputRep3" \
    -o "${OUTFILE}"
EOF

Submitted batch job 24712281


In [None]:

# Build the BAM flag filter used for read counting.
#
# countAll: when TRUE, mapped non-secondary reads are kept regardless of mate
#           (both reads in PE mode, all reads in SE mode); otherwise the
#           filter additionally requires the first-mate flag (R1 in PE mode).
getBamFlags <- function(countAll) {
    if (!isTRUE(countAll)) {
        # count only R1 (in PE mode)
        return(scanBamFlag(
            isUnmappedQuery = FALSE,
            isFirstMateRead = TRUE,
            isSecondaryAlignment = FALSE
        ))
    }
    scanBamFlag(
        isUnmappedQuery = FALSE,
        isSecondaryAlignment = FALSE
    )
}

# Count reads overlapping a set of regions, per BAM file.
#
# regions:  ranges, wrapped into a single-element GRangesList
# bams:     BAM file path(s), coerced to character
# countall: forwarded to getBamFlags() as `countAll`
# Returns the transposed assay matrix of the summarizeOverlaps() result.
numReadsInBed <- function(regions, bams = NA, countall = FALSE) {
    region_list <- GenomicRanges::GRangesList(regions)
    bam_files <- Rsamtools::BamFileList(as.character(bams))
    scan_param <- Rsamtools::ScanBamParam(flag = getBamFlags(countAll = countall))
    overlaps <- GenomicAlignments::summarizeOverlaps(
        region_list,
        reads = bam_files,
        mode = "Union",
        inter.feature = FALSE,
        param = scan_param
    )
    t(SummarizedExperiment::assay(overlaps))
}


#### upload data into synapse


In [2]:
synapse-uploader -h


NameError: name 'synapse' is not defined