In [1]:
import requests
import json
from tqdm import tqdm
import pysam
from time import time
from multiprocessing import Pool,Manager
import os


In [2]:
# Create three parallel lists describing the regions of interest: chromosomes, starts, ends

# Load the STR locus catalog and collect every unique region string
# ("chrom:start-end") from the reference and off-target entries.
targetJson = 'STR_loci_hg19_with_offtargets_20230525.json'
with open(targetJson) as j:
    aa = json.load(j)

positions = set()
for roi in aa:
    RR = roi['ReferenceRegion']
    # 'ReferenceRegion' holds either one region string or a list of them.
    if isinstance(RR, str):
        positions.add(RR)
    else:
        positions.update(RR)
    # 'OfftargetRegions' is optional; a missing key contributes nothing.
    positions.update(roi.get('OfftargetRegions', ()))

# Parse each region exactly once, in sorted order, so that the three lists
# stay index-aligned and come out in the same order on every run.
chroms, starts, ends = [], [], []
for region in sorted(positions):
    # A malformed region fails here with an unpacking/ValueError instead of
    # silently yielding a wrong coordinate.
    contig, span = region.split(':', 1)
    first, last = span.split('-', 1)
    chroms.append(contig.replace('chr', ''))
    starts.append(int(first))
    ends.append(int(last))

In [3]:
# GetChunk collects all alignments from one target region; alignments are returned as strings to allow multiprocessing


def GetChunk(x):
    """Fetch the alignments of one padded target region from a BAM file.

    Parameters
    ----------
    x : sequence
        Job description ``(bam, bai, contig, start, end, padding)``. It is
        packed into a single argument so the function can be mapped over a
        list of jobs by a process pool.

    Returns
    -------
    set of str
        The alignments returned by ``fetch`` for the window
        ``[start - padding, end + padding]``, each serialised with
        ``AlignedSegment.to_string()``. Duplicates collapse because the
        result is a set.
    """
    # Only the first six items are used; longer job tuples are tolerated.
    bam, bai, contig, start, end, padding = x[:6]
    # Clamp the left edge at 0: for regions closer to the contig start than
    # `padding` the subtraction goes negative, which pysam rejects.
    window_start = max(0, start - padding)
    window_end = end + padding
    with pysam.AlignmentFile(bam, filepath_index=bai) as f:
        # Strings rather than AlignedSegment objects are returned so the
        # result can be sent back from a worker process.
        return {
            read.to_string()
            for read in f.fetch(contig, window_start, window_end, until_eof=True)
        }


# GetBam creates one job per ROI and runs the jobs in parallel with Pool.imap; tqdm is only used for the progress bar.
# The proc parameter controls how many parallel requests are made; padding extends the original region of interest on both sides.

def GetBam(bam,bai,chrom,start,stop,outname,padding=50,proc=12):
    """Extract the reads of many regions from one BAM into a sorted, indexed BAM.

    Parameters
    ----------
    bam : str
        Path or URL of the source BAM, passed to ``pysam.AlignmentFile``.
    bai : str
        Path of the matching index, passed as ``filepath_index``.
    chrom, start, stop : iterable
        Parallel sequences; element k of each describes region k.
    outname : str
        Path of the sorted BAM to create. Its index is created by
        ``pysam.index``.
    padding : int, optional
        Number of bases added on each side of every region.
    proc : int, optional
        Number of worker processes in the pool.

    Notes
    -----
    Reads are first written unsorted to a temporary file next to ``outname``;
    that file is removed once sorting has finished or failed.
    """
    jobs = [(bam, bai, c, s, e, padding) for c, s, e in zip(chrom, start, stop)]
    with Pool(processes=proc) as p:
        chunks = list(tqdm(p.imap(GetChunk, jobs), total=len(jobs)))
    # Regions may overlap, so the same read can appear in several chunks.
    unique_reads = set().union(*chunks)

    # Derive the temp name from the extension only. A plain str.replace would
    # also rewrite '.bam' inside directory names, and would return `outname`
    # itself (sorting a file onto itself) when no '.bam' is present.
    root, ext = os.path.splitext(outname)
    outname_temp = root + '.unsorted' + (ext or '.bam')
    try:
        with pysam.AlignmentFile(bam, filepath_index=bai) as f:
            with pysam.AlignmentFile(outname_temp, 'wb', template=f) as bamout:
                for read in unique_reads:
                    bamout.write(pysam.AlignedSegment.fromstring(read, f.header))
        pysam.sort("-o", outname, outname_temp)
    finally:
        # Do not leave the unsorted intermediate behind.
        if os.path.exists(outname_temp):
            os.remove(outname_temp)
    pysam.index(outname)


In [4]:
# Directory that receives the downloaded .bai indexes and the extracted BAM files.
outfolder = '../01_data_out'

In [5]:
# Download links of the AWS-hosted BAM files and of their BAI indexes, e.g. as
# returned by the /get-file-download-links endpoint. The two lists are
# consumed pairwise, so entry k of each must belong to the same sample.
aws_bamLlinks = []
aws_bai_links = []

In [6]:


    
# Fail early instead of letting zip() silently drop unpaired links.
if len(aws_bamLlinks) != len(aws_bai_links):
    raise ValueError('aws_bamLlinks and aws_bai_links must have the same length')

# The loop writes into outfolder; make sure it exists on a fresh checkout.
os.makedirs(outfolder, exist_ok=True)

for i, (bam_aws, bai_aws) in enumerate(zip(aws_bamLlinks, aws_bai_links)):
    t_start = time()  # wall-clock timer for this sample

    # NOTE(review): the original cell used `n` and `i` in the file names
    # without defining them (NameError on a fresh kernel). Here `i` is the
    # position in the link lists and `n` is the BAM file name taken from the
    # URL with any query string and the '.bam' suffix removed - confirm that
    # this matches the intended naming scheme.
    n = os.path.basename(bam_aws.split('?', 1)[0])
    if n.endswith('.bam'):
        n = n[:-len('.bam')]
    bai_out = f'{outfolder}/{n}_{i}_varvis.bai'
    bam_out = f'{outfolder}/{n}_{i}_varvis.bam'

    # Download the index once; later runs reuse the local copy.
    if not os.path.exists(bai_out):
        bai_tmp = bai_out + '.part'
        with requests.get(bai_aws, stream=True, timeout=60) as resp:
            # Without this check an HTTP error body would be saved as the
            # index and then reused on every later run.
            resp.raise_for_status()
            with open(bai_tmp, 'wb') as bai_file:
                for block in resp.iter_content(chunk_size=1 << 20):
                    bai_file.write(block)
        # Rename only after a complete download so that an interrupted
        # transfer never leaves a truncated file under the final name.
        os.replace(bai_tmp, bai_out)

    GetBam(bam_aws, bai_out, chroms, starts, ends, bam_out, padding=500, proc=100)
    print(time() - t_start)