In [1]:
import requests
import json
from tqdm import tqdm
import pysam
from time import time
from multiprocessing import Pool,Manager
import os


In [15]:
# Collect every distinct region of interest from the STR catalogue and split the
# "chrom:start-end" strings into three parallel lists: chromosomes, starts and ends.

targetJson = 'STR_loci_hg19_with_offtargets_20230525.json'
with open(targetJson) as catalogue_file:
    aa = json.loads(catalogue_file.read())

positions = set()
for locus in aa:
    reference = locus['ReferenceRegion']
    # 'ReferenceRegion' is either one region string or a collection of region strings.
    if isinstance(reference, str):
        positions.add(reference)
    else:
        positions.update(reference)
    # 'OfftargetRegions' is optional; when present, every entry is added as well.
    if 'OfftargetRegions' in locus:
        positions.update(locus['OfftargetRegions'])

# Parse each region once; the "chr" prefix is stripped from the chromosome name.
chroms = []
starts = []
ends = []
for region in positions:
    fields = region.split(':')
    span = fields[1].split('-')
    chroms.append(fields[0].replace('chr', ''))
    starts.append(int(span[0]))
    ends.append(int(span[1]))

sex = ['X','Y']

# Regions on X/Y keep a string chromosome; every other chromosome name is converted to int,
# so any contig other than X/Y must be numeric.
positions_ns = []
positions_s = []
for contig, begin, finish in zip(chroms, starts, ends):
    if contig in sex:
        positions_s.append((contig, begin, finish))
    else:
        positions_ns.append((int(contig), begin, finish))



In [30]:
# Put the regions into contig order: numeric chromosomes by (chromosome number, start),
# followed by the X regions and then the Y regions, each ordered by start.
positions_ns.sort(key=lambda region: (region[0], region[1]))
positions_s.sort(key=lambda region: region[1])

# Rebuild `positions` as one list of (chrom, start, end) tuples with string chromosome names.
autosomal = [(str(chrom), begin, finish) for chrom, begin, finish in positions_ns]
on_x = [region for region in positions_s if region[0] == 'X']
on_y = [region for region in positions_s if region[0] == 'Y']
positions = autosomal + on_x + on_y

In [56]:
# Merge target regions whose spans overlap so that each stretch of a contig is requested once.
# `readlength` and `padding` extend a region's end before the overlap test. Both are 0 here
# (earlier values were 150 and 500); GetBam is called further down with its own padding.
readlength = 0
padding = 0


def merge_overlapping_regions(regions, padding=0, readlength=0):
    """Collapse overlapping (contig, start, end) regions into single request windows.

    Parameters
    ----------
    regions : iterable of (contig, start, end)
        Regions to merge. They may be given in any order.
    padding, readlength : int
        Both are added to a region's end before it is compared with the next start.

    Returns
    -------
    list of (contig, start, end)
        One tuple per merged window. Contigs appear in order of first occurrence and
        windows within a contig are ordered by start.
    """
    spans_by_contig = {}
    for region in regions:
        spans_by_contig.setdefault(region[0], []).append((region[1], region[2]))

    merged = []
    for contig, spans in spans_by_contig.items():
        # The sweep below needs ascending starts; the sort is stable, so input that is
        # already ordered keeps its order.
        spans.sort(key=lambda span: span[0])
        window_start = None
        window_end = None
        for start, end in spans:
            padded_end = end + padding + readlength
            if window_start is not None and start < window_end:
                # Overlap: extend the window, but never shrink it. A region nested inside
                # the current window must not pull the window end backwards.
                window_end = max(window_end, padded_end)
                continue
            if window_start is not None:
                merged.append((contig, window_start, window_end))
            window_start, window_end = start, padded_end
        if window_start is not None:
            merged.append((contig, window_start, window_end))
    return merged


contigs = list(dict.fromkeys(region[0] for region in positions))
positions_request = merge_overlapping_regions(positions, padding=padding, readlength=readlength)
                
            
        

In [170]:

# Ad-hoc check: fetch the alignments of two fixed regions with a single pysam.view call.
# NOTE(review): bam_aws and bai_aws are only assigned by the download loop in the last cell,
# so on a fresh kernel this cell raises NameError when run in notebook order. Move it below
# that loop or remove it.
xx =['1:0-100000','2:0-500000']
x = pysam.view(bam_aws,'-X' ,bai_aws ,*xx)

In [158]:
# GetChunk collects all alignments from one target region. The alignments are returned as
# text rather than as pysam objects so the result can be handed back by a multiprocessing worker.


def GetChunk(x):
    """Fetch the alignments of one padded region from an indexed BAM.

    Parameters
    ----------
    x : tuple
        ``(bam, bai, chrom, start, end, padding)``. ``bam`` and ``bai`` are the alignment
        file and its index as passed on to ``pysam.view``; ``padding`` is subtracted from
        ``start`` and added to ``end`` before the region is queried.

    Returns
    -------
    The output of ``pysam.view`` for the region. GetBam treats it as newline-separated
    SAM records.
    """
    bam, bai, c, s, e, padding = x[:6]

    # A locus closer than `padding` to the contig start must not produce a negative coordinate.
    s = max(s - padding, 0)
    e = e + padding

    # Use the files named in the job tuple, not notebook globals. '-X' is the samtools view
    # option for passing the index file as a separate argument after the BAM.
    bam_chunk = pysam.view(bam, '-X', bai, f'{c}:{s}-{e}')
    return bam_chunk


# GetBam creates one job per region of interest and runs the jobs in parallel with Pool.imap;
# tqdm is only used for the progress bar.
# `proc` controls how many requests run in parallel, `padding` widens every region of interest.

def GetBam(bam,bai,chrom,start,stop,outname,padding=50,proc=12):
    """Extract the given regions from an indexed BAM into a sorted, indexed BAM.

    Parameters
    ----------
    bam, bai : str
        Alignment file and its index, as accepted by ``pysam.AlignmentFile`` and GetChunk.
    chrom, start, stop : sequences
        Parallel sequences describing the regions; element i of each forms one region.
    outname : str
        Path of the sorted output BAM. An index is created for it.
    padding : int
        Passed to GetChunk, which applies it to both sides of every region.
    proc : int
        Number of worker processes.

    Returns
    -------
    None
    """
    mp_split = [(bam,bai,c,s,e,padding) for c,s,e in zip(chrom,start,stop)]
    with Pool(processes=proc) as p:
        r = list(tqdm(p.imap(GetChunk, mp_split), total=len(mp_split)))

    # Build the temporary name from the path stem. str.replace would return `outname` itself
    # when it has no '.bam' suffix and would also rewrite '.bam' inside directory names.
    stem, _ = os.path.splitext(outname)
    outname_temp = f'{stem}unsorted.bam'

    # Padded windows of neighbouring regions can overlap, in which case both jobs return the
    # same record. Identical SAM lines are therefore written only once.
    seen = set()
    try:
        with pysam.AlignmentFile(bam,filepath_index=bai) as f:
            with pysam.AlignmentFile(outname_temp,'wb',template=f) as bamout:
                for chunk in r:
                    for read in chunk.split('\n'):
                        if not read or read in seen:
                            continue
                        seen.add(read)
                        bamout.write(pysam.AlignedSegment.fromstring(read,f.header))
        pysam.sort("-o", outname, outname_temp)
    finally:
        # The unsorted intermediate is not needed once sorting has finished or failed.
        if os.path.exists(outname_temp):
            os.remove(outname_temp)
    pysam.index(outname)


In [159]:
# Folder that receives the downloaded index and the extracted BAM.
outfolder = 'data_out/'

In [160]:
# Lists of BAM and BAI links, e.g. as created by the /get-file-download-links endpoint.
# The two lists are paired by position (they are zipped in the download loop).
# NOTE(review): the entries below point at localhost:8000 rather than AWS, presumably a
# local test server; confirm before running against real data.
aws_bamLlinks = ['http://localhost:8000/HG00160.mapped.ILLUMINA.bwa.GBR.exome.20120522.bam']
aws_bai_links = ['http://localhost:8000/HG00160.mapped.ILLUMINA.bwa.GBR.exome.20120522.bam.bai']

In [161]:


# Extract the merged request windows from every BAM/BAI link pair and print the elapsed
# wall time per pair.
chroms = [region[0] for region in positions_request]
starts = [region[1] for region in positions_request]
ends = [region[2] for region in positions_request]

# The output folder has to exist before the index and the BAM are written into it.
os.makedirs(outfolder, exist_ok=True)

for bam_aws,bai_aws in zip(aws_bamLlinks,aws_bai_links):
    x = time()  # start of the timing
    # NOTE(review): these names do not depend on the sample. With more than one link pair the
    # cached index of the first sample is reused and http.bam is overwritten.
    bai_out = os.path.join(outfolder, 'http.bai')
    bam_out = os.path.join(outfolder, 'http.bam')

    if not os.path.exists(bai_out):
        r = requests.get(bai_aws)
        # Fail before anything is written, so an HTTP error page is never cached as the index.
        r.raise_for_status()
        with open(bai_out, 'wb') as bai_file:
            bai_file.write(r.content)

    GetBam(bam_aws,bai_out,chroms,starts,ends,bam_out,padding=500, proc=10)
    print(time()-x)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:04<00:00, 62.73it/s]


4.88194727897644
