# Integration Site Identification 
- align using BSBolt v0.1.2
- minimum alignment score = L,160,0 or 160
- 150 bp PE sequencing data

### Notebook Setup

In [4]:
import multiprocessing as mp

# import third party libraries
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [5]:
# import local libraries
from IntegrationSiteSearch.CallIntegrationSites import CallConsensusIntegrationSites
from IntegrationSiteSearch.DetectIntegration import ProcessVectorSpanningReads
from IntegrationSiteSearch.IntegrationUtils import get_spanning_reads, stream_mapped_reads

In [6]:
# Prefix for every input path used below. 'samples.txt' and 'alignments/' are
# appended to it directly, so a non-empty value must end with a path separator.
sequencing_directory = ''

In [7]:
sequencing_samples = []

# One sample name per line; strip the newline and any surrounding whitespace.
with open(f'{sequencing_directory}samples.txt', 'r') as samples:
    sequencing_samples.extend(line.strip() for line in samples)

In [8]:
# Directory holding the per-sample alignment files (read below as '<sample>.dup.bam').
alignment_directory = f'{sequencing_directory}alignments/'

## Get Plasmid Integration Spanning Reads

In [9]:
# Full plasmid names of the two vectors.
# NOTE(review): `nyes` is reassigned to 'nyes' in the following cell and `mart`
# is not referenced anywhere else in the visible notebook - confirm which
# reference names the alignment index actually uses.
nyes = 'pMSGV1_1G4_A_LY_RetroNYESO1'
mart = 'pMSGV1_MART1TCR'

In [10]:
# Overrides the full plasmid name assigned in the previous cell; this short
# value is what gets compared against read reference names in the cells below.
nyes = 'nyes'

In [12]:
def propogate_error(error):
    """Raise the exception object passed in.

    Used as the ``error_callback`` of ``pool.apply_async`` in this notebook.

    NOTE(review): pool callbacks run in the pool's result-handling thread, so
    an exception raised here may not surface in the notebook and may stall
    result handling - confirm this is the intended failure mode.
    """
    raise error

In [13]:
def get_spanning_reads(file_path: str = None, plasmid_names: set = None):
    """Collect read groups that align to both a plasmid and a 'chr' reference.

    Reads are streamed with ``stream_mapped_reads`` (called with
    ``excluded_flags=[2, 4, 10, 1024]``) and grouped by query name. Being
    defined in the notebook, this function replaces the ``get_spanning_reads``
    imported from ``IntegrationSiteSearch.IntegrationUtils``.

    Args:
        file_path: alignment file handed to ``stream_mapped_reads``.
        plasmid_names: reference names that count as plasmid/vector hits.

    Returns:
        dict mapping query name to the list of all streamed records for that
        query, limited to queries with at least one record on a reference in
        ``plasmid_names`` and at least one record whose reference name starts
        with 'chr'.
    """
    # query name -> [records, seen_on_plasmid]
    grouped = {}
    for sam_read in stream_mapped_reads(file_path, excluded_flags=[2, 4, 10, 1024]):
        # records carry exactly eight fields; only the name and reference are used
        qname, _flag, rname, _rnext, _pos, _cigar, _score, _mapping_ref = sam_read
        on_plasmid = rname in plasmid_names
        group = grouped.setdefault(qname, [[], False])
        group[0].append(sam_read)
        if on_plasmid:
            group[1] = True

    # keep plasmid-hitting groups that also contain a genomic ('chr*') record
    return {
        qname: records
        for qname, (records, seen_on_plasmid) in grouped.items()
        if seen_on_plasmid and any(read[2][0:3] == 'chr' for read in records)
    }

In [14]:
def return_plasmid_reads(file_path, vector_set, return_queue, sample_name):
    """Find the spanning reads of one sample and put them on ``return_queue``.

    The queued item is the tuple ``(sample_name, plasmid_reads)``; nothing is
    returned to the caller.
    """
    return_queue.put((sample_name, get_spanning_reads(file_path, vector_set)))

In [15]:
# Manager-backed queue that the worker processes use to hand results back.
manager = mp.Manager()
pool = mp.Pool(processes=8)
sample_plasmid_reads = manager.Queue()

# Submit one spanning-read search per sample.
for sample in sequencing_samples:
    search_kwargs = {
        'file_path': f'{alignment_directory}{sample}.dup.bam',
        'vector_set': {nyes},
        'return_queue': sample_plasmid_reads,
        'sample_name': sample,
    }
    pool.apply_async(return_plasmid_reads,
                     kwds=search_kwargs,
                     error_callback=propogate_error)

# No further tasks will be submitted.
pool.close()

In [16]:
import time

pbar = tqdm(total=len(sequencing_samples), desc='Processing Samples')

completed_samples = 0

# pool._cache is a private multiprocessing attribute; it is used here as the
# set of tasks that have not finished yet.
while len(pool._cache):
    update_number = len(sequencing_samples) - len(pool._cache) - completed_samples
    pbar.update(update_number)
    completed_samples += update_number
    # poll at a modest rate instead of spinning a core while workers run
    time.sleep(0.5)

# Tasks that finished between the last poll and the loop exit were never
# counted inside the loop; bring the bar to its total and release it.
pbar.update(len(sequencing_samples) - completed_samples)
completed_samples = len(sequencing_samples)
pbar.close()

# The pool was closed after submission; wait for the workers to exit.
pool.join()


HBox(children=(IntProgress(value=0, description='Processing Samples', max=48, style=ProgressStyle(description_…

In [17]:
# Drain the result queue into a plain dict keyed by sample name.
integration_reads = {}

while True:
    if sample_plasmid_reads.empty():
        break
    sample, reads = sample_plasmid_reads.get()
    integration_reads[sample] = reads

In [18]:
# Rebind the name from the result queue to a plain dict copy of the collected
# per-sample reads; later cells index it by sample name.
sample_plasmid_reads = dict(integration_reads)

In [19]:
# clean mapping reads to only include reads that map to the appropriate vector
cleaned_vector_reads = {}

# Every sample uses the same vector. The earlier per-sample check on
# sample[0] == 'F' assigned this same value on both paths, so it is set once.
vector = nyes

for sample in tqdm(sequencing_samples):
    plasmid_reads = sample_plasmid_reads[sample]
    cleaned_plasmid_reads = {}
    for read_name, read_group in plasmid_reads.items():
        cleaned_group = []
        vector_mapping = False
        for read in read_group:
            # read[2] is the reference name: keep 'chr*' alignments and
            # alignments to the vector; anything else is dropped
            if read[2][0:3] == 'chr':
                cleaned_group.append(read)
            elif read[2] == vector:
                vector_mapping = True
                cleaned_group.append(read)
        # a group without any vector alignment cannot span an integration
        if vector_mapping:
            cleaned_plasmid_reads[read_name] = cleaned_group
    cleaned_vector_reads[sample] = cleaned_plasmid_reads

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

### Call Integration Sites

In [20]:
int_processor = ProcessVectorSpanningReads(multibase_threshold=0.05, multiread_threshold=180)

integration_sites = {}

# Every sample uses the same vector. The earlier check on sample[0] == 'F'
# assigned this same value on both paths and ran once per read group, so the
# vector is chosen once, outside the loops.
vector = nyes

for sample in tqdm(sequencing_samples):
    sample_integration_sites = []
    # read labels are not needed here, only the grouped reads
    for read_group in cleaned_vector_reads[sample].values():
        called_int = int_processor.get_integration_sites(read_group, vector=vector)
        # falsy results are skipped
        if called_int:
            sample_integration_sites.append(called_int)
    integration_sites[sample] = sample_integration_sites

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

In [21]:
# (chromosome, start, end) intervals; integration sites that fall inside any of
# them are discarded by the filtering cell below.
# NOTE(review): presumably genomic loci that share sequence with the vector
# insert - confirm the coordinates and genome build.
homology_regions = [('chr14', 22052000, 22550975), ('chr7', 142328000, 142802725), ('chr7_KI270803v1_alt', 290000, 824901)]

In [29]:
def assess_homology(site, homology_regions):
    """Return True when ``site`` lies inside any region of ``homology_regions``.

    ``site[2]`` is matched against each region's first element and ``site[3]``
    must fall within the region's inclusive ``(region[1], region[2])`` bounds.
    An empty region list always yields False.
    """
    return any(
        site[2] == region[0] and region[1] <= site[3] <= region[2]
        for region in homology_regions
    )

In [30]:
# Keep only sites whose last field lies outside 800-3800 and that do not fall
# in a homology region.
# NOTE(review): the last field is presumably the position on the vector - confirm.
cleaned_integration_sites = {}

for sample, sites in integration_sites.items():
    cleaned_integration_sites[sample] = [
        site for site in sites
        if (site[-1] < 800 or site[-1] > 3800)
        and not assess_homology(site, homology_regions)
    ]

In [31]:
# Feature name -> (start, end) coordinates, presumably on the full NY-ESO-1
# plasmid sequence (TODO confirm the coordinate system).
# NOTE(review): AmpR's start (7088) is larger than its end (330), presumably
# because the feature wraps around the origin of a circular plasmid - confirm.
nyes_annotations = {'SD': (2107, 2120),
 '3_LTR': (5261, 5775),
 '5_LTR': (1531, 2044),
 'SA': (3056, 3080),
 '1G4_alpha': (3343, 4164),
 '1G4_beta': (4246, 5181),
 'P2A': (4189, 4245),
 'Furin': (4165, 4176),
 'SGSG': (4177, 4188),
 'M13_fwd': (1106, 1123),
 'ColE1_origin': (6308, 6990),
 'LacZ_alpha': (967, 1035),
 'LacO': (5946, 5968),
 'AmpR': (7088, 330),
 'MSCV_Prom_1': (5260, 5641),
 'MSCV_Prom_2': (1529, 1910),
 'Retro_NYESO1_F': (4114, 4134),
 'Retro_NYESO1_R': (4210, 4224),
 '2A_Junction_Probe': (4162, 4183),
 'Primer_Cloning_1': (4024, 4044),
 'Primer_Cloning_2': (4295, 4315)}

In [32]:
cut_nyes_annotations = {}

for annotation, span in nyes_annotations.items():
    # Shift each coordinate by 1250; coordinates outside 1250-6000 have no
    # shifted equivalent and are marked None.
    adjusted_span = [None if x < 1250 or x > 6000 else x - 1250 for x in span]
    print(annotation, adjusted_span)
    # Test for None explicitly: a shifted coordinate of 0 (original position
    # 1250) is valid but falsy, so a bare all() would drop the annotation.
    if all(x is not None for x in adjusted_span):
        cut_nyes_annotations[annotation] = tuple(adjusted_span)

SD [857, 870]
3_LTR [4011, 4525]
5_LTR [281, 794]
SA [1806, 1830]
1G4_alpha [2093, 2914]
1G4_beta [2996, 3931]
P2A [2939, 2995]
Furin [2915, 2926]
SGSG [2927, 2938]
M13_fwd [None, None]
ColE1_origin [None, None]
LacZ_alpha [None, None]
LacO [4696, 4718]
AmpR [None, None]
MSCV_Prom_1 [4010, 4391]
MSCV_Prom_2 [279, 660]
Retro_NYESO1_F [2864, 2884]
Retro_NYESO1_R [2960, 2974]
2A_Junction_Probe [2912, 2933]
Primer_Cloning_1 [2774, 2794]
Primer_Cloning_2 [3045, 3065]


In [33]:
# Display the annotations that kept both coordinates after the shift.
cut_nyes_annotations

{'SD': (857, 870),
 '3_LTR': (4011, 4525),
 '5_LTR': (281, 794),
 'SA': (1806, 1830),
 '1G4_alpha': (2093, 2914),
 '1G4_beta': (2996, 3931),
 'P2A': (2939, 2995),
 'Furin': (2915, 2926),
 'SGSG': (2927, 2938),
 'LacO': (4696, 4718),
 'MSCV_Prom_1': (4010, 4391),
 'MSCV_Prom_2': (279, 660),
 'Retro_NYESO1_F': (2864, 2884),
 'Retro_NYESO1_R': (2960, 2974),
 '2A_Junction_Probe': (2912, 2933),
 'Primer_Cloning_1': (2774, 2794),
 'Primer_Cloning_2': (3045, 3065)}

In [34]:
# Call consensus integration peaks per sample from the filtered sites.
caller = CallConsensusIntegrationSites(region_size=1000, minimum_observations=0)

sample_integration_peaks = {}

for sample in tqdm(sequencing_samples):
    sample_sites = cleaned_integration_sites[sample]
    sample_integration_peaks[sample] = caller.call_integration_sites(sample_sites)

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

In [35]:
# Tabular view of the called peaks, built from the per-sample peak dicts
# (one column per sample).
int_df = pd.DataFrame(sample_integration_peaks)

In [36]:
# Keep only peaks whose last field lies outside 800-3800.
cleaned_peaks = {}

for sample, peaks in sample_integration_peaks.items():
    cleaned_peaks[sample] = {
        site: peak_info
        for site, peak_info in peaks.items()
        if peak_info[-1] < 800 or peak_info[-1] > 3800
    }

In [37]:
# Write every retained peak as a BED-style line: chrom, start, end, sample.
with open('integration_peaks.bed', 'w') as out:
    for sample, values in cleaned_peaks.items():
        for peak in values:
            # peak labels have the form '<chrom>:<start>-<end>'
            chrom, pos = peak.split(':')
            start, end = map(int, pos.split('-'))
            out.write(f'{chrom}\t{start}\t{end}\t{sample}\n')

In [38]:
# Build one tab-separated row per retained peak, labelled with the first
# vector annotation whose span contains the peak's vector position.
formatted_sites = []

for sample, sites in cleaned_peaks.items():
    for site, site_info in sites.items():
        chrom, pos = site.split(':')
        start, end = pos.split('-')
        vector_pos = site_info[2]
        site_annotation = next(
            (annotation
             for annotation, span in cut_nyes_annotations.items()
             if span[0] <= vector_pos <= span[1]),
            'NA',
        )
        formatted_sites.append(f'{sample}\t{chrom}\t{start}\t{end}\t{site_info[0]}\t{site_info[1]}\t{vector_pos}\t{site_annotation}\n')

In [39]:
# Header row followed by the pre-formatted site rows (each already ends in '\n').
with open('integration_sites.tsv', 'w') as out:
    out.write('Sample\tChrom\tStart\tEnd\tAveragePos\tSupportingReads\tVectorPos\tVectorAnnotation\n')
    out.writelines(formatted_sites)

In [40]:
# Copy the peaks of day-0 samples: the sample name is the last column and its
# third '_'-separated token is taken as the day.
with open('day_0_integration_sites.bed', 'w') as out, open('integration_peaks.bed', 'r') as sites:
    for site in sites:
        sample_name = site.strip().split('\t')[-1]
        if sample_name.split('_')[2] == '0':
            out.write(site)