# Generate MART and NYES Vector Integration Libraries
- randomly insert vector into location in the genome
- simulate PE reads using BSBolt
- align and try to detect vector integration 

## Notebook Setup

In [30]:
import gzip
import io
import os 
import random
import subprocess
import urllib.request
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [38]:
# simulate methylation sequencing data
from BSBolt.Align.AlignmentHelpers import convert_alpha_numeric_cigar, get_mapping_length
from BSBolt.Index.WholeGenomeBuild import WholeGenomeIndexBuild
from BSBolt.Simulate.SimulateMethylatedReads import SimulateMethylatedReads
from BSBolt.Utils.UtilityFunctions import get_external_paths
from IntegrationSiteSearch.DetectIntegration import ProcessVectorSpanningReads
from IntegrationSiteSearch.IntegrationUtils import get_spanning_reads

In [10]:
bt2_path, art_path = get_external_paths()

In [11]:
pwd = os.getcwd() + '/'

In [12]:
ucsc_hg38 = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz'

In [13]:
if not os.path.exists(f'{pwd}SimulationData/hg38.fa.gz'):
    urllib.request.urlretrieve(ucsc_hg38, f'{pwd}SimulationData/hg38.fa.gz')

## Import Sequence Data

In [14]:
# get hg38 reference with lambda phage control attached

hg38 = {}

with gzip.open(f'{pwd}SimulationData/hg38.fa.gz') as genome:
    contig_seq = ''
    chrom = None
    for line in io.BufferedReader(genome):
        processed_line = line.decode('utf-8').strip()
        if '>' == processed_line[0]:
            if chrom:
                hg38[chrom] = contig_seq
            contig_seq = ''
            chrom = processed_line[1:]
        else:
            contig_seq = contig_seq + processed_line
    hg38[chrom] = contig_seq

In [15]:
# import NYES and Mart vector sequences 
mart1 = 'pMSGV1-MART1TCR.txt'
nyes =  'pMSGV1-1G4_A_LY_RetroNYESO1.txt'

In [16]:
vector_seq = {}

for vector in [mart1, nyes]:
    vector_label = vector.replace('-', '_').replace('.txt', '')
    seq = ''
    with open(f'{pwd}SimulationData/{vector}', 'r') as vec_seq:
        for line in vec_seq:
            processed_line = line.strip().replace(' ', '')
            seq = seq + ''.join([base for base in processed_line if not base.isdigit()])
    vector_seq[vector_label] = seq
    

In [17]:
if not os.path.exists(f'{pwd}SimulationData/hg38_sim.fa'):
    out = open(f'{pwd}SimulationData/hg38_sim.fa', 'w')
    for chrom, seq in tqdm(hg38.items()):
        out.write(f'>{chrom}\n')
        out.write(f'{seq}\n')
    for chrom, seq in vector_seq.items():
        out.write(f'>{chrom}\n')
        out.write(f'{seq}\n')
    out.close()

## Simulated vector integration libraries
- select the number of integration events
- select random integration sites
- generate 2 mart and 2 nyes datasets

In [18]:
integration_events = [events + 15 for events in range(10)]

In [19]:
vector_integration_samples = ['pMSGV1_MART1TCR', 'pMSGV1_MART1TCR', 'pMSGV1_MART1TCR', 'pMSGV1_1G4_A_LY_RetroNYESO1', 'pMSGV1_1G4_A_LY_RetroNYESO1', 'pMSGV1_1G4_A_LY_RetroNYESO1']

In [20]:
simulation_integration_parameters = {}

for count, vector in enumerate(vector_integration_samples):
    sim_label = f'{count}_{vector}'
    int_events = random.sample(integration_events, 1)[0]
    integration_locations = []
    integration_sequences = []
    while len(integration_locations) < int_events:
        int_chrom = random.sample(list(hg38.keys()), 1)[0]
        if len(int_chrom) > 5 or not int_chrom[-1].isdigit():
            continue
        # get chromosome sequence
        chrom_seq = hg38[int_chrom]
        # select simulation position 
        int_pos = random.randint(0, len(chrom_seq))
        # retrieve integration position 
        left_seq = chrom_seq[int_pos - 1000: int_pos]
        right_seq = chrom_seq[int_pos: int_pos + 1000]
        integration_seq = left_seq + vector_seq[vector] + right_seq
        if 'N' in integration_seq or 'n' in integration_seq:
            continue
        integration_sequences.append(integration_seq)
        # save location
        integration_locations.append((int_chrom, int_pos))
    simulation_integration_parameters[sim_label] = integration_locations
    with open(f'{pwd}SimulationData/{sim_label}.fa', 'w') as sim:
        for location, seq in zip(integration_locations, integration_sequences):
            sim.write(f'>{location[0]}_{location[1]}\n')
            sim.write(f'{seq}\n')


In [21]:
def make_directory(directory_path):
    try:
        os.makedirs(directory_path, exist_ok=False)
    except FileExistsError:
        return None

In [22]:
simulation_output = f'{pwd}SimulationData/IntegrationSim/'
make_directory(simulation_output)

In [23]:
for sim_label in simulation_integration_parameters:
    meth_sim = SimulateMethylatedReads(reference_file=f'{pwd}SimulationData/{sim_label}.fa',
                                       art_path=art_path, 
                                       output_path=f'{simulation_output}{sim_label}',
                                       paired_end=True,
                                       read_length=150,
                                       read_depth=40,
                                       insertion_rate1=0.000, insertion_rate2=0.000,
                                       deletion_rate1=0.000, deletion_rate2=0.000,
                                       undirectional=False,
                                       pe_fragment_size=500)
    meth_sim.run_simulation()

Setting Cytosine Methylation
Simulating Illumina Reads
Simulating Methylated Illumina Reads
Finished Simulation
Setting Cytosine Methylation
Simulating Illumina Reads
Simulating Methylated Illumina Reads
Finished Simulation
Setting Cytosine Methylation
Simulating Illumina Reads
Simulating Methylated Illumina Reads
Finished Simulation
Setting Cytosine Methylation
Simulating Illumina Reads
Simulating Methylated Illumina Reads
Finished Simulation
Setting Cytosine Methylation
Simulating Illumina Reads
Simulating Methylated Illumina Reads
Finished Simulation
Setting Cytosine Methylation
Simulating Illumina Reads
Simulating Methylated Illumina Reads
Finished Simulation


In [24]:
simulation_index = f'{pwd}SimulationData/SimulationIndex/'
if not os.path.exists(simulation_index):
    sim_index = WholeGenomeIndexBuild(reference_file=f'{pwd}SimulationData/hg38_sim.fa', genome_database=simulation_index, bowtie2_path=bt2_path, bowtie2_threads=10)
    sim_index.generate_bsb_database()

## Get control mapping sites for all reads
- get simulated mapping locations
- indentfiy simulated reads the span integration sites

In [25]:
def sam_iterator(sam_file):
    with open(sam_file, 'r') as sam:
        while True:
            line1 = sam.readline()
            if not line1.strip():
                break
            if line1[0] == '@':
                continue
            line2 = sam.readline()
            yield line1.strip().split('\t'), line2.strip().split('\t')

In [26]:
def assess_overlap(span, pos):
    if span[0] < pos < span[1]:
        return True
    return False

In [27]:
integration_reads = {sim_label: {} for sim_label in simulation_integration_parameters}

for sim_label in tqdm(simulation_integration_parameters):
    # need vector length to get the left and right vector boundaries 
    vector_length = len(vector_seq['_'.join(sim_label.split('_')[1:])])
    for line in sam_iterator(f'{simulation_output}{sim_label}.sam'):
        read_len_1 = get_mapping_length(convert_alpha_numeric_cigar(line[0][5]))
        read_len_2 = get_mapping_length(convert_alpha_numeric_cigar(line[1][5]))
        assert line[0][0] == line[1][0]
        assert line[0][2] == line[1][2]
        # normalize coordinates and convert types
        chromosome, vector_pos = line[0][2].split('_')
        vector_pos = int(vector_pos)
        read_1_pos = int(line[0][3])
        read_2_pos = int(line[1][3])
        read_span = [read_1_pos, read_2_pos + read_len_2]
        if line[0][1] == '83':
            read_span = [read_2_pos, read_1_pos + read_len_1]
        assert read_span[0] < read_span[1]
        vector_spans = (assess_overlap(read_span, 1000), assess_overlap(read_span, 1000 + vector_length))
        assert sum(vector_spans) <= 1
        if any(vector_spans):
            vector_boundary = 1000 if vector_spans[0] else 1000 + vector_length
            integration_label = None
            vector_label = None
            if assess_overlap((read_1_pos, read_1_pos + read_len_1), vector_boundary):
                vector_label = ('split_1', chromosome, vector_pos, read_1_pos - vector_boundary)
            elif assess_overlap((read_2_pos, read_2_pos + read_len_2), vector_boundary):
                vector_label = ('split_2', chromosome, vector_pos, read_2_pos - vector_boundary)
            else:
                if assess_overlap((1000, 1000 + vector_length), read_1_pos):
                    vector_label = ('discordant_1', chromosome, vector_pos, read_1_pos - vector_boundary, read_2_pos - vector_boundary)
                else:
                    vector_label = ('discordant_2', chromosome, vector_pos, read_1_pos - vector_boundary, read_2_pos - vector_boundary) 
            if vector_label:
                integration_reads[sim_label][line[0][0]] = vector_label

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [28]:
[len(x) for x in integration_reads.values()]

[2563, 3282, 2198, 2032, 2185, 2484]

Some Bowtie 2 options specify a function rather than an individual number or setting. In these cases the user specifies three parameters: (a) a function type F, (b) a constant term B, and (c) a coefficient A. The available function types are constant (C), linear (L), square-root (S), and natural log (G). The parameters are specified as F,B,A - that is, the function type, the constant term, and the coefficient are separated by commas with no whitespace. The constant term and coefficient may be negative and/or floating-point numbers.

For example, if the function specification is L,-0.4,-0.6, then the function defined is:

f(x) = -0.4 + -0.6 * x

If the function specification is G,1,5.4, then the function defined is:

f(x) = 1.0 + 5.4 * ln(x)

In [31]:
40 + 8 * np.log(150)

80.08508235277004

## Align Simulated Bisulfite Sequencing Data
- align simulated data with different minimun alignment scores 

In [32]:
score_min = ['40', '80', '120', '160', '200', '240', '280']

In [33]:
test_min = 'G,40,8'

In [35]:
# Align simulated data

sim_alignment_stats = {}
for min_score in tqdm(score_min):
    for sim_label in simulation_integration_parameters:
        bsbolt_alignment_command = ['python3', '-m', 'BSBolt', 'Align', '-BT2-p', '10', '-F1', f'{simulation_output}{sim_label}_meth_1.fastq', '-F2', f'{simulation_output}{sim_label}_meth_2.fastq',
                                    '-O', f'{simulation_output}{sim_label}_{min_score}', '-BT2-local', '-DB', simulation_index, '-discord', '-BT2-score-min', f'L,{min_score},0', '-S', '-BT2-X', '700']
        sim_align = subprocess.Popen(bsbolt_alignment_command, stdout=subprocess.PIPE, universal_newlines=True)
        alignment_stats = []
        for line in iter(sim_align.stdout.readline, ''):
            alignment_stats.append(line)
        sim_alignment_stats[f'{sim_label}_{min_score}'] = alignment_stats

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




In [67]:
?? get_spanning_reads

[0;31mSignature:[0m  [0mget_spanning_reads[0m[0;34m([0m[0mfile_path[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m [0mplasmid_names[0m[0;34m:[0m [0mset[0m [0;34m=[0m [0;32mNone[0m[0;34m)[0m [0;34m->[0m [0mdict[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mget_spanning_reads[0m[0;34m([0m[0mfile_path[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m [0mplasmid_names[0m[0;34m:[0m [0mset[0m [0;34m=[0m [0;32mNone[0m[0;34m)[0m [0;34m->[0m [0mdict[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mmapped_reads[0m [0;34m=[0m [0;34m{[0m[0;34m}[0m[0;34m[0m
[0;34m[0m    [0;32mfor[0m [0msam_read[0m [0;32min[0m [0mstream_mapped_reads[0m[0;34m([0m[0mfile_path[0m[0;34m,[0m [0mincluded_flag[0m[0;34m=[0m[0;36m4[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mQNAME[0m[0;34m,[0m [0mFLAG[0m[0;34m,[0m [0mRNAME[0m[0;3

### Get Dicordant and Split Vector Reads

In [73]:
sample_spanning_reads = {}

for sim_label in tqdm(sim_alignment_stats):
    plasmid_reads = get_spanning_reads(f'{simulation_output}{sim_label}.sorted.bam', {'pMSGV1_MART1TCR', 'pMSGV1_1G4_A_LY_RetroNYESO1'})
    sample_spanning_reads[sim_label] = plasmid_reads

HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




In [77]:
# clean reads mapping to different vector

for sim_label in tqdm(sim_alignment_stats):
    vector = '_'.join(sim_label.split('_')[1:-1])
    plasmid_reads = sample_spanning_reads[sim_label]
    cleaned_plasmid_reads = {}
    for read_name, read_group in plasmid_reads.items():
        cleaned_group = []
        vector_mapping = False
        for read in read_group:
            if read[2][0:3] == 'chr':
                cleaned_group.append(read)
            elif read[2] == vector:
                vector_mapping = True
                cleaned_group.append(read)
        if vector_mapping:
            cleaned_plasmid_reads[read_name] = cleaned_group
    sample_spanning_reads[sim_label] = cleaned_plasmid_reads

HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




In [88]:
int_processor = ProcessVectorSpanningReads(multibase_threshold=0.2)

good_call = 0
bad_call = 0

for sim_label in tqdm(sim_alignment_stats):
    label = '_'.join(sim_label.split('_')[1:])
    for read_label, read_group in sample_spanning_reads[sim_label].items():
        control_info = integration_reads[label].get(read_label, False)
        called_int = int_processor.get_integration_sites(read_group, vector=vector)
        if called_int:
            print('yes')
        if control_info and called_int:
            good_call += 1
        elif not control_info and called_int:
            bad_call += 1

HBox(children=(IntProgress(value=0, max=42), HTML(value='')))

KeyError: 'pMSGV1_1G4_A_LY_RetroNYESO1_80'

In [55]:
sum([len(x) for x in integration_reads.values()])

14744

In [56]:
bad_call

0