# Shift the retroviral reference sequences so that each is split within its ampicillin resistance (AmpR) sequence

In [1]:
import gzip
import io

In [2]:
# Prefix concatenated directly in front of each vector file name when it is
# opened (f'{wd}{vector}'); a non-empty value must end with a path separator.
# The empty string resolves the files relative to the current working directory.
wd = ''

In [3]:
# File names of the two vector sequence text files that are read into
# `vector_seq`. Plain string literals: there is nothing to interpolate, so no
# f-prefix is needed.
nyes_ref = 'pMSGV1-1G4_A_LY_RetroNYESO1_sequence.txt'
mart_ref = 'pMSGV1-MART1TCR.txt'

In [4]:
# Maps a vector label (file name with '-' replaced by '_' and '.txt' removed)
# to the vector sequence as a single string.
vector_seq = {}

for vector in [nyes_ref, mart_ref]:
    vector_label = vector.replace('-', '_').replace('.txt', '')
    with open(f'{wd}{vector}', 'r') as vec_seq:
        # From every line drop surrounding whitespace, internal spaces and all
        # digit characters (presumably position counters in the sequence
        # file — verify against the input format). Collect the cleaned pieces
        # and join once instead of growing a string with repeated `+`.
        seq_parts = [
            ''.join(base for base in line.strip().replace(' ', '') if not base.isdigit())
            for line in vec_seq
        ]
    seq = ''.join(seq_parts)
    vector_seq[vector_label] = seq

In [5]:
# (start, end) of the AmpR feature on the NYESO1 vector; the same span is
# listed under 'AmpR' in `nyes_annotations`.
ampr_nyes = (3748, 4407)

In [6]:
# (start, end) of the AmpR feature on the MART1 vector: it starts at 5653 and
# spans 659 positions, the same length as the NYESO1 AmpR span (4407 - 3748).
ampr_mart = (5653, 5653 + 659)

In [7]:
# Split point for each vector: the midpoint of its AmpR span, rounded down to
# an integer index. The reference is later rotated so that it starts here.
ampr_nyes_split = (ampr_nyes[0] + ampr_nyes[1]) // 2
ampr_mart_split = (ampr_mart[0] + ampr_mart[1]) // 2

In [8]:
def shift_string(string, split_point):
    """Rotate *string* so that it starts at index *split_point*.

    The part from ``split_point`` to the end is moved in front of the part
    before ``split_point``; the length and content are otherwise unchanged.
    A ``split_point`` at or beyond the end returns the input unchanged.
    """
    return string[split_point:] + string[:split_point]

In [9]:
# Rotate each vector sequence so that it begins at its AmpR midpoint.
shifted_nyes = shift_string(
    vector_seq['pMSGV1_1G4_A_LY_RetroNYESO1_sequence'],
    ampr_nyes_split,
)
shifted_mart = shift_string(
    vector_seq['pMSGV1_MART1TCR'],
    ampr_mart_split,
)

In [27]:
# Maps each FASTA header (the text after '>') to the full contig sequence as
# one string, in file order.
hg38 = {}

# Text mode ('rt') makes gzip decode the lines itself, so no separate
# BufferedReader / bytes.decode step is needed.
with gzip.open('/u/nobackup/mcdbscratch/colinpat/Rebis/hg38_lambda.fa.gz', 'rt', encoding='utf-8') as genome:
    chrom = None
    contig_chunks = []
    for line in genome:
        processed_line = line.strip()
        if not processed_line:
            # Blank lines carry no sequence; indexing them would raise IndexError.
            continue
        if processed_line.startswith('>'):
            # A new header closes the contig collected so far.
            if chrom is not None:
                hg38[chrom] = ''.join(contig_chunks)
            chrom = processed_line[1:]
            contig_chunks = []
        else:
            # Collect the pieces and join once per contig rather than growing
            # a chromosome-sized string one line at a time.
            contig_chunks.append(processed_line)
    # Store the final contig; skipped when the file had no header at all, so
    # no entry is ever created under the key None.
    if chrom is not None:
        hg38[chrom] = ''.join(contig_chunks)

In [28]:
# Write two combined references. Each holds every contig loaded into `hg38`
# followed by one extra vector contig, one header line and one sequence line
# per contig:
#   hg38_nyes_cut -> contig 'nyes' = shifted_nyes[1250:6001]
#   hg38_mart     -> contig 'mart' = the full shifted_mart
combined_references = (
    ('/u/nobackup/mcdbscratch/colinpat/Rebis/hg38_nyes_cut', 'nyes', shifted_nyes[1250:6001]),
    ('/u/nobackup/mcdbscratch/colinpat/Rebis/hg38_mart', 'mart', shifted_mart),
)

for reference_path, vector_name, vector_contig in combined_references:
    with open(reference_path, 'w') as genome:
        for key, value in hg38.items():
            genome.write(f'>{key}\n{value}\n')
        genome.write(f'>{vector_name}\n{vector_contig}\n')

In [29]:
# Feature name -> (start, end) on the unshifted NYESO1 vector.
# NOTE(review): coordinates look like positions taken from a plasmid map
# (presumably 1-based) — confirm before using them as Python string indices.
nyes_annotations = {
    'SD': (6184, 6197),
    '3_LTR': (1921, 2435),
    '5_LTR': (5608, 6121),
    'SA': (7133, 7157),
    '1G4_alpha': (3, 824),
    '1G4_beta': (906, 1841),
    'P2A': (849, 905),
    'Furin': (825, 836),
    'SGSG': (837, 848),
    'M13_fwd': (5183, 5200),
    'ColE1_origin': (2968, 3650),
    'LacZ_alpha': (5044, 5112),
    'LacO': (2606, 2628),
    'AmpR': (3748, 4407),
    'MSCV_Prom_1': (1920, 2301),
    'MSCV_Prom_2': (5606, 5987),
    'Retro_NYESO1_F': (774, 794),
    'Retro_NYESO1_R': (870, 884),
    '2A_Junction_Probe': (822, 843),
    'Primer_Cloning_1': (684, 704),
    'Primer_Cloning_2': (955, 975),
}

In [30]:
# Feature name -> (start, end) on the unshifted MART1 vector.
# NOTE(review): coordinates look like positions taken from a plasmid map
# (presumably 1-based, since one feature starts at 1) — confirm before using
# them as Python string indices.
mart_annotations = {
    '5_LTR': (382, 515),
    'MSCV_promoter_1': (1, 381),
    'SD': (578, 591),
    'SA': (1527, 1551),
    'Furin': (2641, 2652),
    'P2A': (2704, 2760),
    'Mart1_TCR_alpha': (1822, 2703),
    'MART1_TCR_b_chain': (2761, 3747),
    '3_LTR': (4207, 4340),
    'MSCV_promoter_2': (3826, 4206),
    'LacZ_alpha': (6949, 7017),
    'LacO': (4511, 4533),
    'AmpR': (5653, 6312),
}

In [31]:
# Echo the MART1 split index (the AmpR midpoint) as the cell output.
ampr_mart_split

5982

In [32]:
# Re-express an annotation span in the coordinates of the shifted sequence.
def adjust_annotation(span, split_point, vector_len):
    """Map the first two sites of *span* onto the rotated vector.

    A site at or after ``split_point`` moves towards the start by
    ``split_point``; a site before it moves behind the relocated tail, i.e.
    ``(vector_len - split_point) + site``. A span that straddles the split
    point therefore comes back with its start greater than its end.

    NOTE(review): the arithmetic treats sites like the 0-based indices used
    by the string rotation; if the annotation coordinates are 1-based, a
    site exactly equal to ``split_point`` would map to 0 — confirm.

    Returns a 2-tuple ``(start, end)``; raises IndexError when *span* has
    fewer than two sites.
    """
    remapped = [
        site - split_point if site >= split_point else (vector_len - split_point) + site
        for site in span
    ]
    return (remapped[0], remapped[1])

In [33]:
# MART1 feature spans re-expressed in the coordinates of the shifted sequence.
adjusted_mart_annotation = {
    annotation: adjust_annotation(span, ampr_mart_split, len(shifted_mart))
    for annotation, span in mart_annotations.items()
}

In [34]:
# NYESO1 feature spans re-expressed in the coordinates of the shifted sequence.
# NOTE(review): the 'nyes' contig written to hg38_nyes_cut is
# shifted_nyes[1250:6001], while these spans are relative to the full shifted
# sequence — confirm that downstream use accounts for the 1250 offset.
adjusted_nyes_annotation = {
    annotation: adjust_annotation(span, ampr_nyes_split, len(shifted_nyes))
    for annotation, span in nyes_annotations.items()
}