# Combine files for TWIST order

(c) 2021 Tom Röschinger. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

***

In this notebook we combine the individually prepared sequences for the various experiments into the final Twist order. To do so, we add orthogonal primers to the ends of all inserts. We also add additional reverse primers for the different constructs within an experiment, so that they can be amplified individually. Finally, we may add restriction sites to the constructs so that the primers can be removed from the transcript.

In [10]:
import wgregseq
%load_ext autoreload
%autoreload 2

import Bio
from Bio.SeqIO import parse
from Bio.Seq import Seq
from Bio.Restriction import *

from itertools import compress

import pandas as pd
import numpy as np
import copy

from ast import literal_eval as make_tuple

import glob

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Import

Import all `csv` files in the `data/twist_order` folder in this repo.

In [11]:
# Gather every CSV file in the twist_order data folder and list them in
# sorted order so the result does not depend on the file system's ordering.
csv_paths = glob.glob("../../../data/twist_order/*.csv")
file_list = sorted(csv_paths)
file_list

['../../../data/twist_order/lacI_sequences.csv',
 '../../../data/twist_order/lacUV5_tetOx_single_double_mutants.csv',
 '../../../data/twist_order/lacUV_mutants.csv',
 '../../../data/twist_order/natural_tet_promoters_mutated.csv',
 '../../../data/twist_order/purR_twist_sequences.csv',
 '../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_long.csv',
 '../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_short.csv',
 '../../../data/twist_order/twist_orbit_TF_del_first_last_long.csv',
 '../../../data/twist_order/twist_orbit_TF_del_first_last_short.csv',
 '../../../data/twist_order/twist_site_scrambles.csv',
 '../../../data/twist_order/twist_site_scrambles_formatted.csv',
 '../../../data/twist_order/twist_sys_scrambles_10.csv',
 '../../../data/twist_order/twist_sys_scrambles_2_16.csv']

Import each data frame.

In [12]:
# Read every file found above; the first CSV column becomes the row index.
df_list = [pd.read_csv(csv_path, index_col=0) for csv_path in file_list]

### Primers

We add a reverse primer for each individual construct if there is space in the oligo. This enables us to amplify a certain subset of the constructs. For this purpose we simply use the `add_primers` function with the additional keyword argument `rev_only`.

In [13]:
def do_all(file_list, add_primer, add_sec_primer, primer=100, sec_rev_primer=200, primer_skip_list=None):
    """Combine the per-experiment sequence files into one oligo pool table.

    Each file is read with ``pd.read_csv(file, index_col=0)`` and must contain
    a ``seq`` column. If a file has no ``construct`` column, a ``description``
    column (or, failing that, a ``name`` column) is renamed to ``construct``.

    Depending on the per-file flags, one of three things happens:

    * ``add_primer[i]`` is falsy: the sequences are kept unchanged and the
      whole file is assigned a single ``construct_ID``. A
      ``reverse_primers_1`` column of ``(None, None)`` is added if missing,
      and every column whose name contains ``"primer"`` and holds strings is
      parsed into Python tuples.
    * ``add_primer[i]`` is truthy and ``add_sec_primer[i]`` is falsy: one
      primer pair is added to all sequences with ``wgregseq.add_primers`` and
      the whole file is assigned a single ``construct_ID``.
    * both are truthy: one primer pair is added to all sequences, and each
      unique value of ``construct`` additionally gets its own reverse primer
      (``rev_only=True``) and its own ``construct_ID``.

    Parameters
    ----------
    file_list : sequence of str
        Paths of the CSV files to combine, processed in the given order.
    add_primer : sequence of bool
        One flag per file; see above.
    add_sec_primer : sequence of bool
        One flag per file; only consulted when ``add_primer[i]`` is truthy.
    primer : int, default 100
        Primer index handed to ``wgregseq.add_primers`` for the first file.
        It is incremented after every file (also for files that get no
        primers), skipping values contained in `primer_skip_list`.
    sec_rev_primer : int, default 200
        Primer index of the first construct-specific reverse primer. It is
        incremented after every construct, skipping values contained in
        `primer_skip_list`.
    primer_skip_list : container of int, optional
        Primer indices that must not be used. Only values reached by
        incrementing are checked; the starting values of `primer` and
        `sec_rev_primer` are used as given.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-file / per-construct tables with a fresh
        integer index.
    """
    # `None` replaces a mutable default argument; the list is only read.
    skip = () if primer_skip_list is None else primer_skip_list

    df_list = []
    construct_id = 0
    # Iterate through files
    for i, file in enumerate(file_list):
        # Fix column headers
        df = pd.read_csv(file, index_col=0)
        if ('description' in df.columns) and ('construct' not in df.columns):
            df = df.rename(columns={'description': "construct"})
        if ('name' in df.columns) and ('construct' not in df.columns):
            df = df.rename(columns={'name': "construct"})

        seqs = np.array(df['seq'].values, dtype=object)

        # Keep the file as it is if no primers are added
        if not add_primer[i]:
            df['construct_ID'] = [construct_id] * len(seqs)
            construct_id += 1
            df['seq'] = seqs
            if 'reverse_primers_1' not in df.columns:
                df['reverse_primers_1'] = [(None, None)] * len(seqs)
            primer_columns = [x for x in df.columns if "primer" in x]
            for col in primer_columns:
                # Tuples come back from CSV as strings such as "(100, 0)".
                # Look at the first row by position: the index read from the
                # file is not guaranteed to contain the label 0.
                if len(df) > 0 and isinstance(df[col].iloc[0], str):
                    df[col] = [make_tuple(x) for x in df[col]]
            df_list.append(df)
        # Add primer pairs
        elif not add_sec_primer[i]:
            forward_primers_0 = [(primer, 0)] * len(seqs)
            # Position is taken from the first sequence before primers are
            # added, plus 20 (presumably the forward primer length, and
            # assuming all sequences in a file share one length -- confirm).
            reverse_primers_0 = [(primer, len(seqs[0]) + 20)] * len(seqs)
            seqs = wgregseq.add_primers(seqs, primer, autocomplete=False)
            construct_id_column = [construct_id] * len(seqs)
            construct_column = df['construct'].values
            reverse_primers_1 = [(None, None)] * len(seqs)

            construct_id += 1
            df_list.append(
                pd.DataFrame(
                    {'seq': seqs,
                     'forward_primers_0': forward_primers_0,
                     'reverse_primers_0': reverse_primers_0,
                     'reverse_primers_1': reverse_primers_1,
                     'construct_ID': construct_id_column,
                     'construct': construct_column
                    }))

        # Add individual construct primer and primer pairs
        else:
            seqs = wgregseq.add_primers(seqs, primer)
            for construct in df['construct'].unique():
                indices = np.nonzero((df['construct'] == construct).values)
                n_seqs = np.size(indices[0])
                # Positions are derived from the sequence length *before* the
                # construct-specific reverse primer is appended below.
                current_length = len(seqs[indices][0])
                forward_primers_0 = [(primer, 0)] * n_seqs
                reverse_primers_0 = [(primer, current_length - 20)] * n_seqs
                reverse_primers_1 = [(sec_rev_primer, current_length)] * n_seqs
                seqs[indices] = wgregseq.add_primers(seqs[indices], sec_rev_primer, rev_only=True, autocomplete=False)
                construct_id_column = [construct_id] * n_seqs
                construct_column = [construct] * n_seqs

                sec_rev_primer += 1
                while sec_rev_primer in skip:
                    sec_rev_primer += 1
                construct_id += 1

                df_list.append(
                    pd.DataFrame(
                        {'seq': seqs[indices],
                         'forward_primers_0': forward_primers_0,
                         'reverse_primers_0': reverse_primers_0,
                         'reverse_primers_1': reverse_primers_1,
                         'construct_ID': construct_id_column,
                         'construct': construct_column
                        }))

        # Every file consumes one primer index, whether or not it was used.
        primer += 1
        while primer in skip:
            primer += 1
        print("File {} done.".format(file))
    return pd.concat(df_list, ignore_index=True)

In [14]:
# One row per input file, in processing order:
# (path, add a primer pair, add a construct-specific reverse primer).
order_plan = [
    ('../../../data/twist_order/lacI_sequences.csv', True, True),
    ('../../../data/twist_order/lacUV5_tetOx_single_double_mutants.csv', True, True),
    ('../../../data/twist_order/lacUV_mutants.csv', True, True),
    ('../../../data/twist_order/natural_tet_promoters_mutated.csv', True, True),
    ('../../../data/twist_order/purR_twist_sequences.csv', True, True),
    ('../../../data/twist_order/twist_site_scrambles_formatted.csv', True, False),
    ('../../../data/twist_order/twist_sys_scrambles_2_16.csv', True, False),
    ('../../../data/twist_order/twist_sys_scrambles_10.csv', True, False),
    ('../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_long.csv', False, False),
    ('../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_short.csv', False, False),
    ('../../../data/twist_order/twist_orbit_TF_del_first_last_long.csv', False, False),
    ('../../../data/twist_order/twist_orbit_TF_del_first_last_short.csv', False, False),
]

df = do_all(
    [csv_path for csv_path, _pair, _rev in order_plan],
    add_sec_primer=[rev_flag for _path, _pair, rev_flag in order_plan],
    add_primer=[pair_flag for _path, pair_flag, _rev in order_plan],
    primer_skip_list=[101],
)

File ../../../data/twist_order/lacI_sequences.csv done.
File ../../../data/twist_order/lacUV5_tetOx_single_double_mutants.csv done.
File ../../../data/twist_order/lacUV_mutants.csv done.
File ../../../data/twist_order/natural_tet_promoters_mutated.csv done.
File ../../../data/twist_order/purR_twist_sequences.csv done.
File ../../../data/twist_order/twist_site_scrambles_formatted.csv done.
File ../../../data/twist_order/twist_sys_scrambles_2_16.csv done.
File ../../../data/twist_order/twist_sys_scrambles_10.csv done.
File ../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_long.csv done.
File ../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_short.csv done.
File ../../../data/twist_order/twist_orbit_TF_del_first_last_long.csv done.
File ../../../data/twist_order/twist_orbit_TF_del_first_last_short.csv done.


In [15]:
# Show the first rows of the combined table returned by do_all.
df.head()

Unnamed: 0,seq,forward_primers_0,reverse_primers_0,reverse_primers_1,construct_ID,construct
0,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
1,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
2,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
3,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
4,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant


In [16]:
# Show the last rows of the combined table returned by do_all.
df.tail()

Unnamed: 0,seq,forward_primers_0,reverse_primers_0,reverse_primers_1,construct_ID,construct
32329,ccgtagataacacaacgcagtgctTAAGGGCATCTGTTTTTTATAT...,"(469, 0)","(349, 152)","(None, None)",22,orbit_TF_del_first_last_short
32330,ccgtagataacacaacgcagtgctATATGAGTGTCGAATCCTTATC...,"(469, 0)","(349, 152)","(None, None)",22,orbit_TF_del_first_last_short
32331,ccgtagataacacaacgcagtgctAGCCATGCACCGTAGACCAGAT...,"(469, 0)","(349, 152)","(None, None)",22,orbit_TF_del_first_last_short
32332,ccgtagataacacaacgcagtgctGGTTATTTAACGGCGCGAGTGT...,"(469, 0)","(349, 152)","(None, None)",22,orbit_TF_del_first_last_short
32333,ccgtagataacacaacgcagtgctGGTAAAGTAAGGACATTCTTAA...,"(469, 0)","(349, 152)","(None, None)",22,orbit_TF_del_first_last_short


In [17]:
def check_pool_df(df, enzymes):
    """Run sanity checks on an oligo pool table and write a text report.

    The report is printed and also written to ``oligo_pool_check.txt`` in the
    current working directory (an existing file is overwritten). It contains
    the result of ``wgregseq.check_primers_pool_df(df)`` and, for every group
    of rows sharing a ``construct_ID``:

    * the unique ``construct`` names in the group,
    * whether all oligos in the group have the same length,
    * the forward and reverse primer indices found in the columns whose names
      contain ``"forward_primer"`` / ``"reverse_primer"``,
    * whether the sequences, sliced from the largest forward primer position
      to the largest reverse primer position plus 20, are unique,
    * the output of ``wgregseq.scan_enzymes_print`` for those slices.

    Parameters
    ----------
    df : pandas.DataFrame
        Pool table with ``seq``, ``construct`` and ``construct_ID`` columns
        plus primer columns holding ``(primer_index, position)`` tuples;
        ``(None, None)`` entries are ignored.
    enzymes : list
        Passed through unchanged to ``wgregseq.scan_enzymes_print``
        (presumably restriction enzyme names -- confirm against wgregseq).

    Returns
    -------
    None
    """
    separator = "---------------------------------------------\n \n"

    # Collect the report in pieces and join once at the end.
    report = []
    if wgregseq.check_primers_pool_df(df):
        report.append("Primer check passed.\n\n")
    else:
        report.append("Primer check DID NOT pass!\n\n")
    report.append(separator)

    for _, sub_df in df.groupby("construct_ID"):
        report.append("Constructs: {}\n\n".format(np.unique(sub_df.construct)))

        oligo_lengths = np.unique([len(x) for x in sub_df.seq])
        if len(oligo_lengths) == 1:
            report.append("Oligo length check passed! Length: {}\n\n".format(oligo_lengths[0]))
        else:
            report.append("Oligo length check DID NOT pass. Lengths: {}\n\n".format(oligo_lengths))

        rev_primer_columns = [x for x in sub_df.columns if "reverse_primer" in x]
        end = np.max([
            pos
            for rev_column in rev_primer_columns
            for (_, pos) in sub_df[rev_column]
            if pos is not None
        ])
        fwd_primer_columns = [x for x in sub_df.columns if "forward_primer" in x]
        start = np.max([
            pos
            for fwd_column in fwd_primer_columns
            for (_, pos) in sub_df[fwd_column]
            if pos is not None
        ])

        fwd_primers = np.unique([
            idx
            for fwd_column in fwd_primer_columns
            for (idx, _) in sub_df[fwd_column]
            if idx is not None
        ])
        report.append("Forward Primers: {}.\n".format(fwd_primers))
        rev_primers = np.unique([
            idx
            for rev_column in rev_primer_columns
            for (idx, _) in sub_df[rev_column]
            if idx is not None
        ])
        report.append("Reverse Primers: {}.\n".format(rev_primers))

        # Slice from the last forward primer position to 20 past the last
        # reverse primer position (presumably 20 is the primer length).
        seqs = [seq[start:end + 20] for seq in sub_df.seq]
        n_unique = len(np.unique(seqs))
        if n_unique != len(seqs):
            report.append("Uniqueness check DID NOT pass.\n")
            report.append("{} duplicated sequences. Check manually.\n".format(len(seqs) - n_unique))
        else:
            report.append("Uniqueness check passed.\n")

        report.append("Restriction enzyme sites:\n")
        report.append(wgregseq.scan_enzymes_print(seqs, enzymes) + "\n")
        report.append(separator)

    message = "".join(report)
    print(message)
    with open("oligo_pool_check.txt", "w") as text_file:
        text_file.write(message)

In [None]:
# Enzymes handed to check_pool_df for the restriction-site scan of the pool.
enzymes_to_scan = ["SacI", "SalI", "XhoI", "SbfI", "ApaI", "NheI", "BsaI"]
check_pool_df(df, enzymes_to_scan)

In [None]:
# Keep only the sequence and construct name columns.
columns_to_export = ['seq', 'construct']
df_to_file = df[columns_to_export]
df_to_file

In [None]:
# Show the last rows of the full pool table (all columns, not just the export subset).
df.tail()