# Combine files for TWIST order

(c) 2021 Tom Röschinger. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

***

In this notebook we combine the individually prepared sequences for the various experiments into the final Twist order. To do so, we add orthogonal primers to the ends of all inserts. We also add extra reverse primers for the different constructs within an experiment, so that each construct can be amplified individually. In addition, we might add restriction sites to the constructs to remove the primers from the transcript.

In [1]:
import wgregseq
%load_ext autoreload
%autoreload 2

import Bio
from Bio.SeqIO import parse
from Bio.Seq import Seq
from Bio.Restriction import *

from itertools import compress

import pandas as pd
import numpy as np
import copy

from ast import literal_eval as make_tuple

import glob

### Import

Import all `csv` files in the `data/twist_order` folder in this repo.

In [2]:
# Gather every CSV in the twist_order data folder; sort in place so the
# listing is stable across runs and platforms.
file_list = glob.glob("../../../data/twist_order/*.csv")
file_list.sort()
file_list

['../../../data/twist_order/lacI_sequences.csv',
 '../../../data/twist_order/lacUV5_tetOx_single_double_mutants.csv',
 '../../../data/twist_order/lacUV_mutants.csv',
 '../../../data/twist_order/natural_tet_promoters_mutated.csv',
 '../../../data/twist_order/purR_twist_sequences.csv',
 '../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_long.csv',
 '../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_short.csv',
 '../../../data/twist_order/twist_orbit_TF_del_first_last_long.csv',
 '../../../data/twist_order/twist_orbit_TF_del_first_last_short.csv',
 '../../../data/twist_order/twist_site_scrambles.csv',
 '../../../data/twist_order/twist_site_scrambles_formatted.csv',
 '../../../data/twist_order/twist_sys_scrambles_10.csv',
 '../../../data/twist_order/twist_sys_scrambles_2_16.csv']

Import each data frame.

In [3]:
# One DataFrame per CSV, in the same order as `file_list`; the first CSV
# column is used as the row index.
df_list = [pd.read_csv(csv_path, index_col=0) for csv_path in file_list]

### Primers

We add a reverse primer for each individual construct if there is space in the oligo. This enables us to amplify a certain subset of the constructs. For this purpose we simply use the `add_primers` function with the additional keyword argument `rev_only`.

In [4]:
def do_all(file_list, add_primer, add_sec_primer, primer=100, sec_rev_primer=200, primer_skip_list=None, oligo_length=200):
    """Assemble the combined oligo pool from the per-experiment CSV files.

    Each file is read with ``pd.read_csv(file, index_col=0)`` and handled in
    one of three ways, selected by the flags at the same position as the file:

    * ``add_primer[i]`` is falsy: no primers are added. Every sequence is
      extended with ``wgregseq.gen_rand_seq(oligo_length - len(seq))``, a
      ``construct_ID`` column is added, a ``reverse_primers_1`` column of
      ``(None, None)`` is added if missing, and primer columns stored as
      strings (e.g. ``"(390, 0)"``) are parsed back into tuples. All other
      columns of the file are kept.
    * ``add_primer[i]`` is truthy and ``add_sec_primer[i]`` is falsy: one
      primer pair is added to all sequences of the file, which share a single
      ``construct_ID``.
    * both are truthy: the primer pair is added, and every unique value of
      the ``construct`` column additionally receives its own reverse primer
      and its own ``construct_ID``.

    Parameters
    ----------
    file_list : sequence of str
        Paths to the CSV files. Each needs a ``seq`` column; when primers are
        added a ``construct`` column is needed too (a ``description`` or
        ``name`` column is renamed to ``construct`` if it is absent).
    add_primer : sequence of bool
        Per-file flag, indexed like ``file_list``.
    add_sec_primer : sequence of bool
        Per-file flag, indexed like ``file_list``; only consulted where
        ``add_primer[i]`` is truthy.
    primer : int, default 100
        Primer index used for the first file. It advances by one after every
        file (including files that receive no primers), skipping indices in
        ``primer_skip_list``.
    sec_rev_primer : int, default 200
        Index of the first per-construct reverse primer. It advances by one
        after every construct, skipping indices in ``primer_skip_list``.
    primer_skip_list : collection of int, optional
        Primer indices that are stepped over when advancing. The two starting
        values themselves are not checked against it.
    oligo_length : int, default 200
        Target length that un-primed sequences are padded to.

    Returns
    -------
    pandas.DataFrame
        All per-file / per-construct frames concatenated with a fresh integer
        index. Primer columns hold ``(primer_index, position)`` tuples.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``file_list`` is empty.
    """
    # A ``None`` default avoids sharing one mutable list between calls.
    if primer_skip_list is None:
        primer_skip_list = ()

    # Primer length assumed by the position bookkeeping below.
    # NOTE(review): presumably matches what wgregseq.add_primers attaches - confirm.
    primer_len = 20

    def _next_free(index):
        """Return the next index after ``index`` that is not in the skip list."""
        index += 1
        while index in primer_skip_list:
            index += 1
        return index

    df_list = []
    construct_id = 0
    for i, file in enumerate(file_list):
        df = pd.read_csv(file, index_col=0)

        # Harmonise headers: some files label the construct as
        # 'description' or 'name' ('description' takes precedence).
        if 'construct' not in df.columns:
            if 'description' in df.columns:
                df = df.rename(columns={'description': "construct"})
            elif 'name' in df.columns:
                df = df.rename(columns={'name': "construct"})

        seqs = np.array(df['seq'].values, dtype=object)

        if not add_primer[i]:
            # No primers are added: pad, tag and pass the frame through.
            df['construct_ID'] = construct_id
            construct_id += 1
            df['seq'] = [seq + wgregseq.gen_rand_seq(oligo_length - len(seq)) for seq in df.seq]
            if 'reverse_primers_1' not in df.columns:
                df['reverse_primers_1'] = [(None, None)] * len(seqs)
            primer_columns = [col for col in df.columns if "primer" in col]
            for col in primer_columns:
                # Positional access: the frame's index comes from the CSV and
                # need not contain the label 0. Only the first entry is
                # inspected to decide whether the column needs parsing.
                if len(df) > 0 and isinstance(df[col].iloc[0], str):
                    df[col] = [make_tuple(entry) for entry in df[col]]
            df_list.append(df)

        elif not add_sec_primer[i]:
            # One primer pair for the whole file. Positions are derived from
            # the first sequence, taken before the primers are attached.
            n_seqs = len(seqs)
            rev_pos = len(seqs[0]) + primer_len
            seqs = wgregseq.add_primers(seqs, primer, autocomplete=True)
            df_list.append(
                pd.DataFrame(
                    {'seq': seqs,
                     'forward_primers_0': [(primer, 0)] * n_seqs,
                     'reverse_primers_0': [(primer, rev_pos)] * n_seqs,
                     'reverse_primers_1': [(None, None)] * n_seqs,
                     'construct_ID': [construct_id] * n_seqs,
                     'construct': df['construct'].values
                    }))
            construct_id += 1

        else:
            # Primer pair for the file plus one extra reverse primer per construct.
            seqs = wgregseq.add_primers(seqs, primer)
            for construct in df['construct'].unique():
                rows = np.flatnonzero((df['construct'] == construct).values)
                n_seqs = rows.size
                # Length after the first primer pair and before the second
                # reverse primer: the first reverse primer is the last
                # `primer_len` bases, and the second one starts right after it.
                primed_len = len(seqs[rows][0])
                construct_frame = {
                    'forward_primers_0': [(primer, 0)] * n_seqs,
                    'reverse_primers_0': [(primer, primed_len - primer_len)] * n_seqs,
                    'reverse_primers_1': [(sec_rev_primer, primed_len)] * n_seqs,
                    'construct_ID': [construct_id] * n_seqs,
                    'construct': [construct] * n_seqs,
                }
                seqs[rows] = wgregseq.add_primers(seqs[rows], sec_rev_primer, rev_only=True, autocomplete=True)
                df_list.append(pd.DataFrame({'seq': seqs[rows], **construct_frame}))

                sec_rev_primer = _next_free(sec_rev_primer)
                construct_id += 1

        # Every file consumes one primer index, whether or not primers were added.
        primer = _next_free(primer)
    return pd.concat(df_list, ignore_index=True)

In [5]:
def check_pool_df(df, enzymes, oligo_length=200, primer_length=20):
    """Print a quality-control report for an assembled oligo pool.

    The report consists of two pool-wide checks followed by one section per
    ``construct_ID`` group:

    * the result of ``wgregseq.check_primers_pool_df(df)``,
    * whether every entry of ``df.seq`` has length ``oligo_length``,
    * per group: construct names, the forward and reverse primer indices in
      use, whether the primed region of every oligo is unique, and the output
      of ``wgregseq.scan_enzymes_print`` for that region.

    The primed region of an oligo runs from the largest forward-primer
    position to the largest reverse-primer position plus ``primer_length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pool table with ``seq``, ``construct`` and ``construct_ID`` columns
        and primer columns whose names contain ``forward_primer`` or
        ``reverse_primer``; primer entries are ``(primer_index, position)``
        tuples, with ``None`` marking an absent primer.
    enzymes : list of str
        Restriction enzyme names handed to ``wgregseq.scan_enzymes_print``.
    oligo_length : int, default 200
        Expected length of every oligo.
    primer_length : int, default 20
        Length of a primer, used to include the last reverse primer in the
        checked region.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    if wgregseq.check_primers_pool_df(df):
        print("Primer check passed.\n")
    else:
        print("Primer check DID NOT pass!\n")

    if any(len(seq) != oligo_length for seq in df.seq):
        print("Oligo length check DID NOT pass!\n")
    else:
        print("Oligo length check passed.\n")
    print("-------------------")

    for _, sub_df in df.groupby("construct_ID"):
        print("Constructs: {}\n".format(np.unique(sub_df.construct)))

        rev_primer_columns = [col for col in sub_df.columns if "reverse_primer" in col]
        fwd_primer_columns = [col for col in sub_df.columns if "forward_primer" in col]

        # Outermost primer positions delimit the region that is checked.
        end = np.max([pos for col in rev_primer_columns for (_, pos) in sub_df[col] if pos is not None])
        start = np.max([pos for col in fwd_primer_columns for (_, pos) in sub_df[col] if pos is not None])

        fwd_ids = [idx for col in fwd_primer_columns for (idx, _) in sub_df[col] if idx is not None]
        rev_ids = [idx for col in rev_primer_columns for (idx, _) in sub_df[col] if idx is not None]
        print("Forward Primers: {}.\n".format(np.unique(fwd_ids)))
        print("Reverse Primers: {}.\n".format(np.unique(rev_ids)))

        seqs = [seq[start:end + primer_length] for seq in sub_df.seq]
        n_duplicates = len(seqs) - len(set(seqs))
        if n_duplicates:
            print("Uniqueness check DID NOT pass.")
            print("{} duplicated sequences. Check manually.\n".format(n_duplicates))
        else:
            print("Uniqueness check passed.\n")

        print("Restriction enzyme sites:")
        wgregseq.scan_enzymes_print(seqs, enzymes)
        print("----------------------------------------------------------")

In [6]:
# The flags below are matched to the files by position, and primer indices are
# handed out in file order, so reordering the files changes the assignment.
df = do_all(
    [
        # Files 1-5: primer pair plus one extra reverse primer per construct.
        '../../../data/twist_order/lacI_sequences.csv',
        '../../../data/twist_order/lacUV5_tetOx_single_double_mutants.csv',
        '../../../data/twist_order/lacUV_mutants.csv',
        '../../../data/twist_order/natural_tet_promoters_mutated.csv',
        '../../../data/twist_order/purR_twist_sequences.csv',
        # Files 6-8: primer pair only.
        '../../../data/twist_order/twist_site_scrambles_formatted.csv',
        '../../../data/twist_order/twist_sys_scrambles_2_16.csv',
        '../../../data/twist_order/twist_sys_scrambles_10.csv',
        # Files 9-12: no primers added by do_all.
        '../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_long.csv',
        '../../../data/twist_order/twist_orbit_TF_del_avd_ovlp_short.csv',
        '../../../data/twist_order/twist_orbit_TF_del_first_last_long.csv',
        '../../../data/twist_order/twist_orbit_TF_del_first_last_short.csv',
    ],
    add_primer=[True] * 8 + [False] * 4,
    add_sec_primer=[True] * 5 + [False] * 7,
    primer_skip_list=[101],
)

In [7]:
# Preview the first rows of the combined order table returned by do_all.
df.head()

Unnamed: 0,seq,forward_primers_0,reverse_primers_0,reverse_primers_1,construct_ID,construct
0,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
1,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
2,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
3,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant
4,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,"(100, 0)","(100, 82)","(200, 102)",0,lacUV5+O1_mutant


In [8]:
# Preview the last rows of the combined order table returned by do_all.
df.tail()

Unnamed: 0,seq,forward_primers_0,reverse_primers_0,reverse_primers_1,construct_ID,construct
26131,ttaatcttaggccccactggtTAAGGGCATCTGTTTTTTATATTCA...,"(390, 0)","(349, 149)","(None, None)",22,orbit_tf_del_FL_short
26132,ttaatcttaggccccactggtATATGAGTGTCGAATCCTTATCCAA...,"(390, 0)","(349, 149)","(None, None)",22,orbit_tf_del_FL_short
26133,ttaatcttaggccccactggtAGCCATGCACCGTAGACCAGATAAG...,"(390, 0)","(349, 149)","(None, None)",22,orbit_tf_del_FL_short
26134,ttaatcttaggccccactggtGGTTATTTAACGGCGCGAGTGTAAT...,"(390, 0)","(349, 149)","(None, None)",22,orbit_tf_del_FL_short
26135,ttaatcttaggccccactggtGGTAAAGTAAGGACATTCTTAACCC...,"(390, 0)","(349, 149)","(None, None)",22,orbit_tf_del_FL_short


In [9]:
# Print the QC report for the pool; the listed restriction enzymes are the
# ones scanned for in each construct group.
check_pool_df(df, ["SacI", "SalI", "XhoI", "SbfI", "ApaI"])

Primer check passed.

Oligo length check passed.

-------------------
Constructs: ['lacUV5+O1_mutant']

Forward Primers: [100].

Reverse Primers: [100 200].

Uniqueness check passed.

Restriction enzyme sites:
XhoI :  1.0
SalI :  1.0
SacI :  1.0
ApaI :  0.0
SbfI :  0.0
----------------------------------------------------------
Constructs: ['lacUV5+O1']

Forward Primers: [100].

Reverse Primers: [100 201].

Uniqueness check DID NOT pass.
4 duplicated sequences. Check manually.

Restriction enzyme sites:
ApaI :  0.0
SbfI :  0.0
XhoI :  0.0
SalI :  0.0
SacI :  0.0
----------------------------------------------------------
Constructs: ['lacUV5+O2']

Forward Primers: [100].

Reverse Primers: [100 202].

Uniqueness check DID NOT pass.
4 duplicated sequences. Check manually.

Restriction enzyme sites:
ApaI :  0.0
SbfI :  0.0
XhoI :  0.0
SalI :  0.0
SacI :  0.0
----------------------------------------------------------
Constructs: ['lacUV5+O3']

Forward Primers: [100].

Reverse Primers: [100 2

In [12]:
# Keep only the sequence and its construct label for the order table.
df_to_file = df.loc[:, ['seq', 'construct']]
df_to_file

Unnamed: 0,seq,construct
0,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,lacUV5+O1_mutant
1,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,lacUV5+O1_mutant
2,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,lacUV5+O1_mutant
3,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,lacUV5+O1_mutant
4,GCTTATTCGTGCCGTGTTATTCGAGTTTACACTTTATGCTTCCGGC...,lacUV5+O1_mutant
...,...,...
26131,ttaatcttaggccccactggtTAAGGGCATCTGTTTTTTATATTCA...,orbit_tf_del_FL_short
26132,ttaatcttaggccccactggtATATGAGTGTCGAATCCTTATCCAA...,orbit_tf_del_FL_short
26133,ttaatcttaggccccactggtAGCCATGCACCGTAGACCAGATAAG...,orbit_tf_del_FL_short
26134,ttaatcttaggccccactggtGGTTATTTAACGGCGCGAGTGTAAT...,orbit_tf_del_FL_short
