# Generate mutated sequences

In [1]:
import numpy as np
import pandas as pd


# NOTE(review): `Seq` is bound twice in this cell -- first to the class imported
# from Bio.Seq, then to the Bio.Seq module itself by `import Bio.Seq as Seq`.
# After the cell runs, `Seq` refers to the module. No cell visible in this
# notebook uses `Seq`, so the clash is currently harmless, but one of the two
# imports is probably unintended.
from Bio.Seq import Seq
import Bio.SeqIO
import Bio.Seq as Seq
import RegSeq

# import sortseq.utils as utils
import mpathic.utils as utils
# Fetch the pair of dictionaries mpathic associates with the 'dna' alphabet.
# Presumably a letter -> index mapping and its inverse -- TODO confirm against
# mpathic.utils.choose_dict. Neither name is used in the cells visible here.
seq_dict,inv_dict = utils.choose_dict('dna')


We load the sequences we generated in the `create_gene_seqs.ipynb` notebook.

In [11]:
# Read the wild-type sequence table and, in the same expression, discard every
# row that contains a missing value. The bare name on the last line displays it.
df = pd.read_csv('../data/test_data/wtsequences.csv').dropna()
df

Unnamed: 0.1,Unnamed: 0,name,start_site,rev,geneseq,ssdiff,offset
0,0,fdoH,4085867.0,rev,CATTATGGTATTCTGTTACAAACCCTTCCTGGATGGAGGGAAATTGAGCCAATTCTGGACCTTTGCGGCCCCTTCCGCAAAGAAAAATAACTCCCACTCCCTGCACACGCAGCAAGCGAATGTAAATGGGACGTGACAATGTCGAAACAAGGAGCAATCC,0.0,0.0
1,3,sdaB,2928035.0,fwd,TACATATATTGCGCGCCCCGGAAGAAGTCAGATGTCGTTTAATGGGCAAATATTGCCCTTAAATTCTCTTTTACTTTTGATTTACAGAGTAAAGCGTTGGGATAATCTATCTTCCAAGTAGATTATTGTATTTGAGATCAAGATCACTGATAGATACATA,0.0,0.0
2,6,thiM,2185451.0,rev,TCTGGATGTCGTTCTGAAGGTGCTGGATTCATATATCAAATAATTTATTAACGCGATTGTAAAACTGCCGTTTTTCCTCGTTTACAACGCGTGCGCTGGACATTACCATCCTCCTCTGCGATTTATCATCGCAACCAAACGACTCGGGGTGCCCTTCTGC,0.0,0.0
3,7,yedJ,2033449.0,rev,TTTTTCCTGTATTCACTGCCGTTGCGCAAAATTTATCTATTTGTTCAAAAAATGATTGAGTCTTGACTGGCTCATCCAATGTGGAAAAATGTGACTTTTATCACATAATAGTACTAAGTCTGAATTTTCCGGGTTATCTCAAAATGGAATACGGTTCGAC,0.0,0.0
4,9,ykgE,321511.0,fwd,TCGATTTCCCCATAAAATGTGAGCGATGCCGAAAGAAATAAAATTAGTTATTGCATTTGACGTTTGGATGAAAGATTATCATTTGTCATACAAATGAGGGGTGGTATGTTGCTAGCCAATTAAAAAAGAACGCCATATTTATTGATGATTGATCCCCCGG,0.0,0.0
...,...,...,...,...,...,...,...
102,27,znuA,1942661.0,rev,TTGGCCCAAGTAAAGTCAAAATTTTTCCAGGTTTAAGTTCCAGCGACACATCAGAGAGGACGCGGCGTTGGCCAAAAGAAACCGAGACATTTTCCAGGGAAACCAGACTTGTCATGTTAATTTTAGTCTTGCAGTAGTCATGAAATGTTATAATATCACA,0.0,0.0
103,28,zupT,3182433.0,fwd,TGCCAGCTGCGGGTATACAAATTATCTTCCAGCACGTTCATCGGACTACCGTGACCTAAAAACAATGCTGGCATACGTGTTGAAGACATGATGATATCCTTAACTAAAGGTGTCATTTTGATATCCTCACAATACGCTTGTTCGGCGGAGTAAGAACCCG,0.0,0.0
104,29,pitA,3637612.0,fwd,TGCCTGAATTATATAAGATAATTATTTTTTGAGTGAAATCCATACAGGGGGCAAATCAAAAAAAGTCTATATTTCACTTTGCCCGCGCCGCGAAAGTCACTGATAATGCGCCGCGTTCATGTCCTCAAAATGGCGTAACGTCCTATGCTACATTTGTTTG,0.0,0.0
105,30,ecnB,4376509.0,fwd,GAAGACATCAAACATCTCGGCAACTCCATCTCTCGCGCTGCCAGCTAATTTTTCTTCTCTTCCGAAAAATCATCAGATTCCCATCATTTTTGGCGATGTTGTCTATTATTAATTTGCTATAGGCAAACATAAATAACATTACCTAAAAGGAAGACGTTAT,0.0,0.0


Load in primers provided to us by the Kosuri group.

In [13]:
# Open the forward and reverse primer FASTA files in one paired assignment.
# NOTE(review): Bio.SeqIO.parse presumably returns single-pass iterators, so
# this cell would have to be re-run before the primers are consumed a second
# time -- confirm against the Biopython documentation.
kosprimefwd, kosprimerev = (
    Bio.SeqIO.parse('../data/primers/forward_finalprimers.fasta', 'fasta'),
    Bio.SeqIO.parse('../data/primers/reverse_finalprimers.fasta', 'fasta'),
)

Extract the 19 primers from the list.

In [14]:
# Hand both primer parsers and the bounds 100 and 118 to RegSeq's helper, which
# returns the forward and reverse primer collections used further down.
# NOTE(review): the notebook text says 19 primers are extracted, which matches
# 100..118 counted inclusively -- confirm how get_primers interprets the bounds.
fwdprimes, revprimes = RegSeq.utils.get_primers(kosprimefwd, kosprimerev, 100, 118)

Determine total number of genes we are ordering.

In [15]:
# One row per gene remains after the NaN filter, so the row count of the
# wild-type table is the number of genes being ordered.
ngenes = df.shape[0]
ngenes

103

Determine the number of mutated sequences to order per gene.

In [16]:
# Total size of the order, shared evenly between genes with floor division;
# one slot per gene is then held back.
# NOTE(review): the held-back slot is presumably reserved for the unmutated
# wild-type sequence -- confirm against RegSeq.utils.mutation_sequences.
norder = 150000
nseqs = norder // ngenes - 1
nseqs

1455

Now we do mutations and create an output data frame with each mutant sequence and associated gene.

In [17]:
# Build the table of mutated sequences from the wild-type frame, the forward
# and reverse primers, and the per-gene sequence count computed above.
# NOTE(review): mutation is presumably random and no seed is set in any visible
# cell, so the output may differ between runs -- confirm and seed if needed.
allseqs = RegSeq.utils.mutation_sequences(df, fwdprimes, revprimes, nseqs)

Let's store the result in a whitespace-delimited text file.

In [18]:
# Raise pandas' column-width cap, presumably so that to_string writes every
# sequence in full rather than a truncated form. The option is addressed by its
# full key instead of relying on short-hand pattern matching.
pd.set_option('display.max_colwidth', int(1e8))

# Write through a context manager so the handle is flushed and closed before
# the file is read back. The previous version passed a bare open() whose handle
# was never closed, which could leave buffered rows unwritten.
with open('mutatedseqs_test', 'w') as outfile:
    allseqs.to_string(
        outfile, index=False, col_space=10, float_format=utils.format_string)

Now we can evaluate the resulting sequences to check for a good mutation rate.

In [19]:
# Read the whitespace-delimited table back from disk. sep=r'\s+' is the
# documented equivalent of delim_whitespace=True, which recent pandas releases
# deprecate, and pd.read_csv is the public entry point for the same reader.
allseqs = pd.read_csv('mutatedseqs_test', sep=r'\s+')

# Check the mutated sequences against the wild-type table. Judging by the
# recorded output, this prints one mutation rate per gene and a
# "Bad mutation rate" message for genes that fail the check.
# NOTE(review): the meaning of buffer=10 is not visible here -- confirm against
# RegSeq.utils.check_mutation_rate.
RegSeq.utils.check_mutation_rate(df, allseqs, buffer=10)

0.019230769230769232
0.019230769230769232
0.019917582417582416
0.018543956043956044
Bad mutation rate for gene yicI.
0.019917582417582416


KeyboardInterrupt: 