In [1]:
from __future__ import division
import numpy as np
import pandas as pd

import mpathic.utils as utils
from Bio.Seq import Seq
import Bio.SeqIO
import Bio.Seq as Seq
import mpathic.simulate_library as simulate_library
import mpathic.profile_mut as profile_mut
import mpathic.profile_freq as profile_freq
seq_dict,inv_dict = utils.choose_dict('dna')


ModuleNotFoundError: No module named 'mpathic'

In [3]:
def mut_seq_random(s,fwdprimer,revprimer):
    '''mutate sequence at 10 percent rate and add primers to each side of the sequence.'''
    
    #need to convert wild type sequence to list for input. 
    s_temp = list(s)
    #mutate interal 140 bp.
    firstbase = 10
    finalbase = 150
    #use mpathic function to generate randomly mutated library
    seqs_df = simulate_library.main(wtseq=s[firstbase:finalbase],numseq=nseqs)
    #We don't want duplicates, so we'll only take independent sequences.
    seqs_df = utils.collapse_further(seqs_df)
    seqs = list(seqs_df['seq'])
    #make sure we have wild type sequence in the list
    outseqs = [fwdprimer + s + revprimer]  + [fwdprimer + s[:firstbase] + str(x) + s[finalbase:] + revprimer  for x in seqs]
    outdf = pd.DataFrame()
    outdf['seq'] = outseqs
    return outdf

In [5]:
df = pd.io.parsers.read_csv('../data/test_data/wtsequences.csv')

In [6]:
#load in primers provided to us by the kosuri group.
kosprimefwd = Bio.SeqIO.parse('../data/primers/forward_finalprimers.fasta','fasta')
kosprimerev = Bio.SeqIO.parse('../data/primers/reverse_finalprimers.fasta','fasta')

In [7]:
#we want 19 primer pairs, we will take 100 to 119 in the associated files.
fwdprimes = []
for i,record in enumerate(kosprimefwd):
    if i >= 100 and i < 120:
        prime = str(record.seq)
        fwdprimes.append(prime)

In [8]:
revprimes = []
for i,record in enumerate(kosprimerev):
    if i >= 100 and i < 120:
        prime = str(record.seq.reverse_complement())
        revprimes.append(prime)

In [11]:
#determine total number of genes we are ordering.
ngenes = len(df.index)
ngenes

107

In [12]:
#determine number of sequences to order
norder = 150000
nseqs = int(np.floor(norder/ngenes)) - 1
nseqs

1400

In [13]:
#now we do mutations and create an output data frame with each mutant sequence
#and associated gene.
allseqs = pd.DataFrame()
for i,row in df.iterrows():
    #we group the genes into 18 groups with different primers, we need to establish
    #which group we are in.
    primernum = int(np.floor(i/6))
    #get fwd and rev primer.
    thefwdprimer = fwdprimes[primernum]
    therevprimer = revprimes[primernum]
    #mutate the sequence
    tempdf =  mut_seq_random(row['geneseq'],thefwdprimer,therevprimer)
    #set which gene the group of sequences comes from.
    tempdf['gene'] = row['name']
    #we will build up the final dataframe of mutated sequences from the individual
    #dataframes of sequences. 
    allseqs = pd.concat([allseqs,tempdf],axis=0)

In [14]:
#save the output.
pd.set_option('max_colwidth',int(1e8))
allseqs.to_string(
        open('mutatedseqs_test','w'), index=False,col_space=10,float_format=utils.format_string)

In [15]:
allseqs

Unnamed: 0,seq,gene
0,GCTTATTCGTGCCGTGTTATCATTATGGTATTCTGTTACAAACCCTTCCTGGATGGAGGGAAATTGAGCCAATTCTGGACCTTTGCGGCCCCTTCCGCAAAGAAAAATAACTCCCACTCCCTGCACACGCAGCAAGCGAATGTAAATGGGACGTGACAATGTCGAAACAAGGAGCAATCCGGGCACAGCAATCAAAAGTA,fdoH
1,GCTTATTCGTGCCGTGTTATCATTATGGTAACCTGCTTCGAACCTTTCCTGGATGGAGGGGAATTGAGCCGATTCTGGACCTTCGCGGCCCCTTCCGCAAAGACAAATAATTCCCACTCCCTGCACACGCAGCAAGGGAATGCAAATTGGACTTGACAATTTTGAAACAAGGAGCAATCCGGGCACAGCAATCAAAAGTA,fdoH
2,GCTTATTCGTGCCGTGTTATCATTATGGTAACCTGTTACAAAAACTTCCTGGATGGAGGGAAATTGAGCCAAGTCTGGACCTTTGCGGCCCCTTGCGCGAAGAAAAAGAACTCCCACTCCCTGCACACGCAGCAAGCGAATGTATATGGGACGTGACAGTGTTGAAACAAGGAGCAATCCGGGCACAGCAATCAAAAGTA,fdoH
3,GCTTATTCGTGCCGTGTTATCATTATGGTAACCTGTTACAAACCCTTCCTGGATGGAGGGAAATTGAGCCAATTCTGGACCTTTGCGGCCCCTTCCGCAAAGAAAAATAACTCCCGCTACCTGCACACGCAGCAAGCGAATGTAAATGGGACATGACAATGTCGAAGGAAGGAGCAATCCGGGCACAGCAATCAAAAGTA,fdoH
4,GCTTATTCGTGCCGTGTTATCATTATGGTAACCTGTTACAGACCCTTCCTGAATGGATCGTAAATTAGGCATTTCTGGACCTCTGCGCCCCCTTTCGCAAAGAAAAATAACTCCTACTCCCCGCATACGCAGACCTCCAATGTAAATGGGACGGGACAATGTCGAAACAAGGAGCAATCCGGGCACAGCAATCAAAAGTA,fdoH
...,...,...
1396,TCTTAGAAATCCACGGGTCCACTTTAACCATCGCTGTACAACTAGCTAATTTTACGGATGCAGAAGTAACGCTGGCGGGCCATTTTTTTAGCGTCAGAGTTGAGATCCGCTTTTGTATCCAGTAACACTCAAAGCATACCGCATTCCTCTGGAGCTGATTTAATGACTCACATCGTTCGCGGCACCAGGTACATATCTCA,leuABCD
1397,TCTTAGAAATCCACGGGTCCACTTTAACCATCTTAGCCCAATTAGCTAATTTTACGTAAGCAGAACTCACGCTGGCGGGACATTTTTATCGCGTCAAGATTGACCCCCGGTTTTGTCTCCAGAAACTTTAAAACCATAGCGCATTCATCTGAAGCTGATTTAATGACGCCCATCGTTCGCGGCACCAGGTACATATCTCA,leuABCD
1398,TCTTAGAAATCCACGGGTCCACTTTAACCATGGCAGCACAATTAATTGATTTTACGGAAGCAGAACTCACGCTGGCGGGACGTTTTTATAGCGTCAGGGTTGACATCCGTTTTTATATCCAGTAACTCTAAATGCATTTCGCATTCATCTGGATCTGATCCAAAGACTGACATCGTTCGCGGCACCAGGTACATATCTCA,leuABCD
1399,TCTTAGAAATCCACGGGTCCACTTTAACCATGGCCGCACAAATAGCTAATTTTACGCATGCAGAACTCACGCTGGTAGGACGGTTTTATTGCGTCAGGGTTGGCATCCGTTTTTGTATCCCGTAACTCTAAAAGTATATCGCAGTCACCTGGAACTTATTTAATGACTGACATCGTTCGCGGCACCAGGTACATATCTCA,leuABCD


now we can evaluate the resulting sequences to check for a good mutation rate.

In [16]:
allseqs = pd.io.parsers.read_csv('mutatedseqs_test',delim_whitespace=True)

In [17]:
#now lets check each gene.
buffer = 10
for i,row in df.iterrows():
    gene = row['name']
    #dnaE gene is bugged out right now, just skip it for the moment.
    if gene == 'dnaE':
        continue
    #only look at the part of the mutated genes df that are from the target gene.
    partialdf = allseqs.loc[allseqs['gene'] == gene,['seq']]
    
    #for the purposes of the evalution we will asign a test count of 1 for each
    #sequence, this needs to be formatted that way to use the mpathic functions.
    partialdf['ct'] = 1
    partialdf = partialdf[['ct','seq']]
    partialdf = partialdf.reset_index(drop=True)
    
    #we can now check mutation rate of each sequence.
    mut = profile_mut.main(partialdf)
    
    #look at mutation rate of non-primer and non-wildtype bases (we have a buffer of
    #wildtype bases of 10 on both sides.)
    relevantmuts = mut.loc[buffer + 20:179-buffer*2,'mut']
    
    #now we check to be sure that each position has a mutation rate within target
    #parameters
    if (relevantmuts > .14).any() or (relevantmuts < .07).any():
        print('bad mutation rate!')
    #do some checking to make sure we don't have repeat sequences.
    if len(partialdf['seq'].index) != len(set(partialdf['seq'])):
        print('repeat')
    #now we can look at the frequency of each base in the sequence range
    freqmat = profile_freq.main(partialdf)
    relevantfreq = freqmat.loc[20 + buffer:179-buffer*2,:]
    #now we will check to make sure that each of the individual bases don't have
    #abnormally low mutation rates.
    freqmin = relevantfreq[['freq_A','freq_C','freq_G','freq_T']].min(axis=1)
    relevantfreqmat = np.array(relevantfreq[['freq_A','freq_C','freq_G','freq_T']])
    if (freqmin < .02).any():
        print(freqmin.min())
    

0.019271948608137045
0.014989293361884369
0.017130620985010708
0.019271948608137045
0.017844396859386154
0.01998572448251249
0.01998572448251249
0.01998572448251249
0.018558172733761598


KeyboardInterrupt: 