In [26]:
# Primer3 primer generator.
# This program takes a list of genes and uses primer3 to generate a list of
# primers for each gene.

# Example of one complete entry in the input to primer3 (reference only; this
# bare string is never assigned or used):
'''
SEQUENCE_ID=example
SEQUENCE_TEMPLATE=GTAGTCAGTAGACNATGACNACTGACGATGCAGACNACACACACACACACAGCACACAGGTATTAGTGGGCCATTCGATCCCGACCCAAATCGATAGCTACGATGACG
SEQUENCE_TARGET=37,21
PRIMER_TASK=generic
PRIMER_PICK_LEFT_PRIMER=1
PRIMER_PICK_INTERNAL_OLIGO=1
PRIMER_PICK_RIGHT_PRIMER=1
PRIMER_OPT_SIZE=18
PRIMER_MIN_SIZE=15
PRIMER_MAX_SIZE=21
PRIMER_MAX_END_GC=2
PRIMER_MAX_GC=50
PRIMER_SALT_DIVALENT=2
PRIMER_SALT_MONOVALENT=100
PRIMER_MAX_NS_ACCEPTED=1
PRIMER_PRODUCT_SIZE_RANGE=75-100
P3_FILE_FLAG=1
SEQUENCE_INTERNAL_EXCLUDED_REGION=37,21
PRIMER_EXPLAIN_FLAG=1
=
'''

# Number of primers to pick; interpolated into PRIMER_NUM_RETURN below.
numpmrs = 100

# Global primer3 tags shared by every record, one TAG=VALUE per line. The final
# "=" line terminates the record; there is deliberately no newline after it.
settings = "\n".join((
    "PRIMER_TASK=generic",
    "PRIMER_PICK_LEFT_PRIMER=1",
    "PRIMER_PICK_INTERNAL_OLIGO=0",
    "PRIMER_PICK_RIGHT_PRIMER=1",
    "PRIMER_OPT_TM=60",
    "PRIMER_MIN_TM=55",
    "PRIMER_MAX_TM=63",
    "PRIMER_OPT_SIZE=20",
    "PRIMER_MIN_SIZE=18",
    "PRIMER_MAX_SIZE=25",
    "PRIMER_MAX_END_GC=2",
    "PRIMER_MAX_GC=50",
    "PRIMER_SALT_DIVALENT=2",
    "PRIMER_SALT_MONOVALENT=100",
    "PRIMER_PRODUCT_SIZE_RANGE=400-500",
    f"PRIMER_NUM_RETURN={numpmrs}",
    "PRIMER_EXPLAIN_FLAG=0",
    "P3_FILE_FLAG=0",
    "=",
))

# Command used to launch primer3; the leading "~" relies on shell expansion.
pmr3 = "~/primer3/src/primer3_core"


In [44]:
import os

# Project working directory. Defaults to the original location; set the
# SPIKEIN_PROJECT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
PROJECT_DIR = os.environ.get(
    "SPIKEIN_PROJECT_DIR",
    "/home/flan3@ad.wisc.edu/MplexPrimerPicker/projects/spikein",
)

# Later cells read and write files by relative name (targets.fa, Primers.txt,
# Primers.pickle), so the working directory is switched once here.
os.chdir(PROJECT_DIR)
os.listdir()  # last expression: shows the directory contents as the cell output


['targets.fa']

In [45]:
# Time to import the target genes.
from Bio import SeqIO

# Load the TARGETs to get primers for (FASTA format).
targets = list(SeqIO.parse("targets.fa", "fasta"))

# OPTIONAL: screen out the targets we don't want. A record is dropped when its
# name contains any of the substrings listed in `remove`.
# remove = ["tet(C)", "tet(W/N/W)", "tetA(P)", "tetB(P)"]
remove = []

keeplist = [
    record
    for record in targets
    if not any(pattern in record.name for pattern in remove)
]

keeplist
           

[SeqRecord(seq=Seq('TAAGATTGGTTGTCCTGTTATTGACGTTTCAGATAAAGCCATTGAAGAAACAGC...TTT', SingleLetterAlphabet()), id='Sepiderm-dnaG', name='Sepiderm-dnaG', description='Sepiderm-dnaG', dbxrefs=[]),
 SeqRecord(seq=Seq('GGCTTCATCGGTGTCGGCTTCCGCCAACAGAAGGGCATCCGCATCCGGAGCACT...GAA', SingleLetterAlphabet()), id='Pputida-dnaG', name='Pputida-dnaG', description='Pputida-dnaG', dbxrefs=[])]

In [46]:
# Build one BOULDER-IO entry per kept gene; this is the format primer3 needs.
# Each entry is the per-gene ID and template followed by the shared settings.
inputs = [
    f"SEQUENCE_ID={target.name}\n"
    # "-" characters are removed because primer3 can't handle them.
    f"SEQUENCE_TEMPLATE={str(target.seq).replace('-', '').strip()}\n"
    f"{settings}"
    for target in keeplist
]

# Show how many entries were built, then each entry in full.
print(len(inputs))
for boulder_entry in inputs:
    print(boulder_entry)

2
SEQUENCE_ID=Sepiderm-dnaG
SEQUENCE_TEMPLATE=TAAGATTGGTTGTCCTGTTATTGACGTTTCAGATAAAGCCATTGAAGAAACAGCAAATGATATTATTCATTTTATCGAACAAAATAAATCGAAATGATTTCTTTTCCAATAAATTGAAGGTATAATAATAGAACTTTAATAAGATTGCTACCATAGAAATAAATGTATTTTATAGGTGGTATTTAATGTTTGAAATATATTCCTTCATTCTTTTAGCAACCACATAAAAATAAACAATTAGCAATTATGAATATTGATAGGTGATGTCTTTTGCGTATAGATCAATCCGTCATTGATGAAATAAAAAATAAAACTGATATATTAGATTTAGTTAGTGAATATGTAAAACTTGAAAAAAGAGGACGCAATTATATCGGTTTGTGTCCTTTTCATGATGAAAAAACACCCTCATTTACAGTTTCAGAAGATAAACAAATTTGTCATTGTTTTGGATGTAAAAAAGGTGGTAATGTTTTTCAATTTACGCAAGAAATTAAAGACGTATCATTTGTTGAAGCTGTAAAGGACTTGGGTGAACGAGTTAATATTCAAGTTGATATCGGGCAGAACCAAACAAATTCCTCGACAAAAATTGCGTCTGATGAGTTGAAAATGATTGAGATGCATGAACTCATTAAAGATTATTATCATTATGCTTTAATGAAAACAGTTGAAGGTGAGGAAGCCCTAAATTATTTACATGAACGCGGCTTTACGGATGACCTTATTAAAGAAAGAGAAATTGGATATGCACCTGATAACTCACATTTTTGTCATGATTTTCTTGAAAAAAAAGGATATGATATAGAACTAGCATTTGAAGCGGGTTTGTTATCTCGTAATGAAGAGAATTTCACATATTTTGATAGATTTAGAAATAGAATTATGTTTCCATTAAAGAATGGACAAGGACGAATTGTTGGGTATTCAGGACGGACATATACTGATCAAGAA

In [47]:
# Run primer3 once per TARGET and parse its BOULDER-IO output.
import subprocess


def _parse_boulder_record(lines):
    """Convert the lines of one primer3 BOULDER-IO record into a {tag: value} dict.

    Each data line has the form TAG=VALUE. The line is split on the first "="
    only, so a value that itself contains "=" is kept whole. Blank lines and the
    lone "=" record terminator carry no tag and are skipped, which replaces the
    earlier assumption that exactly the last two lines are filler.
    """
    parsed = {}
    for line in lines:
        tag, separator, value = line.partition("=")
        if separator and tag:
            parsed[tag] = value
    return parsed


outs = []
for boulder_input in inputs:
    # shell=True is kept on purpose: pmr3 starts with "~", which only the shell
    # expands. pmr3 is a notebook constant, not untrusted input.
    completed = subprocess.run(
        pmr3, input=boulder_input, capture_output=True, shell=True, text=True
    )
    # Surface failures (missing binary, rejected input, ...) instead of silently
    # carrying an empty record into the parsing step.
    if completed.returncode != 0 or not completed.stdout.strip():
        print(f"primer3 exited with code {completed.returncode}")
        print(completed.stderr)
    outs.append(completed.stdout)

outs = [o.split("\n") for o in outs]

# keep track of errors in the output (reported once per record)
for o in outs:
    if any("ERROR" in r for r in o):
        print("error detected in output!")
        print(o)

# parse boulder outputs: one dict per primer3 record, in the same order as inputs
parsedouts = [_parse_boulder_record(record) for record in outs]
parsedouts[0]

{'SEQUENCE_ID': 'Sepiderm-dnaG',
 'SEQUENCE_TEMPLATE': 'TAAGATTGGTTGTCCTGTTATTGACGTTTCAGATAAAGCCATTGAAGAAACAGCAAATGATATTATTCATTTTATCGAACAAAATAAATCGAAATGATTTCTTTTCCAATAAATTGAAGGTATAATAATAGAACTTTAATAAGATTGCTACCATAGAAATAAATGTATTTTATAGGTGGTATTTAATGTTTGAAATATATTCCTTCATTCTTTTAGCAACCACATAAAAATAAACAATTAGCAATTATGAATATTGATAGGTGATGTCTTTTGCGTATAGATCAATCCGTCATTGATGAAATAAAAAATAAAACTGATATATTAGATTTAGTTAGTGAATATGTAAAACTTGAAAAAAGAGGACGCAATTATATCGGTTTGTGTCCTTTTCATGATGAAAAAACACCCTCATTTACAGTTTCAGAAGATAAACAAATTTGTCATTGTTTTGGATGTAAAAAAGGTGGTAATGTTTTTCAATTTACGCAAGAAATTAAAGACGTATCATTTGTTGAAGCTGTAAAGGACTTGGGTGAACGAGTTAATATTCAAGTTGATATCGGGCAGAACCAAACAAATTCCTCGACAAAAATTGCGTCTGATGAGTTGAAAATGATTGAGATGCATGAACTCATTAAAGATTATTATCATTATGCTTTAATGAAAACAGTTGAAGGTGAGGAAGCCCTAAATTATTTACATGAACGCGGCTTTACGGATGACCTTATTAAAGAAAGAGAAATTGGATATGCACCTGATAACTCACATTTTTGTCATGATTTTCTTGAAAAAAAAGGATATGATATAGAACTAGCATTTGAAGCGGGTTTGTTATCTCGTAATGAAGAGAATTTCACATATTTTGATAGATTTAGAAATAGAATTATGTTTCCATTAAAGAATGGACAAGGACGAATTGTTGGGTATTCAGGACGGACATATAC

In [48]:
#extract primer sequences from each parsed output

#these classes help to keep track of primers and genes from the primer3 output
class gene:
    """Simple container that stores a gene name together with its primers."""

    def __init__(self, name, primers):
        # Both values are stored exactly as passed in.
        self.name, self.primers = name, primers
    
class primer_pair:
    """One forward/reverse primer pair belonging to a gene.

    Attributes set by the constructor:
        id          -- "<name>-<count>"
        gene        -- the `name` argument
        fseq, rseq  -- forward and reverse primer sequences
        ftm, rtm    -- Tm values for the forward and reverse primer, stored as given
        ampliconlen -- the `amplen` argument, stored as given
    """

    def __init__(self, fseq, rseq, name, count, ftm, rtm, amplen):
        # Attributes are created in the same order as before, so the instance
        # __dict__ layout is unchanged.
        self.id, self.gene = f"{name}-{count}", name
        self.fseq, self.rseq = fseq, rseq
        self.ftm, self.rtm = ftm, rtm
        self.ampliconlen = amplen
            
# Build {gene name: [primer_pair, ...]} from the parsed primer3 records.
# parsedouts[i] is the primer3 result for keeplist[i] / inputs[i].
genelist = {}
for i in range(len(inputs)):
    name = keeplist[i].name
    record = parsedouts[i]
    pmrpairs = []
    for j in range(numpmrs):
        # numpmrs is only the maximum requested via PRIMER_NUM_RETURN; primer3
        # can return fewer pairs. Stop at the first missing index instead of
        # raising KeyError.
        if f"PRIMER_LEFT_{j}_SEQUENCE" not in record:
            break
        # Values are kept exactly as parsed (strings), including Tm and size.
        fseq = record[f"PRIMER_LEFT_{j}_SEQUENCE"]
        rseq = record[f"PRIMER_RIGHT_{j}_SEQUENCE"]
        ftm = record[f"PRIMER_LEFT_{j}_TM"]
        rtm = record[f"PRIMER_RIGHT_{j}_TM"]
        amplen = record[f"PRIMER_PAIR_{j}_PRODUCT_SIZE"]
        pmrpairs.append(primer_pair(fseq, rseq, name, j, ftm, rtm, amplen))
    if len(pmrpairs) < numpmrs:
        print(f"warning: {name}: primer3 returned {len(pmrpairs)} of {numpmrs} requested primer pairs")
    genelist[name] = pmrpairs


In [41]:

# Add the P5 and P7 adaptors to genelist as their own entry.
# NOTE(review): this must run after the cell that rebuilds genelist
# (genelist = {}); otherwise the entries added here are discarded.
P5 = "CTGCGTGTCTCCGACTCAGACT"  # 3' 22  bases of the P5-ionB-I5-1 oligo
P7 = "CAAGCAGAAGACGGCATACGAGAT"
p5p7 = primer_pair(fseq=P5, rseq=P7, name="P5/P7", count=0, ftm=0, rtm=0, amplen=0)
genelist["P5/P7"] = [p5p7]


# OPTIONAL: add the 16S primers to the list as well.
ionB16Sf = "cctacgggaggcagcagt"
ionA16Sr = "ggactaccagggtatctaatcctgt"  # truncated to 60bp
fseq, rseq = ionB16Sf, ionA16Sr
rRNApmr = primer_pair(fseq=fseq, rseq=rseq, name="16S", count=0, ftm=0, rtm=0, amplen=500)
genelist["16S"] = [rRNApmr]


'\n#Add adaptors to the primers\nIonA = "CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT"\nIonB = "CCATCTCATCCCTGCGTGTCTCCGACTCAGACT"\n\nfor g in genelist:\n    for p in genelist[g]:\n        p.fseq = IonA + p.fseq\n        p.rseq = IonB + p.rseq\n        #if len(p.fseq) > 60: #ntthal can\'t take longer than 60bp long\n        #    p.fseq = p.fseq[len(p.fseq)-60:] #so we truncate from 5\'            \n        #if len(p.rseq) > 60: #ntthal can\'t take longer than 60bp long\n        #    p.rseq = p.rseq[len(p.rseq)-60:] #so we truncate from 5\'\n\n'

In [49]:
# Write the primers out as FASTA for primer-dimer analysis.
# Header format is genename-count#-f/r, where count restarts at 0 for each gene.
with open("Primers.txt", "w") as fasta_out:
    for pairs in genelist.values():
        for idx, pair in enumerate(pairs):
            fasta_out.writelines([
                f">{pair.gene}-{idx}-f\n",
                f"{pair.fseq}\n",
                f">{pair.gene}-{idx}-r\n",
                f"{pair.rseq}\n",
            ])

# Export the whole genelist dict in pickle format for the next script.
import pickle
with open("Primers.pickle", "wb") as pickle_out:
    pickle.dump(genelist, pickle_out)