In [5]:
#Primer3 primer generator
#Takes a list of genes and uses primer3 to generate a list of primers for each gene.
#Requires a local installation of primer3.

#Modify the parameters below to suit your needs.

#Location of the primer3 program
pmr3 = "~/primer3/src/primer3_core"

#Number of primers to generate for each gene. More primers mean more running time
#for the second step (calculating deltaGs), but a higher probability of finding
#good primer sets.
numpmrs = 100

#Actual settings used in primer3 - these should not need to be changed for DoTA-seq primers.
#One "TAG=value" line per setting; the block ends with a lone "=" line (no trailing newline).
settings = "\n".join([
    "PRIMER_TASK=generic",
    "PRIMER_PICK_LEFT_PRIMER=1",
    "PRIMER_PICK_INTERNAL_OLIGO=0",
    "PRIMER_PICK_RIGHT_PRIMER=1",
    "PRIMER_OPT_TM=60",
    "PRIMER_MIN_TM=55",
    "PRIMER_MAX_TM=63",
    "PRIMER_OPT_SIZE=20",
    "PRIMER_MIN_SIZE=18",
    "PRIMER_MAX_SIZE=25",
    "PRIMER_MAX_END_GC=2",
    "PRIMER_MAX_GC=50",
    "PRIMER_SALT_DIVALENT=2",
    "PRIMER_SALT_MONOVALENT=100",
    "PRIMER_PRODUCT_SIZE_RANGE=400-500",
    f"PRIMER_NUM_RETURN={numpmrs}",  #number of results requested follows numpmrs
    "PRIMER_EXPLAIN_FLAG=0",
    "P3_FILE_FLAG=0",
    "=",
])



In [2]:
import os

#Move into the directory containing the list of genes to make primers for,
#then show its contents so the expected input file can be confirmed.
project_dir = "/home/flan3@ad.wisc.edu/MplexPrimerPicker/projects/zymofecal"
os.chdir(project_dir)
os.listdir()


['targets.fa',
 'Primers.txt',
 'Primers.pickle',
 'adjdict-noadaptors.pickle',
 'top10-sets.csv',
 'top-10-sets.pickle']

In [3]:
#time to import the target genes
from Bio import SeqIO

#the TARGETs to get primers for are read from a fasta-format file
filename = "targets.fa" #name of file containing target genes in fasta format
targets = list(SeqIO.parse(filename, "fasta"))

#OPTIONAL: remove genes from targets we don't want.
#A target is dropped when its name contains any of the strings listed in `remove`, e.g.
#remove = ["tet(C)", "tet(W/N/W)", "tetA(P)", "tetB(P)"]
remove = []
keeplist = [
    record
    for record in targets
    if not any(unwanted in record.name for unwanted in remove)
]

keeplist
           

[SeqRecord(seq=Seq('ATGTCCGATGAGATCGTGAAGTACTCGAACCAGTTCAACAATCAGGCGCTGCGC...TGA', SingleLetterAlphabet()), id='rep_cluster_1320', name='rep_cluster_1320', description='rep_cluster_1320', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGTAAATAGAGAAATAAAAGTTCGAAAACGTGCGGTTAAAGAAGAAAAAGAG...TAA', SingleLetterAlphabet()), id='rep_cluster_663', name='rep_cluster_663', description='rep_cluster_663', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGAATATCCCTTTTGTTGTAGAAACTGTGCTTCATGACGGCTTGTTAAAGTAC...TGC', SingleLetterAlphabet()), id='Inc18', name='Inc18', description='Inc18', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTCCAATGAGATCGTGAAGTTCAGCAACCAGTTCAACAACGTGGCACTGAAG...TAG', SingleLetterAlphabet()), id='rep_cluster_1351', name='rep_cluster_1351', description='rep_cluster_1351', dbxrefs=[]),
 SeqRecord(seq=Seq('GTGGAGAATAATATTTCCCAAAAACACATCAAAATGGAAAATAAAAAAGCAGTT...TGA', SingleLetterAlphabet()), id='rep_cluster_1868', name='rep_cluster_1868', description='rep_cluster_1868', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGAGCAAAGC

In [6]:
#make the BOULDER-IO entries for each gene, which is the format that primer3 uses as input
def _boulder_entry(record):
    """Return the primer3 input text for one target record: ID, template, then the shared settings."""
    #strip the "-" because primer3 can't handle it, then trim surrounding whitespace
    template = str(record.seq).replace("-", '').strip()
    return f"SEQUENCE_ID={record.name}\nSEQUENCE_TEMPLATE={template}\n{settings}"

inputs = [_boulder_entry(record) for record in keeplist]

print(f"{len(inputs)} genes to make primers for")
for boulder_text in inputs:
    print(boulder_text)

10 genes to make primers for
SEQUENCE_ID=rep_cluster_1320
SEQUENCE_TEMPLATE=ATGTCCGATGAGATCGTGAAGTACTCGAACCAGTTCAACAATCAGGCGCTGCGCAAGTTCACCGCGCTCGATCTTGATTTGCTGATGGCCATCGCGTCTCGCGTGCGGGACAAGGGCACGGACGAGGTGGCGTTCACGTTCGAAGAATTGAAGCGGCTGGCCGGACTTCAGAGGAACATGACCAACGATGAGTTCGCCAAGCAGATTGCCAATGTGAACCGTCGCCTGCTCGCGCTCAACTTCGAGTTCCAGAACGAAGAGCACGACATCATCCAGTTCGCCCTGTTCGCAGGCTTCGTCACCAGCCCGACCAAACGCACACTCACCGTATCGGTGAACTCGCGCTTCTCGTTCCTGCTTAACGACCTCACCTCGCAGTTCACCCGCTTCGAGCTGGCCGAGTTCACGGCCCTGCGCTCCAGCTACGCCAAGGAGTGCTACAGGAGGCTCAAGCAGTACCGACAAACCGGTGTGTGGAAGGTCAGCCTTGAGGACTTCCGCCGACTGCTCGACGTACCCAAATCGTATCGGCCGAGCGAAATCAACAAGTATGTTCTTAAGCCGATTGAGGAGGAGTTGGGACCTCTGCTGAATCTGAAGGTGCATCGCAAGTACCTGAAGAAGAAGCCGGGACGTGGCCGCGCTTCCCTTGTCGGGTTCGAATTTGAATTTGACCCGGAGAAGGTACCCGGCGGCGCTCCCGCTCCGCGCGTGGAGCTGTCCGGCTCCGTGGTGACCGATGAGGCGAGGAAGTCGCTCAGGGACGTGTCGAAGACGCCCGTGCCCGATCTGTCGGTGCCGGGCGAGGGACCGGCGCTCGACCCTGATACTCAGGCGTTCCTCGACGCGCACGGCGGTCATGGATTCGCCGGTGCCGAGGGGTATGTCTCCGACTGGCCGAGTGCTGCGTGA
PRIMER_TASK

In [7]:
#run primer3 once for each of these TARGETS
import subprocess

outs = []
for boulder_input in inputs:
    #shell=True lets the shell expand the "~" in the configured primer3 path
    completed = subprocess.run(f"{pmr3}", input = boulder_input, capture_output = True, shell = True, text = True)
    if completed.returncode != 0:
        #a failed run may leave stdout empty; report it instead of silently parsing nothing
        print(f"primer3 exited with status {completed.returncode}: {completed.stderr.strip()}")
    outs.append(completed.stdout)

#each record becomes a list of its output lines
outs = [o.split("\n") for o in outs]

#keep track of errors in the output (each affected record is reported once)
for o in outs:
    if any("ERROR" in line for line in o):
        print("error detected in output!")
        print(o)

#parse boulder outputs into one {tag: value} dict per record
parsedouts = []
for record in outs:
    parseddict = {}
    for entry in record:
        #split at the FIRST "=" only, so values that themselves contain "=" stay intact
        tag, separator, value = entry.partition("=")
        if not separator or not tag:
            #skips blank lines and the lone "=" line that ends a record
            continue
        parseddict[tag] = value
    parsedouts.append(parseddict)
parsedouts[0]

{'SEQUENCE_ID': 'rep_cluster_1320',
 'SEQUENCE_TEMPLATE': 'ATGTCCGATGAGATCGTGAAGTACTCGAACCAGTTCAACAATCAGGCGCTGCGCAAGTTCACCGCGCTCGATCTTGATTTGCTGATGGCCATCGCGTCTCGCGTGCGGGACAAGGGCACGGACGAGGTGGCGTTCACGTTCGAAGAATTGAAGCGGCTGGCCGGACTTCAGAGGAACATGACCAACGATGAGTTCGCCAAGCAGATTGCCAATGTGAACCGTCGCCTGCTCGCGCTCAACTTCGAGTTCCAGAACGAAGAGCACGACATCATCCAGTTCGCCCTGTTCGCAGGCTTCGTCACCAGCCCGACCAAACGCACACTCACCGTATCGGTGAACTCGCGCTTCTCGTTCCTGCTTAACGACCTCACCTCGCAGTTCACCCGCTTCGAGCTGGCCGAGTTCACGGCCCTGCGCTCCAGCTACGCCAAGGAGTGCTACAGGAGGCTCAAGCAGTACCGACAAACCGGTGTGTGGAAGGTCAGCCTTGAGGACTTCCGCCGACTGCTCGACGTACCCAAATCGTATCGGCCGAGCGAAATCAACAAGTATGTTCTTAAGCCGATTGAGGAGGAGTTGGGACCTCTGCTGAATCTGAAGGTGCATCGCAAGTACCTGAAGAAGAAGCCGGGACGTGGCCGCGCTTCCCTTGTCGGGTTCGAATTTGAATTTGACCCGGAGAAGGTACCCGGCGGCGCTCCCGCTCCGCGCGTGGAGCTGTCCGGCTCCGTGGTGACCGATGAGGCGAGGAAGTCGCTCAGGGACGTGTCGAAGACGCCCGTGCCCGATCTGTCGGTGCCGGGCGAGGGACCGGCGCTCGACCCTGATACTCAGGCGTTCCTCGACGCGCACGGCGGTCATGGATTCGCCGGTGCCGAGGGGTATGTCTCCGACTGGCCGAGTGCTGCGTGA',
 'PRIMER_TASK': 'generic',

In [8]:
#extract primer sequences from the output of primer3
#the classes below help to keep track of primers and genes from the primer3 output
class gene:
    """Simple record pairing a gene name with its primers.

    Attributes:
        name: the value passed as ``name``.
        primers: the value passed as ``primers`` (stored as-is, not copied).
    """

    def __init__(self, name, primers):
        self.name, self.primers = name, primers
    
class primer_pair:
    """One forward/reverse primer pair belonging to a gene.

    Attributes:
        id: "<name>-<count>", built from the gene name and the pair's index.
        gene: the gene name passed as ``name``.
        fseq, rseq: the two primer sequences, stored as given.
        ftm, rtm: the two Tm values, stored as given (no type conversion).
        ampliconlen: the value passed as ``amplen``, stored as given.
    """

    def __init__(self, fseq, rseq, name, count, ftm, rtm, amplen):
        #identity of the pair
        self.id, self.gene = f"{name}-{count}", name
        #primer sequences
        self.fseq, self.rseq = fseq, rseq
        #melting temperatures
        self.ftm, self.rtm = ftm, rtm
        #amplicon length
        self.ampliconlen = amplen
            
#collect the primer pairs reported for each gene: {gene name: [primer_pair, ...]}
genelist = {}
for target, record in zip(keeplist, parsedouts):
    name = target.name
    pmrpairs = []
    for j in range(numpmrs):
        try:
            #values are kept exactly as parsed from the primer3 output (strings)
            pair = primer_pair(
                record[f"PRIMER_LEFT_{j}_SEQUENCE"],
                record[f"PRIMER_RIGHT_{j}_SEQUENCE"],
                name,
                j,
                record[f"PRIMER_LEFT_{j}_TM"],
                record[f"PRIMER_RIGHT_{j}_TM"],
                record[f"PRIMER_PAIR_{j}_PRODUCT_SIZE"],
            )
        except KeyError:
            #fewer pairs than requested were reported for this gene; stop at the
            #first missing index instead of crashing the whole run
            break
        pmrpairs.append(pair)
    if len(pmrpairs) < numpmrs:
        print(f"warning: {name} has only {len(pmrpairs)} of the {numpmrs} requested primer pairs")
    genelist[name] = pmrpairs


In [9]:
#OPTIONAL: Add P5/P7 adaptor sequences to the primer list for calculating deltaG free energies
#These are not P5/P7 sequences but the 3' end of the DOTA-seq adaptor oligos
P5 = "CTGCGTGTCTCCGACTCAGACT" #3' 22  bases of the P5-ionB-I5-1 oligo
P7 = "CAAGCAGAAGACGGCATACGAGAT"
#registered as a single-pair "gene" named P5/P7, with Tm and amplicon length set to 0
p5p7 = primer_pair(P5, P7, "P5/P7", 0, 0, 0, 0)
genelist.update({"P5/P7": [p5p7]})


#OPTIONAL: add 16S primers to the list as well
ionB16Sf = "cctacgggaggcagcagt"
ionA16Sr = "ggactaccagggtatctaatcctgt" #truncated to 60bp
fseq, rseq = ionB16Sf, ionA16Sr
#registered as a single-pair "gene" named 16S, with Tm set to 0 and amplicon length 500
rRNApmr = primer_pair(fseq, rseq, "16S", 0, 0, 0, 500)
genelist.update({"16S": [rRNApmr]})


In [10]:
import pickle

#write out all generated primers as a fasta file
#header format is genename-count#-f/r, where count# is the pair's position within its gene
with open("Primers.txt", "w") as fasta_out:
    for pairs in genelist.values():
        for index, pair in enumerate(pairs):
            fasta_out.writelines((
                f">{pair.gene}-{index}-f\n",
                f"{pair.fseq}\n",
                f">{pair.gene}-{index}-r\n",
                f"{pair.rseq}\n",
            ))

#also export the data in pickle format for next script
#NOTE(review): the project directory listing shows "Primers.pickle" - confirm the next script reads "AllPrimers.pickle"
with open("AllPrimers.pickle", "wb") as pickle_out:
    pickle.dump(genelist, pickle_out)