Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Branch: master
Fetching contributors…

Cannot retrieve contributors at this time

86 lines (72 sloc) 2.58 kB
# Author: Harry Hull
# Date: 11/9/2012
# Program usage:
#   usage: seqExtract.py [-h] -i INPUT -o OUTPUT -e EMAIL
#
#   optional arguments:
#     -h, --help            show this help message and exit
#     -i INPUT, --input INPUT
#                           The list of NCBI Bioproject IDs that have associated contigs.
#     -o OUTPUT, --output OUTPUT
#                           The folder to store all of the fasta files
#     -e EMAIL, --email EMAIL
#                           Your email so NCBI knows who they are talking to.
from Bio import Entrez, SeqIO
import argparse, sys, os
def parseFile(f):
    """Collect the BioProject IDs listed in an open text file.

    A line that starts with ``'BioProject'`` opens a section.  Every
    following line that consists solely of digits (after surrounding
    whitespace is stripped) is recorded as an ID.  The first line that is
    not purely numeric closes the section, unless that line is itself a
    ``'BioProject'`` header, in which case a new section starts right there.

    Args:
        f: An iterable of text lines, typically an open file object.

    Returns:
        A list of ID strings (whitespace stripped) in file order.
    """
    ids = []
    in_section = False
    # Iterate the file object directly so the whole file is never held in
    # memory at once.
    for line in f:
        token = line.strip()
        if in_section and token.isdigit():
            ids.append(token)
        else:
            # Any non-numeric line ends the current section.  Re-testing it
            # for the header prefix keeps a section that directly follows
            # another one from being skipped.
            in_section = line.startswith('BioProject')
    return ids
def getSeq(email, ids, outputpath):
    """Download the nucleotide sequences linked to each BioProject ID.

    For every ID the BioProject record is linked to the ``nuccore``
    database with ``Entrez.elink``, the linked records are fetched as FASTA
    with ``Entrez.efetch``, and all of them are written to a single file
    named ``bioprojectId:<id>-<count>-sequences.fasta`` inside *outputpath*.

    Args:
        email: Address assigned to ``Entrez.email`` before any request.
        ids: Iterable of BioProject IDs.
        outputpath: Folder that receives the FASTA files.
    """
    print('sender\'s email: ' + email)
    Entrez.email = email
    for project_id in ids:
        # Make a link between bioproject database with nucleotide database
        handle = Entrez.elink(dbfrom="bioproject", id=project_id,
                              linkname="bioproject_nuccore")
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the reply fails.
            handle.close()

        link_sets = record[0]["LinkSetDb"]
        if not link_sets:
            # Indexing an empty list with [0] would raise IndexError and
            # abort the remaining projects; skip this one instead.
            print('warning: no nuccore links found for bioproject '
                  + str(project_id))
            continue
        nuccore_ids = [link['Id'] for link in link_sets[0]["Link"]]

        # Get the associated nucleotide sequences for this bioproject
        handle = Entrez.efetch(db="nuccore", id=nuccore_ids,
                               rettype="fasta", retmode="text")
        try:
            records = list(SeqIO.parse(handle, "fasta"))
        finally:
            handle.close()

        # Save them all in one fasta file.  os.path.join places the file
        # inside the folder whether or not outputpath ends with a separator.
        filename = ("bioprojectId:" + str(project_id) + "-"
                    + str(len(records)) + "-sequences.fasta")
        SeqIO.write(records, os.path.join(outputpath, filename), "fasta")
def main(args):
    """Parse the ID list named by the arguments and download its sequences.

    Args:
        args: Object with the attributes ``input`` (path of the ID list
            file), ``output`` (existing folder for the FASTA files) and
            ``email`` (passed on to ``getSeq``).

    Raises:
        SystemExit: With status 2 when the input file cannot be opened or
            the output folder does not exist.
    """
    inputfile = args.input
    outputpath = args.output
    email = args.email

    # attempt to open the input file
    try:
        handle = open(inputfile)
    except IOError:
        print('error: cannot open file: ' + str(inputfile))
        sys.exit(2)

    # The context manager closes the input file on every exit path,
    # including the early exit below.
    with handle:
        if not os.path.isdir(outputpath):
            print('error: filePath is not valid ' + str(outputpath))
            # Same non-zero status as the unreadable-input case above.
            sys.exit(2)
        ids = parseFile(handle)

    print(ids)
    print(len(ids))
    getSeq(email, ids, outputpath)
if __name__ == "__main__":
    # Command-line entry point: all three options are mandatory and are
    # registered from one table of (short flag, long flag, help text).
    parser = argparse.ArgumentParser()
    option_table = (
        ('-i', '--input', 'The list of NCBI Bioproject IDs to be downloaded'),
        ('-o', '--output', 'The folder to store all of the fasta files'),
        ('-e', '--email', 'Your email so NCBI knows who they are talking to.'),
    )
    for short_flag, long_flag, description in option_table:
        parser.add_argument(short_flag, long_flag, help=description,
                            required=True)
    args = parser.parse_args()
    main(args)
Jump to Line
Something went wrong with that request. Please try again.