In [1]:
import requests
import xmltodict
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import concurrent.futures
import time

# Get 16S 'green algae' (1783) and 'more' (190) records, 1973 in total, from the Nucleotide database of NCBI

In [2]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of green plants that are not land plants.
search_term = '(16s[All+Fields]+AND+%22eukaryotes%22[porgn])+AND+%22green+plants%22[porgn]+NOT+%22land+plants%22[porgn]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [3]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

1973

In [4]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [5]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 53.5 s, sys: 2.18 s, total: 55.7 s
Wall time: 18min 42s


In [6]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

1973

In [7]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_GreenAlgae/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_GreenAlgae/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [8]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 56.8 s, sys: 3.35 s, total: 1min
Wall time: 58min 37s


# Get 16s 'diatoms' (1770) from Nucleotide database of NCBI

In [21]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to diatoms (txid2836).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22diatoms%22[porgn:__txid2836]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [40]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

1770

In [41]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [42]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 48.5 s, sys: 2.07 s, total: 50.5 s
Wall time: 17min 40s


In [43]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

1770

In [44]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Diatoms/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Diatoms/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [45]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 51.1 s, sys: 3.02 s, total: 54.1 s
Wall time: 52min 4s


# Get 16s 'haptophytes' (662) from Nucleotide database of NCBI

In [2]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to haptophytes.
search_term = '(16s[All+Fields]+AND+%22eukaryotes%22[porgn]+NOT+%22animals%22[porgn])+AND+%22haptophytes%22[porgn]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [3]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

662

In [4]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [5]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 17.1 s, sys: 706 ms, total: 17.8 s
Wall time: 5min 49s


In [6]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

662

In [7]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Haptophytes/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Haptophytes/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [9]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 17.7 s, sys: 948 ms, total: 18.6 s
Wall time: 19min 15s


# Get 16s 'cryptomonads' (622) from Nucleotide database of NCBI

In [10]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to cryptomonads (txid3027).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22cryptomonads%22[porgn:__txid3027]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [11]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

622

In [12]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [13]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 16 s, sys: 726 ms, total: 16.7 s
Wall time: 5min 38s


In [14]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

622

In [15]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Cryptomonads/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Cryptomonads/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [16]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 16.5 s, sys: 900 ms, total: 17.4 s
Wall time: 17min 59s


# Get 16s 'red algae' (432) from Nucleotide database of NCBI

In [17]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to red algae (txid2763).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22red+algae%22[porgn:__txid2763]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [18]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

432

In [19]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [20]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 11.1 s, sys: 525 ms, total: 11.6 s
Wall time: 4min 6s


In [21]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

432

In [22]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_RedAlgae/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_RedAlgae/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [23]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 13.3 s, sys: 957 ms, total: 14.2 s
Wall time: 13min 1s


# Get 16s 'euglenoids' (416) from Nucleotide database of NCBI

In [24]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to euglenoids (txid3035).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22euglenoids%22[porgn:__txid3035]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [25]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

416

In [26]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [27]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 10.9 s, sys: 414 ms, total: 11.3 s
Wall time: 3min 49s


In [28]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

416

In [29]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Euglenoids/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Euglenoids/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [30]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 11.2 s, sys: 671 ms, total: 11.9 s
Wall time: 12min 4s


# Get 16s 'brown algae' (305) from Nucleotide database of NCBI

In [31]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to brown algae (txid2870).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22brown+algae%22[porgn:__txid2870]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [32]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

305

In [33]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [34]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 8.01 s, sys: 267 ms, total: 8.28 s
Wall time: 2min 55s


In [35]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

305

In [36]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_BrownAlgae/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_BrownAlgae/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [37]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 8.41 s, sys: 521 ms, total: 8.93 s
Wall time: 9min 4s


# Get 16s 'pelagophytes' (302) from Nucleotide database of NCBI

In [38]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of Eukaryota other than Metazoa, restricted to Pelagophyceae.
search_term = '(16s[All+Fields]+AND+%22Eukaryota%22[Primary+Organism]+NOT+%22Metazoa%22[Primary+Organism])+AND+%22Pelagophyceae%22[Primary+Organism]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [39]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

302

In [40]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [41]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 7.57 s, sys: 274 ms, total: 7.84 s
Wall time: 2min 41s


In [42]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

302

In [45]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Pelagophytes/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Pelagophytes/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [46]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 7.71 s, sys: 422 ms, total: 8.13 s
Wall time: 8min 54s


# Get 16s 'golden algae' (203) from Nucleotide database of NCBI

In [47]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to golden algae (txid2825).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22golden+algae%22[porgn:__txid2825]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [48]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

203

In [49]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [50]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 5.07 s, sys: 207 ms, total: 5.28 s
Wall time: 1min 52s


In [51]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

203

In [52]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_GoldenAlgae/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_GoldenAlgae/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [53]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 5.17 s, sys: 323 ms, total: 5.5 s
Wall time: 5min 52s


# Get 16s 'dinoflagellates' (200) from Nucleotide database of NCBI

In [54]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to dinoflagellates (txid2864).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22dinoflagellates%22[porgn:__txid2864]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [55]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

200

In [56]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [57]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 5 s, sys: 202 ms, total: 5.21 s
Wall time: 1min 48s


In [58]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

200

In [59]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Dinoflagellates/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Dinoflagellates/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [60]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 5.12 s, sys: 241 ms, total: 5.36 s
Wall time: 5min 44s


# Get 16s 'ciliates' (107) from Nucleotide database of NCBI

In [61]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote):
# 16S records of non-animal eukaryotes restricted to ciliates (txid5878).
search_term = '((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22ciliates%22[porgn:__txid5878]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [62]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

107

In [63]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [64]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 2.69 s, sys: 98.4 ms, total: 2.79 s
Wall time: 59.3 s


In [65]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

107

In [66]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Ciliates/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Ciliates/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [67]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 2.86 s, sys: 160 ms, total: 3.02 s
Wall time: 3min 6s


# Get 16s 'cercozoans' (32) from Nucleotide database of NCBI

In [68]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote): 16S records of
# non-animal eukaryotes, minus the groups listed after NOT, restricted to
# cercozoans (txid136419).
search_term = '(((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22cercozoans%22[porgn:__txid136419]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [69]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

32

In [70]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [71]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 807 ms, sys: 31.1 ms, total: 838 ms
Wall time: 17.4 s


In [72]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

32

In [73]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_Cercozoans/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_Cercozoans/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [74]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 838 ms, sys: 27.4 ms, total: 865 ms
Wall time: 57.2 s


# Get 16s 'yellow-green algae' (5) from Nucleotide database of NCBI

In [75]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote): 16S records of
# non-animal eukaryotes, minus the groups listed after NOT, restricted to
# yellow-green algae.
search_term = '((16s[All+Fields]+AND+%22eukaryotes%22[porgn]+NOT+%22animals%22[porgn])+AND+%22eukaryotes%22[porgn]+NOT+%22ciliates%22[porgn]+NOT+%22dinoflagellates%22[porgn]+NOT+%22golden+algae%22[porgn]+NOT+%22apicomplexans%22[porgn]+NOT+%22pelagophytes%22[porgn]+NOT+%22brown+algae%22[porgn]+NOT+%22euglenoids%22[porgn]+NOT+%22red+algae%22[porgn]+NOT+%22trichomonads%22[porgn]+NOT+%22cryptomonads%22[porgn]+NOT+%22haptophytes%22[porgn]+NOT+%22diatoms%22[porgn]+NOT+%22microsporidians%22[porgn]+NOT+%22ascomycetes%22[porgn]+NOT+%22fungi%22[porgn]+NOT+%22green+algae%22[porgn]+NOT+%22vascular+plants%22[porgn]+NOT+%22land+plants%22[porgn]+NOT+%22green+plants%22[porgn])+AND+%22yellow-green+algae%22[porgn]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [76]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

5

In [77]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [78]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 117 ms, sys: 3.78 ms, total: 121 ms
Wall time: 2.97 s


In [79]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

5

In [80]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_YellowGreenAlgae/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_YellowGreenAlgae/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [81]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 161 ms, sys: 12.2 ms, total: 173 ms
Wall time: 9.31 s


# Get 16s 'diplomonads' (59) from Nucleotide database of NCBI

In [3]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Entrez query ("+" = space; the double quotes are left unencoded in this
# one): 16S records of Eukaryota other than Metazoa, minus the groups listed
# after NOT, restricted to Diplomonadida.
search_term = '((16s[All+Fields]+AND+"Eukaryota"[Primary+Organism]+NOT+"Metazoa"[Primary+Organism])+AND+"Eukaryota"[Primary+Organism]+NOT+"Ciliophora"[Primary+Organism]+NOT+"Dinophyceae"[Primary+Organism]+NOT+"Chrysophyceae"[Primary+Organism]+NOT+"Apicomplexa"[Primary+Organism]+NOT+"Pelagophyceae"[Primary+Organism]+NOT+"Phaeophyceae"[Primary+Organism]+NOT+"Euglenida"[Primary+Organism]+NOT+"Rhodophyta"[Primary+Organism]+NOT+"Trichomonadida"[Primary+Organism]+NOT+"Cryptophyceae"[Primary+Organism]+NOT+"Haptophyta"[Primary+Organism]+NOT+"Bacillariophyta"[Primary+Organism]+NOT+"Microsporidia"[Primary+Organism]+NOT+"Ascomycota"[Primary+Organism]+NOT+"Fungi"[Primary+Organism]+NOT+"Chlorophyta"[Primary+Organism]+NOT+"Tracheophyta"[Primary+Organism]+NOT+"Embryophyta"[Primary+Organism]+NOT+"Viridiplantae"[Primary+Organism])+AND+"Diplomonadida"[Primary+Organism]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [4]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

59

In [5]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [6]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 1.53 s, sys: 67.1 ms, total: 1.6 s
Wall time: 40.1 s


In [7]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

59

In [8]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_marker_NCBI/16S_Diplomonads/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_marker_NCBI/16S_Diplomonads/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [9]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 1.61 s, sys: 66.3 ms, total: 1.67 s
Wall time: 1min 43s


# Get 16s 'kinetoplastids' (47) from Nucleotide database of NCBI

In [10]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote): 16S records of
# non-animal eukaryotes, minus the groups listed after NOT, restricted to
# kinetoplastids (txid5653).
search_term = '(((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22kinetoplastids%22[porgn:__txid5653]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [11]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

47

In [12]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [13]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 1.24 s, sys: 48.8 ms, total: 1.28 s
Wall time: 26.5 s


In [14]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

47

In [15]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_marker_NCBI/16S_Kinetoplastids/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_marker_NCBI/16S_Kinetoplastids/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [16]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 2.11 s, sys: 220 ms, total: 2.33 s
Wall time: 1min 44s


# Get 16s 'forams' (40) from Nucleotide database of NCBI

In [17]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote): 16S records of
# non-animal eukaryotes, minus the groups listed after NOT, restricted to
# forams (txid29178).
search_term = '(((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22forams%22[porgn:__txid29178]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [18]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

40

In [19]:
algae_nc_list = []

def get_locus(taxon_id):
    """Fetch the accession of one nuccore record and append it to algae_nc_list.

    Args:
        taxon_id: id passed to efetch for database ``db``; the calling cell
            supplies UIDs from the ESearch result.

    Returns:
        None. On success the text before the first "." of the reply is
        appended to the module-level ``algae_nc_list``; on any error
        "<taxon_id> NULL" is printed and nothing is appended.
    """
    try:
        search = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(search), timeout=60)
        # Reject HTTP errors so an error page is never stored as an accession.
        response.raise_for_status()
        # strip() drops surrounding whitespace (e.g. a trailing newline) that
        # split('.') alone keeps when the reply has no ".version" suffix.
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty efetch reply")
        algae_nc_list.append(locus_id)
    except Exception:
        # Exception (not bare except) so Ctrl-C still stops the loop. Print the
        # input id: locus_id is unbound if the request itself failed, which
        # made the original handler raise UnboundLocalError.
        print (taxon_id, "NULL")

In [20]:
%%time
# One efetch request per UID. get_locus appends to algae_nc_list, so re-run the
# cell that resets algae_nc_list before re-running this one to avoid duplicates.
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 1.05 s, sys: 43.9 ms, total: 1.09 s
Wall time: 37 s


In [21]:
# Number of accessions collected; a value below len(IdList) means some lookups failed.
len(algae_nc_list)

40

In [22]:
def get_fasta(nuc_id):
    """Download one record as FASTA and save it as "16S_marker_NCBI/16S_Forams/<nuc_id>.fasta".

    Args:
        nuc_id: accession without version suffix; ".1" is appended for efetch.

    Returns:
        None. The file is written only after a successful reply that looks
        like FASTA; on any error "<nuc_id> Not Found <error>" is printed.
        Sleeps 1 s after every attempt, successful or not.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1" — confirm that
        # records revised since (".2" and up) are meant to be fetched this way.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta), timeout=60)
        # Without these checks an HTTP error body was saved as a .fasta file.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("reply is not FASTA")
        # Context manager closes the file even if the write fails.
        with open("16S_marker_NCBI/16S_Forams/" + "{}.fasta".format(nuc_id), "w") as handle:
            handle.write(download_fasta.text)
    except Exception as err:
        # Exception (not bare except) so Ctrl-C still stops the loop; include
        # the error because not every failure is a missing record.
        print (nuc_id, "Not Found", err)
    finally:
        time.sleep(1)

In [23]:
%%time
# One download per collected accession; get_fasta sleeps 1 s per call, so this
# takes at least len(algae_nc_list) seconds.
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 1.07 s, sys: 43.2 ms, total: 1.11 s
Wall time: 1min 11s


# Get 16s 'oomycetes' (36) from Nucleotide database of NCBI

In [24]:
# E-utilities URL template; "{}" receives "<utility>.fcgi?<query string>".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
# Entrez database used by the search and fetch cells below.
db = "nuccore"
# Pre-encoded Entrez query ("+" = space, "%22" = double quote): 16S records of
# non-animal eukaryotes, minus the groups listed after NOT, restricted to
# oomycetes (txid4762).
search_term = '(((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22oomycetes%22[porgn:__txid4762]'
# ESearch request path asking for up to 100000 UIDs in one response.
search = "esearch.fcgi?db=" + db + "&term=" + search_term + "&retmax=100000"

In [25]:
# Run the ESearch request built above and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search), timeout=60)
# Stop on an HTTP error instead of parsing an error page as a result.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict gives None for an empty <IdList/> and a bare string for a single
# <Id>; normalise both so the loop below always iterates over whole UIDs.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

36

In [26]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [27]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 923 ms, sys: 56.8 ms, total: 980 ms
Wall time: 23.8 s


In [28]:
len(algae_nc_list)

36

In [29]:
def get_fasta(nuc_id, out_dir="16S_marker_NCBI/16S_Oomycetes/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [30]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 1.65 s, sys: 131 ms, total: 1.78 s
Wall time: 1min 18s


# Get 16s 'plasmodial slime molds' (13) from Nucleotide database of NCBI

In [31]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '(((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22plasmodial+slime+molds%22[porgn:__txid33680]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [32]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

13

In [33]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [34]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 327 ms, sys: 33.4 ms, total: 360 ms
Wall time: 8.2 s


In [35]:
len(algae_nc_list)

13

In [36]:
def get_fasta(nuc_id, out_dir="16S_marker_NCBI/16S_Plasm-Slime-Molds/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [37]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 335 ms, sys: 23.6 ms, total: 358 ms
Wall time: 22.3 s


# Get 16s 'cellular slime molds' (5) from Nucleotide database of NCBI

In [38]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '(((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22cellular+slime+molds%22[porgn:__txid33083]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [39]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

5

In [40]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [41]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 122 ms, sys: 17.1 ms, total: 139 ms
Wall time: 7.28 s


In [42]:
len(algae_nc_list)

5

In [43]:
def get_fasta(nuc_id, out_dir="16S_marker_NCBI/16S_Cell-Slime-Molds/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [44]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 527 ms, sys: 97.3 ms, total: 624 ms
Wall time: 16 s


# Get 16s 'slime nets' (5) from Nucleotide database of NCBI

In [45]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22slime+nets%22[porgn:__txid35131])+AND+%22slime+nets%22[porgn:__txid35131]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [46]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

5

In [47]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [48]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 137 ms, sys: 4.1 ms, total: 141 ms
Wall time: 3.58 s


In [49]:
len(algae_nc_list)

5

In [50]:
def get_fasta(nuc_id, out_dir="16S_marker_NCBI/16S_Slime-Nets/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [51]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 142 ms, sys: 579 µs, total: 142 ms
Wall time: 8.96 s


# Get 16s 'uncultured phototrophic eukaryote' (1873) from Nucleotide database of NCBI

In [2]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '(((16s[All+Fields]+AND+%22eukaryotes%22[porgn]+NOT+%22animals%22[porgn])+AND+%22eukaryotes%22[porgn]+NOT+%22ciliates%22[porgn]+NOT+%22dinoflagellates%22[porgn]+NOT+%22golden+algae%22[porgn]+NOT+%22apicomplexans%22[porgn]+NOT+%22pelagophytes%22[porgn]+NOT+%22brown+algae%22[porgn]+NOT+%22euglenoids%22[porgn]+NOT+%22red+algae%22[porgn]+NOT+%22trichomonads%22[porgn]+NOT+%22cryptomonads%22[porgn]+NOT+%22haptophytes%22[porgn]+NOT+%22diatoms%22[porgn]+NOT+%22microsporidians%22[porgn]+NOT+%22ascomycetes%22[porgn]+NOT+%22fungi%22[porgn]+NOT+%22green+algae%22[porgn]+NOT+%22vascular+plants%22[porgn]+NOT+%22land+plants%22[porgn]+NOT+%22green+plants%22[porgn])+AND+%22eukaryotes%22[porgn]+NOT+%22slime+nets%22[porgn]+NOT+%22yellow-green+algae%22[porgn]+NOT+%22cellular+slime+molds%22[porgn]+NOT+%22plasmodial+slime+molds%22[porgn]+NOT+%22cercozoans%22[porgn]+NOT+%22oomycetes%22[porgn]+NOT+%22forams%22[porgn]+NOT+%22kinetoplastids%22[porgn]+NOT+%22diplomonads%22[porgn])+AND+%22uncultured+phototrophic+eukaryote%22[porgn]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [3]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

1873

In [4]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [5]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 46.1 s, sys: 1.89 s, total: 48 s
Wall time: 18min 16s


In [6]:
len(algae_nc_list)

1873

In [7]:
def get_fasta(nuc_id, out_dir="16S_UnCul-Phototroph-Euk/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [8]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 47.5 s, sys: 2.55 s, total: 50 s
Wall time: 54min 51s


# Get 16s 'uncultured eukaryotic phytoplankton' (183) from Nucleotide database of NCBI

In [19]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '(((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22slime+nets%22[porgn:__txid35131]+NOT+%22yellow-green+algae%22[porgn:__txid2833]+NOT+%22cellular+slime+molds%22[porgn:__txid33083]+NOT+%22plasmodial+slime+molds%22[porgn:__txid33680]+NOT+%22cercozoans%22[porgn:__txid136419]+NOT+%22oomycetes%22[porgn:__txid4762]+NOT+%22forams%22[porgn:__txid29178]+NOT+%22kinetoplastids%22[porgn:__txid5653]+NOT+%22diplomonads%22[porgn:__txid5738])+AND+%22eukaryotes%22[porgn:__txid2759])+AND+%22uncultured+eukaryotic+phytoplankton%22[porgn:__txid1737140]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [20]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

183

In [21]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [22]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 4.55 s, sys: 168 ms, total: 4.72 s
Wall time: 1min 40s


In [23]:
len(algae_nc_list)

183

In [24]:
def get_fasta(nuc_id, out_dir="16S_UnCul-Euk-Phytoplank/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [25]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 4.63 s, sys: 244 ms, total: 4.88 s
Wall time: 5min 21s


# Get 16s 'uncultured protist' (117) from Nucleotide database of NCBI

In [52]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22slime+nets%22[porgn:__txid35131]+NOT+%22yellow-green+algae%22[porgn:__txid2833]+NOT+%22cellular+slime+molds%22[porgn:__txid33083]+NOT+%22plasmodial+slime+molds%22[porgn:__txid33680]+NOT+%22cercozoans%22[porgn:__txid136419]+NOT+%22oomycetes%22[porgn:__txid4762]+NOT+%22forams%22[porgn:__txid29178]+NOT+%22kinetoplastids%22[porgn:__txid5653]+NOT+%22diplomonads%22[porgn:__txid5738])+AND+%22uncultured+protist%22[porgn:__txid1295078]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [53]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

117

In [54]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [55]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 3.04 s, sys: 143 ms, total: 3.18 s
Wall time: 1min 16s


In [57]:
len(algae_nc_list)

117

In [58]:
def get_fasta(nuc_id, out_dir="16S_marker_NCBI/16S_Uncult-Protist/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [59]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 3.05 s, sys: 200 ms, total: 3.25 s
Wall time: 3min 41s


# Get 16s 'Nannochloropsis oceanica' (150) from Nucleotide database of NCBI

In [98]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '(((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22slime+nets%22[porgn:__txid35131]+NOT+%22yellow-green+algae%22[porgn:__txid2833]+NOT+%22cellular+slime+molds%22[porgn:__txid33083]+NOT+%22plasmodial+slime+molds%22[porgn:__txid33680]+NOT+%22cercozoans%22[porgn:__txid136419]+NOT+%22oomycetes%22[porgn:__txid4762]+NOT+%22forams%22[porgn:__txid29178]+NOT+%22kinetoplastids%22[porgn:__txid5653]+NOT+%22diplomonads%22[porgn:__txid5738])+AND+%22eukaryotes%22[porgn:__txid2759])+AND+%22Nannochloropsis+oceanica%22[porgn:__txid145522]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [99]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

150

In [100]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [101]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 3.73 s, sys: 163 ms, total: 3.9 s
Wall time: 1min 24s


In [102]:
len(algae_nc_list)

150

In [103]:
def get_fasta(nuc_id, out_dir="16S_Nannochloropsis-oceanica/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [104]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 3.8 s, sys: 236 ms, total: 4.04 s
Wall time: 4min 25s


# Get 16s 'uncultured alga' (69) from Nucleotide database of NCBI

In [91]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '(((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22slime+nets%22[porgn:__txid35131]+NOT+%22yellow-green+algae%22[porgn:__txid2833]+NOT+%22cellular+slime+molds%22[porgn:__txid33083]+NOT+%22plasmodial+slime+molds%22[porgn:__txid33680]+NOT+%22cercozoans%22[porgn:__txid136419]+NOT+%22oomycetes%22[porgn:__txid4762]+NOT+%22forams%22[porgn:__txid29178]+NOT+%22kinetoplastids%22[porgn:__txid5653]+NOT+%22diplomonads%22[porgn:__txid5738])+AND+%22eukaryotes%22[porgn:__txid2759])+AND+%22uncultured+alga%22[porgn:__txid171248]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [92]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

69

In [93]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [94]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 1.76 s, sys: 55.4 ms, total: 1.81 s
Wall time: 38.2 s


In [95]:
len(algae_nc_list)

69

In [96]:
def get_fasta(nuc_id, out_dir="16S_Uncul-Algae/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [97]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 1.74 s, sys: 119 ms, total: 1.86 s
Wall time: 1min 59s


# Get 16s 'uncultured marine phototrophic eukaryote' (51) from Nucleotide database of NCBI

In [42]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22slime+nets%22[porgn:__txid35131]+NOT+%22yellow-green+algae%22[porgn:__txid2833]+NOT+%22cellular+slime+molds%22[porgn:__txid33083]+NOT+%22plasmodial+slime+molds%22[porgn:__txid33680]+NOT+%22cercozoans%22[porgn:__txid136419]+NOT+%22oomycetes%22[porgn:__txid4762]+NOT+%22forams%22[porgn:__txid29178]+NOT+%22kinetoplastids%22[porgn:__txid5653]+NOT+%22diplomonads%22[porgn:__txid5738])+AND+%22uncultured+marine+phototrophic+eukaryote%22[porgn:__txid544549]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [43]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

51

In [44]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [45]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 1.3 s, sys: 43.8 ms, total: 1.34 s
Wall time: 1min 48s


In [46]:
len(algae_nc_list)

51

In [47]:
def get_fasta(nuc_id, out_dir="16S_Uncul-Marine-Phototroph-Euk/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [48]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 1.31 s, sys: 62.8 ms, total: 1.37 s
Wall time: 1min 59s


# Get 16s 'All Other Taxa' (865) from Nucleotide database of NCBI

In [2]:
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"
db = "nuccore"
search_term = '((((((16s)+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22animals%22[porgn:__txid33208])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22ciliates%22[porgn:__txid5878]+NOT+%22dinoflagellates%22[porgn:__txid2864]+NOT+%22golden+algae%22[porgn:__txid2825]+NOT+%22apicomplexans%22[porgn:__txid5794]+NOT+%22pelagophytes%22[porgn:__txid35675]+NOT+%22brown+algae%22[porgn:__txid2870]+NOT+%22euglenoids%22[porgn:__txid3035]+NOT+%22red+algae%22[porgn:__txid2763]+NOT+%22trichomonads%22[porgn:__txid37104]+NOT+%22cryptomonads%22[porgn:__txid3027]+NOT+%22haptophytes%22[porgn:__txid2830]+NOT+%22diatoms%22[porgn:__txid2836]+NOT+%22microsporidians%22[porgn:__txid6029]+NOT+%22ascomycetes%22[porgn:__txid4890]+NOT+%22fungi%22[porgn:__txid4751]+NOT+%22green+algae%22[porgn:__txid3041]+NOT+%22vascular+plants%22[porgn:__txid58023]+NOT+%22land+plants%22[porgn:__txid3193]+NOT+%22green+plants%22[porgn:__txid33090])+AND+%22eukaryotes%22[porgn:__txid2759]+NOT+%22slime+nets%22[porgn:__txid35131]+NOT+%22yellow-green+algae%22[porgn:__txid2833]+NOT+%22cellular+slime+molds%22[porgn:__txid33083]+NOT+%22plasmodial+slime+molds%22[porgn:__txid33680]+NOT+%22cercozoans%22[porgn:__txid136419]+NOT+%22oomycetes%22[porgn:__txid4762]+NOT+%22forams%22[porgn:__txid29178]+NOT+%22kinetoplastids%22[porgn:__txid5653]+NOT+%22diplomonads%22[porgn:__txid5738])+AND+%22eukaryotes%22[porgn:__txid2759])+NOT+%22uncultured+eukaryotic+phytoplankton%22[porgn:__txid1737140]+NOT+%22uncultured+picoeukaryote%22[porgn:__txid1141622]+NOT+%22uncultured+marine+eukaryote%22[porgn:__txid203449]+NOT+%22uncultured+phototrophic+eukaryote%22[porgn:__txid172788]+NOT+%22uncultured+eukaryote%22[porgn:__txid100272])+NOT+%22uncultured+marine+phototrophic+eukaryote%22[porgn:__txid544549]+NOT+%22uncultured+marine+rappemonad%22[porgn:__txid884108]+NOT+%22uncultured+alga%22[porgn:__txid171248]+NOT+%22uncultured+protist%22[porgn:__txid1295078]+NOT+%22Nannochloropsis+oceanica%22[porgn:__txid145522]'
search = "esearch.fcgi?db={}&term={}&retmax=100000".format(db, search_term)

In [3]:
# Run the ESearch query and collect the matching UIDs.
esearch_response = requests.get(base_url.format(search))
# Fail loudly on an HTTP error instead of parsing an error page as XML.
esearch_response.raise_for_status()
esearch_dict = xmltodict.parse(esearch_response.content)
# xmltodict yields None for an empty <IdList/>, a bare str for a single <Id>
# and a list only for two or more hits; normalise all three to a list so that
# len() and the per-id loop always work on whole ids.
IdList = (esearch_dict['eSearchResult']['IdList'] or {}).get('Id', [])
if isinstance(IdList, str):
    IdList = [IdList]
len(IdList)

865

In [4]:
algae_nc_list = []

def get_locus(taxon_id):
    """Resolve one nuccore UID to its accession and append it to ``algae_nc_list``.

    Calls EFetch with ``rettype=acc`` (URL built from the module-level
    ``base_url`` and ``db``), drops everything from the first ``.`` of the
    returned text (the version suffix) and appends the remainder to the
    module-level ``algae_nc_list``.

    Args:
        taxon_id: One entry of ``IdList``. It is sent as the EFetch ``id``
            parameter, so despite the name it is a record UID of ``db``.

    Returns:
        None. Failures are printed as ``<taxon_id> NULL <error>`` and nothing
        is appended for them.
    """
    try:
        query = "efetch.fcgi?db={}&id={}&rettype=acc".format(db, taxon_id)
        response = requests.get(base_url.format(query))
        # requests does not raise on HTTP errors; without this check an error
        # body would be split and stored as if it were an accession.
        response.raise_for_status()
        locus_id = response.text.strip().split('.')[0]
        if not locus_id:
            raise ValueError("empty EFetch response")
        algae_nc_list.append(locus_id)
    except Exception as exc:
        # Print the input id, not ``locus_id``: that local is unbound when the
        # request itself fails, which made the old handler raise a second error.
        print(taxon_id, "NULL", exc)

In [5]:
%%time
for taxon_id in IdList:
    get_locus(taxon_id)

CPU times: user 22.2 s, sys: 953 ms, total: 23.1 s
Wall time: 8min 30s


In [6]:
len(algae_nc_list)

865

In [7]:
def get_fasta(nuc_id, out_dir="16S_marker_NCBI/16S_All-other-taxa_865/"):
    """Download the FASTA record for one accession into ``out_dir``.

    Fetches ``<nuc_id>.1`` from nuccore via EFetch (URL built from the
    module-level ``base_url``) and writes the response text to
    ``<out_dir><nuc_id>.fasta``. Sleeps one second after every attempt.

    Args:
        nuc_id: Accession without a version suffix; ``.1`` is appended.
        out_dir: Path prefix of the output file. It is concatenated as-is, so
            it needs its trailing separator, and the directory must exist.

    Returns:
        None. Failures are printed as ``<nuc_id> Not Found <error>`` instead
        of being raised; no file is created for a failed download.
    """
    try:
        # NOTE(review): the version is hard-coded to ".1"; confirm that is
        # intended for records that have newer versions.
        fasta = "efetch.fcgi?db=nuccore&id={}.1&rettype=fasta".format(nuc_id)
        download_fasta = requests.get(base_url.format(fasta))
        # requests does not raise on HTTP errors, so an error page or empty
        # body used to be saved as a ".fasta" file without any report.
        download_fasta.raise_for_status()
        if not download_fasta.text.lstrip().startswith(">"):
            raise ValueError("response is not FASTA")
        # ``with`` closes the handle even if the write fails.
        with open(out_dir + "{}.fasta".format(nuc_id), "w") as file:
            file.write(download_fasta.text)
    except Exception as exc:
        print(nuc_id, "Not Found", exc)
    # Pause after success and failure alike, as the original did.
    # Presumably this throttles requests to NCBI -- TODO confirm.
    time.sleep(1)

In [8]:
%%time
for locus_id in algae_nc_list:
    get_fasta(locus_id)

CPU times: user 24.1 s, sys: 1.5 s, total: 25.6 s
Wall time: 26min 6s
