In [79]:
import os
import tempfile
import urllib.request as urlr

import pandas as pd

In [80]:
def get_ref_microbe_taxids(url="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt"):
    """
    Download a bacterial genome assembly summary (by default the latest one from the NCBI genome ftp site)
    and load it into a pd.DataFrame with one row per data line of the summary.

    The file is downloaded into a private temporary directory that is removed before the function
    returns, so repeated calls do not leave copies of the summary behind on disk.
    Errors raised by urllib while downloading or by pandas while parsing are propagated unchanged.

    :param url: location of the tab-separated assembly summary; defaults to the NCBI RefSeq bacteria summary
    :return: pandas dataframe of the assembly summary; all columns are read with dtype=object
    """
    columns = ['assembly_accession', 'bioproject', 'biosample', 'wgs_master', 'refseq_category', 'taxid',
               'species_taxid', 'organism_name', 'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
               'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter', 'gbrs_paired_asm',
               'paired_asm_comp', 'ftp_path', 'excluded_from_refseq', 'relation_to_type_material']
    # Download to an explicit path inside a throwaway directory: calling urlretrieve without a
    # target filename would create a temp file that nothing ever deletes.
    with tempfile.TemporaryDirectory() as workdir:
        local_path = os.path.join(workdir, "assembly_summary.txt")
        urlr.urlretrieve(url, local_path)
        # The first two lines of the file are skipped and the column names are supplied explicitly;
        # dtype=object disables numeric inference so values such as taxids stay as text.
        data = pd.read_csv(local_path, sep="\t", dtype=object, skiprows=2, names=columns)
    return data

In [81]:
# Download and parse the assembly summary once; the cells below filter this frame.
microbes = get_ref_microbe_taxids()

In [82]:
# Keep only the rows whose refseq_category is exactly 'reference genome'.
wikigenomes = microbes.loc[
    microbes['refseq_category'].eq('reference genome')
]

In [83]:
# Select reference genomes whose submitter text contains "DOE".
# na=False treats a missing submitter as "no match"; without it the mask would contain NaN
# and boolean indexing would raise ValueError. regex=False because "DOE" is a literal substring.
wikigenomes_JGI = wikigenomes[wikigenomes['submitter'].str.contains("DOE", na=False, regex=False)]

In [88]:
# Show the reference genomes whose submitter contains "DOE" (all columns).
wikigenomes_JGI

Unnamed: 0,assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,release_type,genome_rep,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material
6360,GCF_000012245.1,PRJNA57931,SAMN02604347,,reference genome,205918,317,Pseudomonas syringae pv. syringae B728a,strain=B728a,,...,Major,Full,2005/05/12,ASM1224v1,DOE Joint Genome Institute,GCA_000012245.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
26669,GCF_000012905.2,PRJNA57653,SAMN02598537,,reference genome,272943,1063,Rhodobacter sphaeroides 2.4.1,strain=2.4.1; ATCC BAA-808,,...,Major,Full,2013/10/23,ASM1290v2,DOE Joint Genome Institute,GCA_000012905.2,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
26700,GCF_000013085.1,PRJNA57655,SAMN02598538,,reference genome,269796,1085,Rhodospirillum rubrum ATCC 11170,strain=ATCC 11170,,...,Major,Full,2005/12/12,ASM1308v1,DOE Joint Genome Institute,GCA_000013085.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,assembly from type material
26711,GCF_000018865.1,PRJNA57657,SAMN02598539,,reference genome,324602,1108,Chloroflexus aurantiacus J-10-fl,strain=J-10-fl,,...,Major,Full,2007/12/20,ASM1886v1,US DOE Joint Genome Institute,GCA_000018865.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,assembly from type material
48282,GCF_000008165.1,PRJNA58091,SAMN02598266,,reference genome,260799,1392,Bacillus anthracis str. Sterne,strain=Sterne,,...,Major,Full,2004/06/24,ASM816v1,DOE Joint Genome Institute,GCA_000008165.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
49229,GCF_000008505.1,PRJNA58089,SAMN02598265,,reference genome,281309,1428,[Bacillus thuringiensis] serovar konkukian str...,strain=97-27,,...,Major,Full,2004/12/27,ASM850v1,DOE Joint Genome Institute,GCA_000008505.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
50800,GCF_000013105.1,PRJNA58051,SAMN02598257,,reference genome,264732,1525,Moorella thermoacetica ATCC 39073,strain=ATCC 39073,,...,Major,Full,2005/12/12,ASM1310v1,DOE Joint Genome Institute,GCA_000013105.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
51271,GCF_000014525.1,PRJNA57985,SAMN02598528,,reference genome,321967,1597,Lactobacillus paracasei ATCC 334,strain=ATCC 334,,...,Major,Full,2006/10/13,ASM1452v1,"US DOE Joint Genome Institute (JGI), The Lacti...",GCA_000014525.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
71723,GCF_000185905.1,PRJNA62101,SAMN00713576,,reference genome,765698,39645,Mesorhizobium ciceri biovar biserrulae WSM1271,strain=WSM1271,,...,Major,Full,2011/01/07,ASM18590v1,US DOE Joint Genome Institute,GCA_000185905.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,
72101,GCF_000023405.1,PRJNA58649,SAMN00002685,,reference genome,446465,43669,Brachybacterium faecium DSM 4810,strain=DSM 4810,,...,Major,Full,2009/08/25,ASM2340v1,US DOE Joint Genome Institute (JGI-PGF),GCA_000023405.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,assembly from type material


In [87]:
# Compact view: organism, submitting institute and sequence release date only.
wikigenomes_JGI.loc[:, ['organism_name', 'submitter', 'seq_rel_date']]

Unnamed: 0,organism_name,submitter,seq_rel_date
6360,Pseudomonas syringae pv. syringae B728a,DOE Joint Genome Institute,2005/05/12
26669,Rhodobacter sphaeroides 2.4.1,DOE Joint Genome Institute,2013/10/23
26700,Rhodospirillum rubrum ATCC 11170,DOE Joint Genome Institute,2005/12/12
26711,Chloroflexus aurantiacus J-10-fl,US DOE Joint Genome Institute,2007/12/20
48282,Bacillus anthracis str. Sterne,DOE Joint Genome Institute,2004/06/24
49229,[Bacillus thuringiensis] serovar konkukian str...,DOE Joint Genome Institute,2004/12/27
50800,Moorella thermoacetica ATCC 39073,DOE Joint Genome Institute,2005/12/12
51271,Lactobacillus paracasei ATCC 334,"US DOE Joint Genome Institute (JGI), The Lacti...",2006/10/13
71723,Mesorhizobium ciceri biovar biserrulae WSM1271,US DOE Joint Genome Institute,2011/01/07
72101,Brachybacterium faecium DSM 4810,US DOE Joint Genome Institute (JGI-PGF),2009/08/25
