In [2]:
import urllib.request as urlr
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from pprint import pprint

In [3]:
def get_ref_microbe_taxids():
    """
    Download the latest bacterial genome assembly summary from the NCBI genome ftp site
    and load it into a pd.DataFrame.

    No filtering is done here: every data line of the summary file becomes a row, so
    callers select the reference genomes themselves (e.g. via ``refseq_category``).
    All values are read as strings (``dtype=object``).

    :return: pandas dataframe of bacterial genome assembly data
    """
    columns = ['assembly_accession', 'bioproject', 'biosample', 'wgs_master', 'refseq_category', 'taxid',
               'species_taxid', 'organism_name', 'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
               'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter', 'gbrs_paired_asm',
               'paired_asm_comp', 'ftp_path', 'excluded_from_refseq', 'relation_to_type_material']
    # urlretrieve stores the download in a temporary file and returns (local_path, headers).
    local_path, _headers = urlr.urlretrieve(
        "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt")
    try:
        # The first two lines of the file are skipped and the column names above are
        # used instead of whatever header the file carries.
        return pd.read_csv(local_path, sep="\t", dtype=object, skiprows=2, names=columns)
    finally:
        # Remove the temporary download so repeated runs do not leave files behind.
        urlr.urlcleanup()

In [4]:
# Download and parse the full assembly summary (network access required).
microbes = get_ref_microbe_taxids()

In [5]:
# Keep only the rows whose refseq_category is exactly 'reference genome'.
wikigenomes = microbes.loc[microbes['refseq_category'].eq('reference genome')]

In [23]:
# Restrict to genomes whose submitter string mentions "DOE".
# na=False treats rows with a missing submitter as non-matches; without it the mask
# contains NaN and boolean indexing raises. copy() gives an independent frame so that
# columns can be added to it later without touching a slice of `microbes`.
wikigenomes = wikigenomes[wikigenomes['submitter'].str.contains("DOE", na=False)].copy()

In [9]:
def execute_query(query):
    """
    Run a SPARQL query against the Wikidata query service endpoint.

    :param query: SPARQL query text
    :return: the endpoint's response, requested as JSON and converted by SPARQLWrapper
    """
    wdqs = SPARQLWrapper('https://query.wikidata.org/sparql')
    wdqs.setReturnFormat(JSON)
    wdqs.setQuery(query)
    response = wdqs.query()
    return response.convert()

In [20]:
def query_for_manual_assertion(taxid):
    """
    Count the Wikidata result rows for manually asserted annotations on proteins of a taxon.

    The query selects proteins linked (P703) to the taxon whose P685 value equals
    ``taxid``, keeps the P681 statements whose determination-method qualifier (P459)
    is an instance of Q28955254, and requires a "stated in" (P248) and "retrieved" (P813)
    reference on each statement.

    :param taxid: value substituted into the query as the taxon's P685 identifier
    :return: number of distinct result rows (bindings) returned by the query
    """
    # NOTE(review): the variables are named go_bp, but P681 may be the GO
    # "cell component" property rather than "biological process" - confirm the intent.
    sparql = '''PREFIX prov: <http://www.w3.org/ns/prov#>
    PREFIX p: <http://www.wikidata.org/prop/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX pq: <http://www.wikidata.org/prop/qualifier/>

    SELECT distinct ?protein ?go_bp ?go_bpLabel ?determination ?determinationLabel ?reference_stated_inLabel ?reference_retrieved WHERE {
      ?taxon wdt:P685 '%s'.  
      ?protein wdt:P703 ?taxon;
               wdt:P681 ?go_bp ;
               p:P681 ?go_bp_statement .
      ?go_bp_statement pq:P459 ?determination .
      ?determination wdt:P31 wd:Q28955254.  # check for manual assertion 
      ?go_bp_statement prov:wasDerivedFrom/pr:P248 ?reference_stated_in . #where stated
      ?go_bp_statement prov:wasDerivedFrom/pr:P813 ?reference_retrieved . #when retrieved
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language \"en\" .
      }
    }''' % taxid
    # Only the number of rows matters to callers, not their contents.
    bindings = execute_query(query=sparql)['results']['bindings']
    return len(bindings)

In [24]:
# Run one SPARQL query per taxid and store the resulting row count per genome.
# The source frame is `wikigenomes` itself (the previously referenced `wikigenomes_JGI`
# is not defined anywhere in this notebook). assign() returns a new frame instead of
# writing a column into a filtered slice.
wikigenomes = wikigenomes.assign(
    manual_assertions=wikigenomes['taxid'].apply(query_for_manual_assertion)
)

In [26]:
# Show the identifying columns next to the manual assertion counts.
wikigenomes.loc[:, ['organism_name', 'taxid', 'submitter', 'seq_rel_date', 'manual_assertions']]

Unnamed: 0,organism_name,taxid,submitter,seq_rel_date,manual_assertions
6360,Pseudomonas syringae pv. syringae B728a,205918,DOE Joint Genome Institute,2005/05/12,2
26669,Rhodobacter sphaeroides 2.4.1,272943,DOE Joint Genome Institute,2013/10/23,19
26700,Rhodospirillum rubrum ATCC 11170,269796,DOE Joint Genome Institute,2005/12/12,0
26711,Chloroflexus aurantiacus J-10-fl,324602,US DOE Joint Genome Institute,2007/12/20,795
48282,Bacillus anthracis str. Sterne,260799,DOE Joint Genome Institute,2004/06/24,0
49229,[Bacillus thuringiensis] serovar konkukian str...,281309,DOE Joint Genome Institute,2004/12/27,2
50800,Moorella thermoacetica ATCC 39073,264732,DOE Joint Genome Institute,2005/12/12,0
51271,Lactobacillus paracasei ATCC 334,321967,"US DOE Joint Genome Institute (JGI), The Lacti...",2006/10/13,4
71723,Mesorhizobium ciceri biovar biserrulae WSM1271,765698,US DOE Joint Genome Institute,2011/01/07,0
72101,Brachybacterium faecium DSM 4810,446465,US DOE Joint Genome Institute (JGI-PGF),2009/08/25,0
