In [14]:
import requests
import urllib.parse


endpoint = 'http://localhost:7777/sparql?'

parts_query = '''
PREFIX sbol2: <http://sbols.org/v2#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX ncbi: <http://www.ncbi.nlm.nih.gov#>
PREFIX synbiohub: <http://synbiohub.org#>
PREFIX sbh: <http://wiki.synbiohub.org/wiki/Terms/synbiohub#>
PREFIX igem: <http://wiki.synbiohub.org/wiki/Terms/igem#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
    ?subject
    ?displayId
    ?version
    ?name
    ?description
    ?type
WHERE {
    ?subject a sbol2:ComponentDefinition .
    ?subject a ?type .
    ?subject sbh:topLevel ?subject
    OPTIONAL { ?subject sbol2:displayId ?displayId . }
    OPTIONAL { ?subject sbol2:version ?version . }
    OPTIONAL { ?subject dcterms:title ?name . }
    OPTIONAL { ?subject dcterms:description ?description . }
} 
'''

url = endpoint + urllib.parse.urlencode({'query': parts_query})
print(url)
r = requests.get(url)

print(r.status_code)
print(r.encoding)
print(r.headers['content-type'])
print(r.content[0:1000])

f = open('parts_response.xml', 'w')
f.write(r.content.decode('utf8'))

http://localhost:7777/sparql?query=%0APREFIX+sbol2%3A+%3Chttp%3A%2F%2Fsbols.org%2Fv2%23%3E%0APREFIX+dcterms%3A+%3Chttp%3A%2F%2Fpurl.org%2Fdc%2Fterms%2F%3E%0APREFIX+ncbi%3A+%3Chttp%3A%2F%2Fwww.ncbi.nlm.nih.gov%23%3E%0APREFIX+synbiohub%3A+%3Chttp%3A%2F%2Fsynbiohub.org%23%3E%0APREFIX+sbh%3A+%3Chttp%3A%2F%2Fwiki.synbiohub.org%2Fwiki%2FTerms%2Fsynbiohub%23%3E%0APREFIX+igem%3A+%3Chttp%3A%2F%2Fwiki.synbiohub.org%2Fwiki%2FTerms%2Figem%23%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0A%0ASELECT+DISTINCT%0A++++%3Fsubject%0A++++%3FdisplayId%0A++++%3Fversion%0A++++%3Fname%0A++++%3Fdescription%0A++++%3Ftype%0AWHERE+%7B%0A++++%3Fsubject+a+sbol2%3AComponentDefinition+.%0A++++%3Fsubject+a+%3Ftype+.%0A++++%3Fsubject+sbh%3AtopLevel+%3Fsubject%0A++++OPTIONAL+%7B+%3Fsubject+sbol2%3AdisplayId+%3FdisplayId+.+%7D%0A++++OPTIONAL+%7B+%3Fsubject+sbol2%3Aversion+%3Fversion+.+%7D%0A++++OPTIONAL+%7B+%3Fsubject+dcterms%3Atitle+%3Fname+.+%7D%0A++++OPTIONAL+%7B+%3Fsubject+dcterms%3Adescripti

18794885

In [15]:
f = open('parts_response.xml', 'r')
parts_response = f.read()

In [16]:
from xml.etree import ElementTree


def create_parts(xml_content):
    parts = []
    
    ns = {'sparql_results': 'http://www.w3.org/2005/sparql-results#'}
    
    root = ElementTree.fromstring(xml_content)
    results = root.find('sparql_results:results', ns)

    for result in results.findall('sparql_results:result', ns):
        bindings = result.findall('sparql_results:binding', ns)

        subject = 'no subject'
        for binding in bindings:
            if binding.attrib['name'] == 'subject':
                subject = binding.find('sparql_results:uri', ns).text

        display_id = 'no displayId'
        for binding in bindings:
            if binding.attrib['name'] == 'displayId':
                display_id = binding.find('sparql_results:literal', ns).text
                
        version = 'no version'
        for binding in bindings:
            if binding.attrib['name'] == 'version':
                version = binding.find('sparql_results:literal', ns).text
        
        name = 'no name'
        for binding in bindings:
            if binding.attrib['name'] == 'name':
                name = binding.find('sparql_results:literal', ns).text
        
        description = 'no description'
        for binding in bindings:
            if binding.attrib['name'] == 'description':
                description = binding.find('sparql_results:literal', ns).text
                
        _type = 'no type'
        for binding in bindings:
            if binding.attrib['name'] == 'type':
                _type = binding.find('sparql_results:uri', ns).text

        part = {
            'subject': subject,
            'displayId': display_id,
            'version': version,
            'name': name,
            'description': description,
            'type': _type
        }
        parts.append(part)
    
    return parts

parts = create_parts(parts_response)
print('number of parts: ' + str(len(parts))) # TODO only fetching first 50000 parts

number of parts: 38371


In [17]:
# ssh tunnel from local jupyter notebook to server running elasticsearch
# ssh -N -L 9200:localhost:9200 MichaelZhang@tang.ece.utah.edu

from elasticsearch import Elasticsearch
from elasticsearch import ElasticsearchException
from elasticsearch import helpers

es = Elasticsearch(['http://localhost:9200/'], verify_certs=True)

if not es.ping():
    raise ValueError('Connection failed')
    
index_name = 'part'
try:
    es.indices.create(index=index_name)
except ElasticsearchException as error:
    print(error)
    print('deleting and recreating')
    es.indices.delete(index=index_name)
    print('deleted')
    es.indices.create(index=index_name)
    print('recreated')


actions = []
for i in range(len(parts)):
    action = {
        '_index': index_name,
        '_type': 'part',
        '_id': i,
        '_source': parts[i]
    }
    
    actions.append(action)

print('indexing')
stats = helpers.bulk(es, actions)
if len(stats[1]) == 0:
    print('no errors!')
else:
    print('error_messages: ' + '\n'.join(stats[1]))

PUT http://localhost:9200/part [status:400 request:0.003s]


TransportError(400, 'resource_already_exists_exception', 'index [part/n_5J-cQORzmqrJ8wZTmNWw] already exists')
deleting and recreating
deleted
recreated
indexing
no errors!


In [18]:
from elasticsearch_dsl import Search

query = 'test'

s = Search(using=es, index=index_name).query('multi_match', query=query, fields=['subject', 'displayId', 'version', 'name', 'description', 'type'])
print(s.to_dict())
response = s.execute()

for hit in response:
    print(hit.meta.score, hit.subject)

{'query': {'multi_match': {'query': 'test', 'fields': ['subject', 'displayId', 'version', 'name', 'description', 'type']}}}
8.687418 https://synbiohub.org/public/igem/BBa_K314900/1
7.8440104 https://synbiohub.org/public/igem/BBa_K1479015/1
7.8090644 https://synbiohub.org/public/igem/BBa_K294210/1
7.8090644 https://synbiohub.org/public/igem/BBa_K1479017/1
6.788761 https://synbiohub.org/public/igem/BBa_K1611999/1
6.1728296 https://synbiohub.org/public/igem/BBa_K121998/1
6.1728296 https://synbiohub.org/public/igem/BBa_K194999/1
6.1728296 https://synbiohub.org/public/igem/BBa_K188902/1
6.1728296 https://synbiohub.org/public/igem/BBa_K1073036/1
6.1728296 https://synbiohub.org/public/igem/BBa_K1442684/1
