Skip to content

Commit

Permalink
Merge pull request #46 from TranslatorSRI/prototype
Browse files Browse the repository at this point in the history
Consumes the synonym files created by the synonym_prototype branch of Babel.

The input structure is now quite different.  Rather than a single document per (curie,label) pair, we now make one doc per curie.  We remove the id and length parameters, and add in biolink types and a preferred name.

This has an extra benefit of making the lookup code simpler, and reducing 3 solr calls to 1.

Fixes #39, fixes #43.
Fixes #24 by returning a list sorted by Solr relevance scores instead of a dictionary.
Fixes #32 by adding types and canonical information to NameRes.
  • Loading branch information
gaurav committed Jun 8, 2023
2 parents 4af3db7 + dafb3c7 commit f853638
Show file tree
Hide file tree
Showing 7 changed files with 1,941 additions and 3,642 deletions.
85 changes: 28 additions & 57 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,22 @@ async def lookup_names(
for curie in request.curies
}
for doc in response_json["response"]["docs"]:
output[doc["curie"]].append(doc["name"])
output[doc["curie"]].extend(doc["names"])
return output

# Response model for one match returned by the /lookup endpoint, which
# declares its response as List[LookupResult].
class LookupResult(BaseModel):
# CURIE of the matched Solr document (taken from its "curie" field).
curie:str
# Preferred name of the match (taken from the document's "preferred_name" field).
label: str
# Every name stored for the match (taken from the document's "names" field).
synonyms: List[str]
# Types of the match; the endpoint prefixes each stored type with "biolink:".
types: List[str]

@app.post("/lookup", response_model=Dict[str, List[str]], tags=["lookup"])
@app.post("/lookup", response_model=List[LookupResult], tags=["lookup"])
async def lookup_curies(
string: str,
offset: int = 0,
limit: conint(le=1000) = 10,
) -> Dict[str, List[str]]:
biolink_type: str = None
) -> List[LookupResult]:
"""Look up curies from name or fragment."""
# This original code tokenizes on spaces, and then removes all other punctuation,
# so x-linked becomes xlinked and beta-secretase becomes betasecretase.
Expand All @@ -95,70 +101,35 @@ async def lookup_curies(
# for fragment in fragments
#)
fragments = re.split(not_alpha,string)
name_filters = " AND ".join(
f"name:{fragment}*"
filters = [
# Boost the preferred name by a factor of 10.
# Using names:{fragment}* causes Solr to prioritize some odd results;
# using names:{fragment} OR names:{fragment}* should cause it to still
# include those results while prioritizing complete fragments.
f"(preferred_name:{fragment}^10 OR names:{fragment} OR names:{fragment}*)"
for fragment in fragments if len(fragment) > 0
)
]
if biolink_type:
if biolink_type.startswith('biolink:'):
biolink_type = biolink_type[8:]
filters.append( f"types:{biolink_type}" )
query_filters = " AND ".join(filters)
query = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select"
params = {
"query": name_filters,
"limit": 0,
"sort": "length ASC",
"facet": {
"categories": {
"type": "terms",
"field": "curie",
"sort": "x asc",
"offset": offset,
"limit": limit,
"facet": {
"x": "min(length)",
},
"numBuckets": True,
}
}
"query": query_filters,
"limit": limit,
"offset": offset,
"fields": "curie,names,preferred_name,types",
}
async with httpx.AsyncClient(timeout=None) as client:
response = await client.post(query, json=params)
if response.status_code >= 300:
LOGGER.error("Solr REST error: %s", response.text)
response.raise_for_status()
response = response.json()
if not response["response"]["numFound"]:
return dict()
buckets = response["facets"]["categories"]["buckets"]

curie_filter = " OR ".join(
f"curie:\"{bucket['val']}\""
for bucket in buckets
)
params = {
"query": f"({curie_filter}) AND ({name_filters})",
"limit": 1000000,
"sort": "length ASC",
"fields": "curie,name",
}
async with httpx.AsyncClient(timeout=None) as client:
response = await client.post(query, json=params)
if response.status_code >= 300:
LOGGER.error("Solr REST error: %s", response.text)
response.raise_for_status()
output = defaultdict(list)
for doc in response.json()["response"]["docs"]:
output[doc["curie"]].append(doc["name"])
params = {
"query": f"({curie_filter}) AND NOT ({name_filters})",
"limit": 1000000,
"sort": "length ASC",
"fields": "curie,name",
}
async with httpx.AsyncClient(timeout=None) as client:
response = await client.post(query, json=params)
if response.status_code >= 300:
LOGGER.error("Solr REST error: %s", response.text)
response.raise_for_status()
for doc in response.json()["response"]["docs"]:
output[doc["curie"]].append(doc["name"])
output = [ {"curie": doc["curie"], "label":doc["preferred_name"], "synonyms": doc["names"],
"types": [f"biolink:{d}" for d in doc["types"]]}
for doc in response["response"]["docs"]]
return output

# Override open api schema with custom schema
Expand Down
71 changes: 0 additions & 71 deletions data-loading/setup.sh

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
requestsfastapihttpxuvicornpyyaml
requestsfastapihttpxuvicornpyyamljsonlines
Expand Down
17 changes: 12 additions & 5 deletions setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,27 +44,34 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-field": [
{
"name":"name",
"name":"names",
"type":"LowerTextField",
"stored":true,
"multiValued":true
},
{
"name":"curie",
"type":"string",
"stored":true
},
{
"name":"length",
"type":"plong",
"name":"preferred_name",
"type":"string",
"stored":true
},
{
"name":"curie",
"name":"types",
"type":"string",
"stored":true
"multiValued":true
}
] }' 'http://localhost:8983/solr/name_lookup/schema'

# add data
for f in $1; do
echo "Loading $f..."
curl -X POST -H 'Content-Type: application/json' -d @$f \
'http://localhost:8983/solr/name_lookup/update?processor=uuid&uuid.fieldName=id&commit=true'
'http://localhost:8983/solr/name_lookup/update/json/docs?processor=uuid&uuid.fieldName=id&commit=true'
done
echo "Check solr"
curl -s --negotiate -u: 'localhost:8983/solr/name_lookup/query?q=*:*&rows=0'
Expand Down
Loading

0 comments on commit f853638

Please sign in to comment.