Merge pull request #46 from TranslatorSRI/prototype

Consumes the synonym files created by the synonym_prototype branch of Babel. The input structure is now quite different. Rather than a single document per (curie,label) pair, we now make one doc per curie. We remove the id and length parameters, and add in biolink types and a preferred name. This has an extra benefit of making the lookup code simpler, and reducing 3 solr calls to 1. Fixes #39, fixes #43. Fixes #24 by returning a list sorted by Solr relevance scores instead of a dictionary. Fixes #32 by adding types and canonical information to NameRes.
TranslatorSRI · Jun 8, 2023 · f853638 · f853638
2 parents 4af3db7 + dafb3c7
commit f853638
Show file tree

Hide file tree

Showing 7 changed files with 1,941 additions and 3,642 deletions.
diff --git a/api/server.py b/api/server.py
@@ -72,16 +72,22 @@ async def lookup_names(
         for curie in request.curies
     }
     for doc in response_json["response"]["docs"]:
-        output[doc["curie"]].append(doc["name"])
+        output[doc["curie"]].extend(doc["names"])
     return output
 
+class LookupResult(BaseModel):
+    curie:str
+    label: str
+    synonyms: List[str]
+    types: List[str]
 
-@app.post("/lookup", response_model=Dict[str, List[str]], tags=["lookup"])
+@app.post("/lookup", response_model=List[LookupResult], tags=["lookup"])
 async def lookup_curies(
         string: str,
         offset: int = 0,
         limit: conint(le=1000) = 10,
-) -> Dict[str, List[str]]:
+        biolink_type: str = None
+) -> List[LookupResult]:
     """Look up curies from name or fragment."""
     #This original code tokenizes on spaces, and then removes all other punctuation.
     # so x-linked becomes xlinked and beta-secretase becomes betasecretase.
@@ -95,70 +101,35 @@ async def lookup_curies(
     #    for fragment in fragments
     #)
     fragments = re.split(not_alpha,string)
-    name_filters = " AND ".join(
-        f"name:{fragment}*"
+    filters = [
+        # Boost the preferred name by a factor of 10.
+        # Using names:{fragment}* causes Solr to prioritize some odd results;
+        # using names:{fragment} OR names:{fragment}* should cause it to still
+        # include those results while prioritizing complete fragments.
+        f"(preferred_name:{fragment}^10 OR names:{fragment} OR names:{fragment}*)"
         for fragment in fragments if len(fragment) > 0
-    )
+    ]
+    if biolink_type:
+        if biolink_type.startswith('biolink:'):
+            biolink_type = biolink_type[8:]
+        filters.append( f"types:{biolink_type}" )
+    query_filters = " AND ".join(filters)
     query = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select"
     params = {
-        "query": name_filters,
-        "limit": 0,
-        "sort": "length ASC",
-        "facet": {
-            "categories": {
-                "type": "terms",
-                "field": "curie",
-                "sort": "x asc",
-                "offset": offset,
-                "limit": limit,
-                "facet": {
-                    "x": "min(length)",
-                },
-                "numBuckets": True,
-            }
-        }
+        "query": query_filters,
+        "limit": limit,
+        "offset": offset,
+        "fields": "curie,names,preferred_name,types",
     }
     async with httpx.AsyncClient(timeout=None) as client:
         response = await client.post(query, json=params)
     if response.status_code >= 300:
         LOGGER.error("Solr REST error: %s", response.text)
         response.raise_for_status()
     response = response.json()
-    if not response["response"]["numFound"]:
-        return dict()
-    buckets = response["facets"]["categories"]["buckets"]
-
-    curie_filter = " OR ".join(
-        f"curie:\"{bucket['val']}\""
-        for bucket in buckets
-    )
-    params = {
-        "query": f"({curie_filter}) AND ({name_filters})",
-        "limit": 1000000,
-        "sort": "length ASC",
-        "fields": "curie,name",
-    }
-    async with httpx.AsyncClient(timeout=None) as client:
-        response = await client.post(query, json=params)
-    if response.status_code >= 300:
-        LOGGER.error("Solr REST error: %s", response.text)
-        response.raise_for_status()
-    output = defaultdict(list)
-    for doc in response.json()["response"]["docs"]:
-        output[doc["curie"]].append(doc["name"])
-    params = {
-        "query": f"({curie_filter}) AND NOT ({name_filters})",
-        "limit": 1000000,
-        "sort": "length ASC",
-        "fields": "curie,name",
-    }
-    async with httpx.AsyncClient(timeout=None) as client:
-        response = await client.post(query, json=params)
-    if response.status_code >= 300:
-        LOGGER.error("Solr REST error: %s", response.text)
-        response.raise_for_status()
-    for doc in response.json()["response"]["docs"]:
-        output[doc["curie"]].append(doc["name"])
+    output = [ {"curie": doc["curie"], "label":doc["preferred_name"], "synonyms": doc["names"],
+                "types": [f"biolink:{d}" for d in doc["types"]]}
+               for doc in response["response"]["docs"]]
     return output
 
 # Override open api schema with custom schema

diff --git a/data-loading/setup.sh b/data-loading/setup.sh
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1 @@
-requestsfastapihttpxuvicornpyyaml
+requestsfastapihttpxuvicornpyyamljsonlines

diff --git a/setup.sh b/setup.sh
@@ -44,27 +44,34 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
 curl -X POST -H 'Content-type:application/json' --data-binary '{
     "add-field": [
         {
-            "name":"name",
+            "name":"names",
             "type":"LowerTextField",
+            "stored":true,
+            "multiValued":true
+        },
+        {
+            "name":"curie",
+            "type":"string",
             "stored":true
         },
         {
-            "name":"length",
-            "type":"plong",
+            "name":"preferred_name",
+            "type":"string",
             "stored":true
         },
         {
-            "name":"curie",
+            "name":"types",
             "type":"string",
-            "stored":true
+            "stored":true,
+            "multiValued":true
         }
     ] }' 'http://localhost:8983/solr/name_lookup/schema'
 
 # add data
 for f in $1; do
 echo "Loading $f..."
 curl -X POST -H 'Content-Type: application/json' -d @$f \
-    'http://localhost:8983/solr/name_lookup/update?processor=uuid&uuid.fieldName=id&commit=true'
+    'http://localhost:8983/solr/name_lookup/update/json/docs?processor=uuid&uuid.fieldName=id&commit=true'
 done
 echo "Check solr"
 curl -s --negotiate -u: 'localhost:8983/solr/name_lookup/query?q=*:*&rows=0'