Skip to content

Commit

Permalink
Fix search index diacritics handling
Browse files: browse the repository at this point in the history
Closes #210: the Elasticsearch asciifolding filter doesn't correctly handle multiple diacritics
  • Loading branch information
bkis committed May 2, 2024
1 parent 8998f31 commit 3a1150f
Show file tree
Hide file tree
Showing 13 changed files with 44 additions and 42 deletions.
2 changes: 1 addition & 1 deletion Tekst-API/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ RUN groupadd -g 1337 tekst && \
useradd -m -u 1337 -g tekst tekst

# copy WSGI config
COPY ./deploy/gunicorn_conf.py ./
COPY ./deploy/gunicorn/gunicorn_conf.py ./
COPY ./deploy/entrypoint.sh /

USER tekst
Expand Down
2 changes: 2 additions & 0 deletions Tekst-API/deploy/elasticsearch/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
FROM elasticsearch:8.12.2
RUN elasticsearch-plugin install analysis-icu
File renamed without changes.
5 changes: 4 additions & 1 deletion Tekst-API/dev/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ services:
- dev

es:
image: elasticsearch:8.12.2
build:
context: ../deploy/elasticsearch
tags:
- "elasticsearch:tekst"
deploy:
resources:
limits:
Expand Down
2 changes: 1 addition & 1 deletion Tekst-API/tekst/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def index_doc_props(cls) -> dict[str, Any]:
return dict(
comment={
"type": "text",
"analyzer": "standard_asciifolding",
"analyzer": "standard_no_diacritics",
"fields": {"strict": {"type": "text"}},
},
**cls.rtype_index_doc_props(),
Expand Down
2 changes: 1 addition & 1 deletion Tekst-API/tekst/resources/plain_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def rtype_index_doc_props(cls) -> dict[str, Any]:
return {
"text": {
"type": "text",
"analyzer": "standard_asciifolding",
"analyzer": "standard_no_diacritics",
"fields": {"strict": {"type": "text"}},
},
}
Expand Down
2 changes: 1 addition & 1 deletion Tekst-API/tekst/resources/rich_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def rtype_index_doc_props(cls) -> dict[str, Any]:
return {
"html": {
"type": "text",
"analyzer": "standard_asciifolding",
"analyzer": "standard_no_diacritics",
"fields": {"strict": {"type": "text"}},
},
}
Expand Down
2 changes: 1 addition & 1 deletion Tekst-API/tekst/resources/text_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def rtype_index_doc_props(cls) -> dict[str, Any]:
"properties": {
"token": {
"type": "keyword",
"normalizer": "asciifolding_normalizer",
"normalizer": "no_diacritics_normalizer",
"fields": {"strict": {"type": "keyword"}},
}
},
Expand Down
54 changes: 24 additions & 30 deletions Tekst-API/tekst/search/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,29 @@

IDX_TEMPLATE = {
"aliases": {IDX_ALIAS: {}},
"settings": {
"index": {"number_of_shards": 1, "number_of_replicas": 0},
"analysis": {
"analyzer": {
"standard_no_diacritics": {
"tokenizer": "standard",
"filter": ["no_diacritics", "lowercase"],
},
},
"filter": {
"no_diacritics": {
"type": "icu_transform",
"id": "NFD; [:Nonspacing Mark:] Remove; NFC",
}
},
"normalizer": {
"no_diacritics_normalizer": {
"type": "custom",
"filter": ["no_diacritics", "lowercase"],
},
},
},
},
"mappings": {
"dynamic": False,
"properties": {
Expand All @@ -28,42 +51,13 @@
"path_match": "*.annotations.*",
"mapping": {
"type": "keyword",
"normalizer": "asciifolding_normalizer",
"normalizer": "no_diacritics_normalizer",
"fields": {"strict": {"type": "keyword"}},
},
}
}
],
},
"settings": {
"index": {"number_of_shards": 1, "number_of_replicas": 0},
"analysis": {
"analyzer": {
"standard_asciifolding": {
"tokenizer": "standard",
"filter": ["asciifolding", "lowercase"],
},
},
"filter": {
"asciifolding_preserve": {
"type": "asciifolding",
"preserve_original": True,
}
},
"normalizer": {
"asciifolding_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["asciifolding", "lowercase"],
},
"asciifolding_normalizer_preserve_case": {
"type": "custom",
"char_filter": [],
"filter": ["asciifolding"],
},
},
},
},
}

_GENERAL_SOURCE_INCLUDES = [
Expand Down
4 changes: 2 additions & 2 deletions Tekst-API/tekst/utils/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def safe_name(
if isinstance(string, bytes):
string = string.decode()

string = remove_diacritics(string)
string = no_diacritics(string)

# lowercase and delimit using underscores
string = re.sub(r"[^a-z0-9]+", delim, string.lower()).strip(delim)
Expand All @@ -33,7 +33,7 @@ def safe_name(
return string


def remove_diacritics(string: str) -> str:
def no_diacritics(string: str) -> str:
"""Removes diacritics from the input string and returns it NFC-normalized"""
return "".join(
ucdata.normalize("NFC", c)
Expand Down
4 changes: 2 additions & 2 deletions Tekst-API/tests/test_utils_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ def test_safe_name():
assert strings.safe_name("foo bar", delim="+") == "foo+bar"


def test_remove_diacritics():
assert strings.remove_diacritics("Hörglwàartŝ") == "Horglwaarts"
def test_no_diacritics():
assert strings.no_diacritics("Hörglwàartŝ") == "Horglwaarts"
2 changes: 1 addition & 1 deletion Tekst-Web/src/views/ResourcesView.vue
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ async function handleImportClick(resource: AnyResourceRead) {
});
if (!error) {
addTask(data);
message.info($t('contents.msgImportInfo'), undefined, 20);
message.info($t('contents.msgImportInfo'), undefined, 5);
startTasksPolling();
}
actionsLoading.value = false;
Expand Down
5 changes: 4 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@ services:
command: mongod --setParameter diagnosticDataCollectionEnabled=false

es:
image: elasticsearch:8.12.2
build:
context: ./Tekst-API/deploy/elasticsearch
tags:
- "elasticsearch:tekst"
deploy:
resources:
limits:
Expand Down

0 comments on commit 3a1150f

Please sign in to comment.