Merge pull request #431 from UCL/feature/search-diacriticals

Feature/search diacriticals
UCL · Mar 26, 2024 · 6c1c399 · 6c1c399
2 parents 786eb53 + 3e4e3b0
commit 6c1c399
Show file tree

Hide file tree

Showing 3 changed files with 137 additions and 1 deletion.
diff --git a/src/rard/research/migrations/0070_updating_plain_renditions.py b/src/rard/research/migrations/0070_updating_plain_renditions.py
@@ -0,0 +1,98 @@
+# Generated by Django 3.2 on 2024-03-08 12:27
+
+import re
+import string
+import unicodedata
+from django.db import migrations
+from django.utils.html import strip_tags
+
+
+def strip_combining_and_make_plain(content):
+    """Return ``content`` as plain text with diacritics removed.
+
+    The text is decomposed (NFD) and its combining marks are dropped
+    *before* it is handed to ``make_plain_text``. Some characters decompose
+    to ASCII punctuation (e.g. U+037E GREEK QUESTION MARK becomes ";"), so
+    stripping marks first lets the punctuation filter in ``make_plain_text``
+    remove those too. This is the same order the updated
+    ``rard.utils.text_processors.make_plain_text`` uses, so migrated rows
+    match rows saved after this change.
+    """
+    decomposed = unicodedata.normalize("NFD", content)
+    base_chars = "".join(
+        char for char in decomposed if not unicodedata.combining(char)
+    )
+    return make_plain_text(base_chars)
+
+
+def make_plain_text(content):
+    """Reduce HTML ``content`` to plain text for searching.
+
+    Removes U+FEFF characters, HTML tags, ``&name;``-style entities, ASCII
+    punctuation and standalone one- or two-digit numbers, then collapses
+    runs of spaces into a single space.
+    """
+    # U+FEFF has been found around mentions; drop it before anything else.
+    text = content.replace("\ufeff", "")
+    # Separate adjacent tags so the words either side are not merged.
+    text = strip_tags(text.replace("><", "> <"))
+    text = re.sub(r"&\w+;", " ", text)
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    # Lone short numbers surrounded by whitespace come from mentions.
+    text = re.sub(r"\s\d{1,2}\s", " ", text)
+    return re.sub(r" +", " ", text)
+
+
+def update_antiquarians(apps, schema_editor, plaining_function):
+    """Recompute ``plain_introduction`` for each Antiquarian that has one.
+
+    ``plaining_function`` is applied to the content of the antiquarian's
+    introduction. Rows whose ``plain_introduction`` is falsy are skipped.
+    ``schema_editor`` is not used.
+    """
+    antiquarian_model = apps.get_model("research", "Antiquarian")
+    for antiquarian in antiquarian_model.objects.all():
+        if not antiquarian.plain_introduction:
+            continue
+        antiquarian.plain_introduction = plaining_function(
+            antiquarian.introduction.content
+        )
+        antiquarian.save()
+
+
+def update_works(apps, schema_editor, plaining_function):
+    """Recompute ``plain_introduction`` for each Work that has one.
+
+    ``plaining_function`` is applied to the content of the work's
+    introduction. Rows whose ``plain_introduction`` is falsy are skipped.
+    ``schema_editor`` is not used.
+    """
+    work_model = apps.get_model("research", "Work")
+    for work in work_model.objects.all():
+        if not work.plain_introduction:
+            continue
+        work.plain_introduction = plaining_function(work.introduction.content)
+        work.save()
+
+
+def update_fragments(apps, schema_editor, plaining_function):
+    """Recompute ``plain_commentary`` for each Fragment that has one.
+
+    ``plaining_function`` is applied to the content of the fragment's
+    commentary. Rows whose ``plain_commentary`` is falsy are skipped.
+    ``schema_editor`` is not used.
+    """
+    fragment_model = apps.get_model("research", "Fragment")
+    for fragment in fragment_model.objects.all():
+        if not fragment.plain_commentary:
+            continue
+        fragment.plain_commentary = plaining_function(
+            fragment.commentary.content
+        )
+        fragment.save()
+
+
+def update_anonymous_fragments(apps, schema_editor, plaining_function):
+    """Recompute ``plain_commentary`` for each AnonymousFragment that has one.
+
+    ``plaining_function`` is applied to the content of the fragment's
+    commentary. Rows whose ``plain_commentary`` is falsy are skipped.
+    ``schema_editor`` is not used.
+    """
+    anonymous_fragment_model = apps.get_model("research", "AnonymousFragment")
+    for anonymous_fragment in anonymous_fragment_model.objects.all():
+        if not anonymous_fragment.plain_commentary:
+            continue
+        anonymous_fragment.plain_commentary = plaining_function(
+            anonymous_fragment.commentary.content
+        )
+        anonymous_fragment.save()
+
+
+def update_testimonia(apps, schema_editor, plaining_function):
+    """Recompute ``plain_commentary`` for each Testimonium that has one.
+
+    ``plaining_function`` is applied to the content of the testimonium's
+    commentary. Rows whose ``plain_commentary`` is falsy are skipped.
+    ``schema_editor`` is not used.
+    """
+    testimonium_model = apps.get_model("research", "Testimonium")
+    for testimonium in testimonium_model.objects.all():
+        if not testimonium.plain_commentary:
+            continue
+        testimonium.plain_commentary = plaining_function(
+            testimonium.commentary.content
+        )
+        testimonium.save()
+
+
+def update_original_texts(apps, schema_editor, plaining_function):
+    """Recompute the plain renditions stored on each OriginalText.
+
+    ``plain_content`` and ``plain_translated_text`` are each regenerated
+    with ``plaining_function`` when they are currently truthy. Every row is
+    saved, whether or not either field was regenerated. ``schema_editor``
+    is not used.
+
+    NOTE(review): neither ``update_plain_texts`` nor ``revert_plain_texts``
+    in this migration calls this function - confirm that is intentional.
+    NOTE(review): the search test added in this change sets
+    ``translated_text`` on ``Translation`` - confirm OriginalText really
+    has ``translated_text`` / ``plain_translated_text`` before wiring
+    this in.
+    """
+    original_text_model = apps.get_model("research", "OriginalText")
+    for original_text in original_text_model.objects.all():
+        if original_text.plain_content:
+            original_text.plain_content = plaining_function(
+                original_text.content
+            )
+        if original_text.plain_translated_text:
+            original_text.plain_translated_text = plaining_function(
+                original_text.translated_text
+            )
+        original_text.save()
+
+
+def update_plain_texts(apps, schema_editor):
+    """Forward step: rebuild plain renditions with combining marks stripped.
+
+    Runs each per-model updater in turn with
+    ``strip_combining_and_make_plain`` as the plaining function.
+
+    NOTE(review): ``update_original_texts`` is defined in this module but is
+    not part of this list - confirm whether it was left out on purpose.
+    """
+    updaters = (
+        update_antiquarians,
+        update_works,
+        update_fragments,
+        update_anonymous_fragments,
+        update_testimonia,
+    )
+    for updater in updaters:
+        updater(apps, schema_editor, strip_combining_and_make_plain)
+
+
+def revert_plain_texts(apps, schema_editor):
+    """Reverse step: rebuild plain renditions without stripping diacritics.
+
+    Runs each per-model updater in turn with ``make_plain_text`` as the
+    plaining function.
+
+    NOTE(review): ``update_original_texts`` is defined in this module but is
+    not part of this list - confirm whether it was left out on purpose.
+    """
+    updaters = (
+        update_antiquarians,
+        update_works,
+        update_fragments,
+        update_anonymous_fragments,
+        update_testimonia,
+    )
+    for updater in updaters:
+        updater(apps, schema_editor, make_plain_text)
+
+
+class Migration(migrations.Migration):
+    """Regenerate stored plain-text renditions with combining marks removed.
+
+    Reversing the migration regenerates them with ``make_plain_text`` alone.
+    """
+
+    dependencies = [("research", "0069_anonymousfragment_anonymous_fragments")]
+
+    operations = [
+        migrations.RunPython(
+            code=update_plain_texts,
+            reverse_code=revert_plain_texts,
+        ),
+    ]
diff --git a/src/rard/research/tests/views/test_search.py b/src/rard/research/tests/views/test_search.py
@@ -548,3 +548,31 @@ def do_search(search_function, keywords):
         self.assertEqual(do_search(view.antiquarian_search, "interesting"), [a11])
         self.assertEqual(do_search(view.work_search, "interesting"), [w11])
         self.assertEqual(do_search(view.book_search, "interesting"), [b11])
+
+    def test_unicode_stripped(self):
+        """Searching with unaccented keywords finds content stored with diacritics."""
+
+        def do_search(search_function, keywords):
+            # Materialise the search results so they can be compared to a list.
+            return list(search_function(SearchView.Term(keywords)))
+
+        view = SearchView()
+        cw = CitingWork.objects.create(title="citing_work")
+        # Antiquarian whose introduction is "antique" written with accents.
+        a12 = Antiquarian.objects.create(name="boring person", re_code="11")
+        a12.introduction.content = "ÂÑtĩqūě"
+
+        # Fragment whose commentary mixes accented Latin and Greek letters.
+        f3 = Fragment.objects.create()
+        f3.commentary = TextObjectField.objects.create(content="Ĺĭōṇęḷ ώὰs Ᾰ gΌῸd ὲgg")
+
+        # Original text owned by the fragment, again with accented content.
+        ot3 = OriginalText.objects.create(
+            content="Õh Ṭọ hάvë pļāĭñ těxt",
+            citing_work=cw,
+            owner=f3,
+        )
+
+        # Translation of that original text, in accented Greek.
+        Translation.objects.create(original_text=ot3, translated_text="Ὼῤῥ")
+
+        a12.save()
+        f3.save()
+
+        # Plain keywords are expected to match the accented content above.
+        self.assertEqual(do_search(view.antiquarian_search, "antique"), [a12])
+        self.assertEqual(do_search(view.fragment_search, "lionel"), [f3])
+        self.assertEqual(do_search(view.fragment_search, "plain"), [f3])
+        self.assertEqual(do_search(view.fragment_search, "ωρρ"), [f3])
diff --git a/src/rard/utils/text_processors.py b/src/rard/utils/text_processors.py
@@ -1,11 +1,21 @@
 import re
 import string
+import unicodedata
 
 from django.utils.html import strip_tags
 
 
+def strip_combining(content):
+    """Return ``content`` with its combining characters removed.
+
+    The text is decomposed into base and combining characters (NFD), and
+    only the characters that are not combining are kept.
+    """
+    decomposed = unicodedata.normalize("NFD", content)
+    base_chars = (
+        char for char in decomposed if not unicodedata.combining(char)
+    )
+    return "".join(base_chars)
+
+
 def make_plain_text(content):
-    no_ufeff = content.replace("\ufeff", "")  # found around mentions for some reason
+    no_unicode = strip_combining(content)
+    no_ufeff = no_unicode.replace("\ufeff", "")  # found around mentions for some reason
     # Add a space between tags so adjacent words aren't merged
     no_tags = strip_tags(no_ufeff.replace("><", "> <"))
     no_html_chars = re.sub(r"&\w+;", " ", no_tags)