Skip to content

Commit

Permalink
Merge pull request #431 from UCL/feature/search-diacriticals
Browse files Browse the repository at this point in the history
Feature/search diacriticals
  • Loading branch information
acholyn committed Mar 26, 2024
2 parents 786eb53 + 3e4e3b0 commit 6c1c399
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 1 deletion.
98 changes: 98 additions & 0 deletions src/rard/research/migrations/0070_updating_plain_renditions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Generated by Django 3.2 on 2024-03-08 12:27

import re
import string
import unicodedata
from django.db import migrations
from django.utils.html import strip_tags


def strip_combining_and_make_plain(content):
    """Return a diacritic-free, plain-text rendition of ``content``.

    The text is first decomposed (Unicode NFD) and every combining mark is
    dropped, leaving only base characters; the result is then passed through
    ``make_plain_text`` to remove markup, HTML entities and punctuation.

    Normalising *before* punctuation is stripped matters: some characters
    canonically decompose to ASCII punctuation (for example U+037E GREEK
    QUESTION MARK becomes ";"), and those would otherwise survive in the
    stored plain text. This is also the order used by the live
    ``make_plain_text`` in ``rard/utils/text_processors.py``, so migrated rows
    end up matching what newly saved rows get.

    :param content: raw (possibly HTML) text to convert.
    :returns: the plain text with combining characters removed.
    """
    decomposed = unicodedata.normalize("NFD", content)
    base_characters = "".join(
        char for char in decomposed if not unicodedata.combining(char)
    )
    return make_plain_text(base_characters)


def make_plain_text(content):
    """Reduce ``content`` to searchable plain text.

    Steps, in order: drop U+FEFF characters, strip HTML tags, blank out named
    HTML entities, delete ASCII punctuation, remove isolated one- or two-digit
    numbers, and collapse runs of spaces.

    :param content: raw (possibly HTML) text.
    :returns: the cleaned-up text.
    """
    # U+FEFF has been seen around mentions (per the original author's note).
    text = content.replace("\ufeff", "")
    # Separate back-to-back tags so the words either side are not fused
    # together once the tags themselves are stripped.
    text = strip_tags(text.replace("><", "> <"))
    # Named entities such as "&amp;" become a single space.
    text = re.sub(r"&\w+;", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Whitespace-delimited 1-2 digit numbers; the original comment attributes
    # these to mentions.
    text = re.sub(r"\s\d{1,2}\s", " ", text)
    return re.sub(r" +", " ", text)


def update_antiquarians(apps, schema_editor, plaining_function):
    """Recompute ``plain_introduction`` for every Antiquarian that has one.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: unused; present to mirror the RunPython signature.
    :param plaining_function: callable mapping raw content to plain text.
    """
    antiquarian_model = apps.get_model("research", "Antiquarian")
    for antiquarian in antiquarian_model.objects.all():
        # Rows with an empty plain rendition are left untouched.
        if not antiquarian.plain_introduction:
            continue
        raw_content = antiquarian.introduction.content
        antiquarian.plain_introduction = plaining_function(raw_content)
        antiquarian.save()


def update_works(apps, schema_editor, plaining_function):
    """Recompute ``plain_introduction`` for every Work that has one.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: unused; present to mirror the RunPython signature.
    :param plaining_function: callable mapping raw content to plain text.
    """
    work_model = apps.get_model("research", "Work")
    for work in work_model.objects.all():
        # Rows with an empty plain rendition are left untouched.
        if not work.plain_introduction:
            continue
        raw_content = work.introduction.content
        work.plain_introduction = plaining_function(raw_content)
        work.save()


def update_fragments(apps, schema_editor, plaining_function):
    """Recompute ``plain_commentary`` for every Fragment that has one.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: unused; present to mirror the RunPython signature.
    :param plaining_function: callable mapping raw content to plain text.
    """
    fragment_model = apps.get_model("research", "Fragment")
    for fragment in fragment_model.objects.all():
        # Rows with an empty plain rendition are left untouched.
        if not fragment.plain_commentary:
            continue
        raw_content = fragment.commentary.content
        fragment.plain_commentary = plaining_function(raw_content)
        fragment.save()


def update_anonymous_fragments(apps, schema_editor, plaining_function):
    """Recompute ``plain_commentary`` for every AnonymousFragment that has one.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: unused; present to mirror the RunPython signature.
    :param plaining_function: callable mapping raw content to plain text.
    """
    anonymous_fragment_model = apps.get_model("research", "AnonymousFragment")
    for anonymous_fragment in anonymous_fragment_model.objects.all():
        # Rows with an empty plain rendition are left untouched.
        if not anonymous_fragment.plain_commentary:
            continue
        raw_content = anonymous_fragment.commentary.content
        anonymous_fragment.plain_commentary = plaining_function(raw_content)
        anonymous_fragment.save()


def update_testimonia(apps, schema_editor, plaining_function):
    """Recompute ``plain_commentary`` for every Testimonium that has one.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: unused; present to mirror the RunPython signature.
    :param plaining_function: callable mapping raw content to plain text.
    """
    testimonium_model = apps.get_model("research", "Testimonium")
    for testimonium in testimonium_model.objects.all():
        # Rows with an empty plain rendition are left untouched.
        if not testimonium.plain_commentary:
            continue
        raw_content = testimonium.commentary.content
        testimonium.plain_commentary = plaining_function(raw_content)
        testimonium.save()


def update_original_texts(apps, schema_editor, plaining_function):
    """Recompute the plain renditions stored on every OriginalText.

    For each row, ``plain_content`` is rebuilt from ``content`` and
    ``plain_translated_text`` from ``translated_text``, but only where the
    existing plain value is non-empty. The row is then saved.

    NOTE(review): nothing in this migration calls this function - neither
    ``update_plain_texts`` nor ``revert_plain_texts`` invokes it. Confirm
    whether that is intentional before wiring it in, and confirm that
    ``OriginalText`` really carries the ``translated_text`` fields.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: unused; present to mirror the RunPython signature.
    :param plaining_function: callable mapping raw content to plain text.
    """
    original_text_model = apps.get_model("research", "OriginalText")
    # (plain field, source field) pairs, processed in this order.
    field_pairs = (
        ("plain_content", "content"),
        ("plain_translated_text", "translated_text"),
    )
    for original_text in original_text_model.objects.all():
        for plain_field, source_field in field_pairs:
            if getattr(original_text, plain_field):
                source_value = getattr(original_text, source_field)
                setattr(original_text, plain_field, plaining_function(source_value))
        original_text.save()


def update_plain_texts(apps, schema_editor):
    """Forward migration: rebuild stored plain text with diacritics stripped.

    Runs each per-model updater in turn with
    ``strip_combining_and_make_plain`` as the conversion function.

    NOTE(review): ``update_original_texts`` is defined in this module but is
    not part of this list - confirm whether original texts were meant to be
    migrated as well.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: schema editor supplied by the migration runner.
    """
    updaters = (
        update_antiquarians,
        update_works,
        update_fragments,
        update_anonymous_fragments,
        update_testimonia,
    )
    for run_update in updaters:
        run_update(apps, schema_editor, strip_combining_and_make_plain)


def revert_plain_texts(apps, schema_editor):
    """Reverse migration: rebuild stored plain text without diacritic stripping.

    Runs each per-model updater in turn with this module's
    ``make_plain_text`` as the conversion function.

    NOTE(review): ``update_original_texts`` is defined in this module but is
    not part of this list - confirm whether that is intentional.

    :param apps: historical app registry supplied by the migration runner.
    :param schema_editor: schema editor supplied by the migration runner.
    """
    updaters = (
        update_antiquarians,
        update_works,
        update_fragments,
        update_anonymous_fragments,
        update_testimonia,
    )
    for run_update in updaters:
        run_update(apps, schema_editor, make_plain_text)


class Migration(migrations.Migration):
    """Data migration that regenerates the stored plain-text renditions."""

    dependencies = [("research", "0069_anonymousfragment_anonymous_fragments")]

    operations = [
        # Forward: strip diacritics from plain text; reverse: restore the
        # plain text without diacritic stripping.
        migrations.RunPython(
            code=update_plain_texts,
            reverse_code=revert_plain_texts,
        ),
    ]
28 changes: 28 additions & 0 deletions src/rard/research/tests/views/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,3 +548,31 @@ def do_search(search_function, keywords):
self.assertEqual(do_search(view.antiquarian_search, "interesting"), [a11])
self.assertEqual(do_search(view.work_search, "interesting"), [w11])
self.assertEqual(do_search(view.book_search, "interesting"), [b11])

def test_unicode_stripped(self):
    """Accented content must be findable using unaccented search keywords."""

    def run_search(search_method, keywords):
        # Materialise the results so they can be compared against a list.
        return list(search_method(SearchView.Term(keywords)))

    search_view = SearchView()
    citing_work = CitingWork.objects.create(title="citing_work")

    antiquarian = Antiquarian.objects.create(name="boring person", re_code="11")
    antiquarian.introduction.content = "ÂÑtĩqūě"

    fragment = Fragment.objects.create()
    fragment.commentary = TextObjectField.objects.create(
        content="Ĺĭōṇęḷ ώὰs Ᾰ gΌῸd ὲgg"
    )

    original_text = OriginalText.objects.create(
        content="Õh Ṭọ hάvë pļāĭñ těxt",
        citing_work=citing_work,
        owner=fragment,
    )
    Translation.objects.create(original_text=original_text, translated_text="Ὼῤῥ")

    antiquarian.save()
    fragment.save()

    # (search method, unaccented keywords, the single object expected back)
    expectations = (
        (search_view.antiquarian_search, "antique", antiquarian),
        (search_view.fragment_search, "lionel", fragment),
        (search_view.fragment_search, "plain", fragment),
        (search_view.fragment_search, "ωρρ", fragment),
    )
    for search_method, keywords, expected in expectations:
        self.assertEqual(run_search(search_method, keywords), [expected])
12 changes: 11 additions & 1 deletion src/rard/utils/text_processors.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import re
import string
import unicodedata

from django.utils.html import strip_tags


def strip_combining(content):
    """Return ``content`` with all combining characters removed.

    The text is decomposed with Unicode NFD so that accented characters split
    into a base character followed by combining marks; only characters whose
    combining class is zero (the base characters) are kept.
    """
    decomposed = unicodedata.normalize("NFD", content)
    base_characters = filter(
        lambda char: unicodedata.combining(char) == 0, decomposed
    )
    return "".join(base_characters)


def make_plain_text(content):
no_ufeff = content.replace("\ufeff", "") # found around mentions for some reason
no_unicode = strip_combining(content)
no_ufeff = no_unicode.replace("\ufeff", "") # found around mentions for some reason
# Add a space between tags so adjacent words aren't merged
no_tags = strip_tags(no_ufeff.replace("><", "> <"))
no_html_chars = re.sub(r"&\w+;", " ", no_tags)
Expand Down

0 comments on commit 6c1c399

Please sign in to comment.