Commit: Merge branch 'release/5.8.0'

JeltevanBoheemen committed Jun 19, 2024
2 parents 4cae955 + 4084a98 commit abefcc1
Showing 58 changed files with 3,914 additions and 3,456 deletions.
1 change: 1 addition & 0 deletions .nvmrc
@@ -0,0 +1 @@
18.17.1
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.7.0
date-released: '2024-06-5'
version: 5.8.0
date-released: '2024-06-19'
2 changes: 1 addition & 1 deletion backend/corpora/dutchannualreports/dutchannualreports.py
@@ -72,7 +72,7 @@ def sources(self, start=min_date, end=max_date):
full_path = op.join(directory, filename)
file_path = op.join(rel_dir, filename)
image_path = op.join(
rel_dir, name + '.' + self.scan_image_type)
rel_dir, name + '.pdf')
if extension != '.xml':
logger.debug(self.non_xml_msg.format(full_path))
continue
2 changes: 1 addition & 1 deletion backend/corpora/dutchnewspapers/dutchnewspapers_public.py
@@ -74,7 +74,7 @@ def sources(self, start=min_date, end=max_date):
self.definition_pattern.search(filename)), None)
if not definition_file:
continue
meta_dict = self.metadata_from_xml(definition_file, tags=[
meta_dict = self._metadata_from_xml(definition_file, tags=[
"title",
"date",
"publisher",
2 changes: 1 addition & 1 deletion backend/corpora/ecco/ecco.py
@@ -91,7 +91,7 @@ def sources(self, start=min_date, end=max_date):
'Volume'
]

meta_dict = self.metadata_from_xml(
meta_dict = self._metadata_from_xml(
full_path, tags=meta_tags)
meta_dict['id'] = record_id
meta_dict['category'] = category
7 changes: 7 additions & 0 deletions backend/corpora/ublad/description/ublad.md
@@ -0,0 +1,7 @@
On 5 September 1969, Utrecht University got its first independent paper: _U utrechtse universitaire reflexen_. The paper grew out of a merger of two other periodicals: _Sol Iustitiae_, which was aimed mainly at students, and _Solaire Reflexen_, which was intended more for staff. U utrechtse universitaire reflexen was meant for all parts of the university community.

In 1974 the name changed to the _Ublad_. That remained the case until the university decided to take the printed Ublad digital. Amid loud protest the printed Ublad disappeared, and in April 2010 _DUB_, the digital university paper, took its place.

To make all of this historical information accessible, the Centre for Digital Humanities, together with the Universiteitsbibliotheek, has digitised the old volumes. In I-analyzer you can find and search all volumes of U utrechtse universitaire reflexen and the Ublad.

The independent Ublad gives a colourful account of what went on at the university, in the city, and in student life through articles, photographs, and cartoons. The image used for OCR is attached for every page, so you can always consult the original source material.
Binary file added backend/corpora/ublad/images/ublad.jpg
14 changes: 14 additions & 0 deletions backend/corpora/ublad/tests/test_ublad.py
@@ -0,0 +1,14 @@
import locale
import pytest
from corpora.ublad.ublad import transform_date
import datetime


def test_transform_date():
datestring = '6 september 2007'
goal_date = datetime.date(2007, 9, 6)
try:
date = transform_date(datestring)
except locale.Error:
pytest.skip('Dutch Locale not installed in environment')
assert date == str(goal_date)
264 changes: 264 additions & 0 deletions backend/corpora/ublad/ublad.py
@@ -0,0 +1,264 @@
from datetime import datetime
import os
from os.path import join, splitext
import locale
import logging

from django.conf import settings
from addcorpus.python_corpora.corpus import HTMLCorpusDefinition, FieldDefinition
from addcorpus.python_corpora.extract import FilterAttribute
from addcorpus.es_mappings import *
from addcorpus.python_corpora.filters import DateFilter
from addcorpus.es_settings import es_settings


from ianalyzer_readers.readers.html import HTMLReader
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.extract import html, Constant

from bs4 import BeautifulSoup, Tag

def transform_content(soup):
"""
Transforms the text contents of a page node (soup) into a string consisting
of blocks of text, foregoing the column structure of the OCR'ed material.
"""
page_text = ""
for child in soup.children:
if isinstance(child, Tag) and 'ocr_carea' in child.get('class', []):
paragraph_text = ""
paragraph_list = child.get_text().split('\n')
for item in paragraph_list[1:]:
if not item:
pass
elif item.endswith('-'):
paragraph_text += item.strip('-')
else:
paragraph_text += item + ' '
if paragraph_text:
page_text += paragraph_text + '\n\n'
return page_text

def transform_date(date_string):
try:
locale.setlocale(locale.LC_ALL, 'nl_NL.UTF-8')
date = datetime.strptime(date_string, '%d %B %Y').strftime('%Y-%m-%d')
locale.setlocale(locale.LC_ALL, '')
return date
except ValueError:
logger.error("Unable to get date from {}".format(date_string))
return None


logger = logging.getLogger('indexing')

class UBlad(HTMLCorpusDefinition):
title = 'U-Blad'
description = 'The print editions of the Utrecht University paper from 1969 until 2010.'
description_page = 'ublad.md'
min_date = datetime(year=1969, month=1, day=1)
max_date = datetime(year=2010, month=12, day=31)

data_directory = settings.UBLAD_DATA
es_index = getattr(settings, 'UBLAD_ES_INDEX', 'ublad')
image = 'ublad.jpg'
scan_image_type = 'image/jpeg'
allow_image_download = True

document_context = {
'context_fields': ['volume_id'],
'sort_field': 'sequence',
'sort_direction': 'asc',
'context_display_name': 'volume'
}

languages = ['nl']
category = 'periodical'

@property
def es_settings(self):
return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)

def sources(self, start=min_date, end=max_date):
for directory, _, filenames in os.walk(self.data_directory):
_body, tail = os.path.split(directory)
if '.snapshot' in _:
_.remove('.snapshot')
continue
for filename in filenames:
if filename != '.DS_Store':
full_path = join(directory, filename)
yield full_path, {'filename': filename}


fields = [
FieldDefinition(
name = 'content',
display_name='Content',
display_type='text_content',
description='Text content of the page, generated by OCR',
results_overview=True,
csv_core=True,
search_field_core=True,
visualizations=['ngram', 'wordcloud'],
es_mapping = main_content_mapping(True, True, True, 'nl'),
extractor= FilterAttribute(tag='div',
recursive=True,
multiple=False,
flatten=False,
extract_soup_func=transform_content,
attribute_filter={
'attribute': 'class',
'value': 'ocr_page'
})
),
FieldDefinition(
name='pagenum',
display_name='Page number',
description='Page number',
csv_core=True,
es_mapping = int_mapping(),
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'pagenum'
}
)
),
FieldDefinition(
name='journal_title',
display_name='Publication Title',
description='Title of the publication',
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'journal_title'
}
)
),
FieldDefinition(
name='volume_id',
display_name='Volume ID',
description='Unique identifier for this volume',
hidden=True,
es_mapping=keyword_mapping(),
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'identifier_ocn'
}
)
),
FieldDefinition(
name='id',
display_name='Page ID',
description='Unique identifier for this page',
hidden=True,
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'identifier_indexid'
}
)
),
FieldDefinition(
name='edition',
display_name='Edition',
description='The number of the edition in this volume. Every year starts at 1.',
sortable=True,
es_mapping = keyword_mapping(),
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'aflevering'
}
)
),
FieldDefinition(
name='volume',
display_name='Volume',
sortable=True,
results_overview=True,
csv_core=True,
description='The volume number of this publication. There is one volume per year.',
es_mapping=keyword_mapping(),
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'yearstring'
}
),
),
FieldDefinition(
name='date',
display_name='Date',
description='The publication date of this edition',
es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
visualizations=['resultscount', 'termfrequency'],
sortable=True,
results_overview=True,
search_filter=DateFilter(
min_date,
max_date,
description=(
'Accept only articles with publication date in this range.'
)
),
extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'datestring',
},
transform=transform_date
)
),
FieldDefinition(
name='repo_url',
display_name='Repository URL',
description='URL to the dSPACE repository entry of this volume',
es_mapping=keyword_mapping(),
display_type='url',
searchable=False,
extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'link_repository'
}
)
),
FieldDefinition(
name='reader_url',
display_name='Reader URL',
description='URL to the UB reader view of this page',
es_mapping=keyword_mapping(),
display_type='url',
searchable=False,
extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'link_objects_image'
}
)
),
FieldDefinition(
name='jpg_url',
display_name='Image URL',
description='URL to the jpg file of this page',
es_mapping=keyword_mapping(),
display_type='url',
searchable=False,
extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'link_objects_jpg'
}
)
),
FieldDefinition(
name='worldcat_url',
display_name='Worldcat URL',
description='URL to the Worldcat entry of this volume',
es_mapping=keyword_mapping(),
display_type='url',
searchable=False,
extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={
'attribute': 'name',
'value': 'link_worldcat'
}
)
)
]

def request_media(self, document, corpus_name):
image_list = [document['fieldValues']['jpg_url']]
return {'media': image_list}
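
For reference, a minimal sketch of how the two helper functions in this new corpus definition behave. It is not part of the commit; it assumes a configured Django environment (as in the backend test suite), an installed nl_NL.UTF-8 locale, and an invented hOCR-like fragment:

# Illustrative sketch only (not part of this commit); assumes Django settings are
# configured, as in the backend test suite, and that the nl_NL.UTF-8 locale exists.
from bs4 import BeautifulSoup
from corpora.ublad.ublad import transform_content, transform_date

print(transform_date('6 september 2007'))  # -> '2007-09-06' (None if the string cannot be parsed)

# Invented hOCR-like fragment: one column area whose first line is skipped and
# whose hyphenated line break is rejoined.
html = """<div class="ocr_page">
<div class="ocr_carea">kolomtitel
universi-
teit blijft open</div>
</div>"""
soup = BeautifulSoup(html, 'html.parser')
print(transform_content(soup.find('div', class_='ocr_page')))
# -> 'universiteit blijft open \n\n': the column layout is flattened into one block of text.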
1 change: 1 addition & 0 deletions backend/es/es_index.py
@@ -130,6 +130,7 @@ def populate(client: Elasticsearch, corpus: Corpus, start=None, end=None):

corpus_server = settings.SERVERS[
settings.CORPUS_SERVER_NAMES.get(corpus_name, 'default')]

# Do bulk operation
for success, info in es_helpers.streaming_bulk(
client,
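
For context, the server lookup above combines two Django settings. A hypothetical sketch of how they relate; the 'archive' entry, its host, and the corpus mapping are invented for illustration:

# Hypothetical sketch (not from the repository): route one corpus to a non-default server.
SERVERS = {
    'default': {'host': 'localhost', 'port': 9200},               # assumed host/port values
    'archive': {'host': 'es-archive.example.org', 'port': 9200},  # invented second server
}
CORPUS_SERVER_NAMES = {
    'ublad': 'archive',  # corpus name -> key in SERVERS; unmapped corpora fall back to 'default'
}

corpus_server = SERVERS[CORPUS_SERVER_NAMES.get('ublad', 'default')]  # same lookup as in populate()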
2 changes: 2 additions & 0 deletions backend/ianalyzer/settings.py
@@ -76,6 +76,8 @@

CORPUS_SERVER_NAMES = {}

CORPORA_LOCALES = {}

CORPORA = {}

WORDCLOUD_LIMIT = 1000
2 changes: 2 additions & 0 deletions backend/ianalyzer/settings_test.py
@@ -17,4 +17,6 @@ def test_corpus_path(*path):
TIMES_DATA = os.path.join(BASE_DIR, 'addcorpus', 'python_corpora', 'tests')
TIMES_ES_INDEX = 'times-test'

UBLAD_DATA = '' # necessary to make ublad test not fail

SERVERS['default']['index_prefix'] = 'test'
2 changes: 1 addition & 1 deletion backend/media/image_processing.py
@@ -47,7 +47,7 @@ def retrieve_pdf(path):
'''
Retrieve the pdf as a file object.
'''
pdf = PdfReader(path, 'rb')
pdf = PdfReader(path)

return pdf

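
The change above reflects a PdfReader that accepts a path directly instead of a separate file mode. A minimal usage sketch, assuming the reader comes from pypdf (or a recent PyPDF2) and that a local example.pdf exists:

# Sketch only; assumes pypdf's PdfReader and an example.pdf in the working directory.
from pypdf import PdfReader

pdf = PdfReader('example.pdf')  # the path is opened internally, no mode argument needed
print(len(pdf.pages))           # number of pages in the document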
2 changes: 1 addition & 1 deletion backend/requirements.txt
@@ -377,7 +377,7 @@ threadpoolctl==3.2.0
# via scikit-learn
tomli==2.0.1
# via pytest
tornado==6.3.3
tornado==6.4.1
# via
# django-livereload-server
# flower
14 changes: 10 additions & 4 deletions documentation/Django-project-settings.md
@@ -47,11 +47,17 @@ The values in the dictionary give specifications.
- `'scroll_timeout'`: Time before scroll results time out
- `'scroll_page_size'`: Number of results per scroll page

The following optional settings are implemented but have no documentation:
### API key

- `'certs_location'`
- `'api_key'`
- `'api_id'`
By default, an elasticsearch server will have security features enabled; you can turn this off for a local development server (see [first-time setup](./First-time-setup.md)). Otherwise, the server configuration must specify an API key.

Create an API key for the server: see [creating an API key](https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html). Note down the `'id'` and `'api_key'` values of the response.

Add the following values to the configuration (a sketch of a complete entry follows this list):

- `'certs_location'`: Fill in the following path: `{your_elasticsearch_directory}/config/certs/http_ca.crt`
- `'api_id'`: the ID of the API key
- `'api_key'`: the generated API key
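
A sketch of what a complete server entry could then look like in the project settings; the host and port values are illustrative assumptions, and the key values are placeholders:

# Sketch of a SERVERS entry using API-key authentication; host and port are assumptions.
SERVERS = {
    'default': {
        'host': 'localhost',
        'port': 9200,
        'certs_location': '/your/elasticsearch/directory/config/certs/http_ca.crt',
        'api_id': '<id from the create-API-key response>',
        'api_key': '<api_key from the create-API-key response>',
    }
}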


#### Setting a default server