Commit 4333b44

Merge branch 'develop' into feature/ublad-corpus

Meesch committed Jun 19, 2024
2 parents 78d64bc + 3ebd88a commit 4333b44

Showing 113 changed files with 4,194 additions and 3,747 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/backend-test.yml
@@ -0,0 +1,27 @@
# This workflow will run backend tests on the Python version defined in the Dockerfiles

name: Backend unit tests

on:
  workflow_dispatch:
  push:
    branches:
      - 'develop'
      - 'master'
      - 'feature/**'
      - 'bugfix/**'
      - 'hotfix/**'
      - 'release/**'
      - 'dependabot/**'
    paths-ignore:
      - 'frontend/**'
      - '**.md'

jobs:
  backend-test:
    name: Test Backend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Run backend tests
        run: sudo mkdir -p /ci-data && sudo docker-compose --env-file .env-ci run backend pytest
27 changes: 27 additions & 0 deletions .github/workflows/frontend-test.yml
@@ -0,0 +1,27 @@
# This workflow will run frontend tests on the Node version defined in the Dockerfiles

name: Frontend unit tests

on:
  workflow_dispatch:
  push:
    branches:
      - 'develop'
      - 'master'
      - 'feature/**'
      - 'bugfix/**'
      - 'hotfix/**'
      - 'release/**'
      - 'dependabot/**'
    paths-ignore:
      - 'backend/**'
      - '**.md'

jobs:
  frontend-test:
    name: Test Frontend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Run frontend tests
        run: sudo docker-compose --env-file .env-ci run frontend yarn test
22 changes: 0 additions & 22 deletions .github/workflows/release.yaml

This file was deleted.

25 changes: 25 additions & 0 deletions .github/workflows/release.yml
@@ -0,0 +1,25 @@
# This action will update the CITATION.cff file for new release or hotfix branches

name: Release

on:
  push:
    branches:
      - 'release/**'
      - 'hotfix/**'

jobs:
  citation-update:
    name: Update CITATION.cff
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Autoformat CITATION.cff
        run: |
          version=`grep -oE '[0-9]+\.[0-9]+\.[0-9]+' package.json`
          today=`date +"%Y-%m-%d"`
          sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff
          sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff
          bash ./update-citation.sh
          git commit -a -m "update version and date in CITATION.cff"
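
For anyone prototyping this step outside CI, a rough Python equivalent of the version-and-date rewrite could look as follows. This is a sketch only: the workflow itself uses grep and sed, and update-citation.sh is not reproduced here.

import re
from datetime import date

# Read the release version from package.json (assumes a standard "version" field).
with open('package.json') as f:
    version = re.search(r'\d+\.\d+\.\d+', f.read()).group()

with open('CITATION.cff') as f:
    citation = f.read()

# Mirror the two sed substitutions: the version line, then any YYYY-MM-DD date.
citation = re.sub(r'^version: \d+\.\d+\.\d+', f'version: {version}', citation, flags=re.M)
citation = re.sub(r'\d{4}-\d{2}-\d{2}', date.today().isoformat(), citation)

with open('CITATION.cff', 'w') as f:
    f.write(citation)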
32 changes: 0 additions & 32 deletions .github/workflows/test.yml

This file was deleted.

1 change: 1 addition & 0 deletions .nvmrc
@@ -0,0 +1 @@
18.17.1
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.6.2
date-released: '2024-05-06'
version: 5.7.0
date-released: '2024-06-05'
3 changes: 3 additions & 0 deletions DockerfileElastic
@@ -0,0 +1,3 @@
FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2

RUN bin/elasticsearch-plugin install mapper-annotated-text
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ For corpora included in I-analyzer, the backend includes a definition file that

## Usage

If you are interested in using I-analyzer, the most straightforward way to get started is to make an account at [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament (not publicly accessible)](https://people-and-parliament.hum.uu.nl/).
If you are interested in using I-analyzer, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).

I-analyzer does not have an "upload data" option (yet!). If you are interested in using I-analyzer as a way to publish your dataset, or to make it easier to search and analyse, you can go about this in two ways:

1 change: 0 additions & 1 deletion backend/addcorpus/json_corpora/tests/test_import.py
@@ -12,7 +12,6 @@ def test_json_corpus_import(db, json_corpus_data):
    corpus = serializer.create(serializer.validated_data)

    assert corpus.name == 'example'
    assert corpus.ready_to_index()

    config = corpus.configuration

17 changes: 13 additions & 4 deletions backend/addcorpus/models.py
@@ -11,10 +11,9 @@
    validate_source_data_directory,
)
from addcorpus.validation.indexing import (validate_essential_fields,
    validate_has_configuration,
    validate_language_field)
    validate_has_configuration, validate_language_field, validate_has_data_directory)
from addcorpus.validation.publishing import (validate_default_sort,
    validate_ngram_has_date_field)
    validate_ngram_has_date_field)
from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
@@ -92,6 +91,7 @@ def validate_ready_to_index(self) -> None:
        config = self.configuration_obj
        fields = config.fields.all()

        validate_has_data_directory(self)
        validate_essential_fields(fields)
        validate_language_field(self)

@@ -111,12 +111,21 @@ def validate_ready_to_publish(self) -> None:
        '''
        Validation that should be carried out before making the corpus public.
        This also includes most checks that are needed to create an index, but not all
        (if the index already exists, you do not need source data).

        Raises:
            CorpusNotIndexableError: the corpus does not meet the requirements for indexing.
            CorpusNotPublishableError: interface options are improperly configured.
        '''

        self.validate_ready_to_index()
        validate_has_configuration(self)

        config = self.configuration_obj
        fields = config.fields.all()

        validate_essential_fields(fields)
        validate_language_field(self)
        validate_ngram_has_date_field(self)
        validate_default_sort(self)

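Callers of these validators would presumably wrap them in try/except. A minimal sketch, assuming the exception classes live in the validation modules imported above (the exact import paths are an assumption based on the docstring):

# Hypothetical usage sketch; exception locations are assumed, not confirmed by this diff.
from addcorpus.validation.indexing import CorpusNotIndexableError
from addcorpus.validation.publishing import CorpusNotPublishableError

def try_publish(corpus) -> bool:
    try:
        corpus.validate_ready_to_publish()
    except (CorpusNotIndexableError, CorpusNotPublishableError) as e:
        print(f'Cannot publish {corpus.name}: {e}')
        return False
    return True
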
33 changes: 13 additions & 20 deletions backend/addcorpus/python_corpora/corpus.py
@@ -22,6 +22,7 @@

logger = logging.getLogger('indexing')


class CorpusDefinition(Reader):
    '''
    Subclasses of this class define corpora and their documents by specifying:
@@ -161,7 +162,7 @@ def word_models_present(self):
        '''
        if word models are present for this corpus
        '''
        return self.word_model_path != None and isdir(self.word_model_path)
        return self.word_model_path is not None and isdir(self.word_model_path)

    @property
    def new_highlight(self):
@@ -173,7 +174,7 @@ def new_highlight(self):
        '''
        try:
            highlight_corpora = settings.NEW_HIGHLIGHT_CORPORA
        except:
        except Exception:
            return False
        return self.title in highlight_corpora

@@ -244,19 +245,6 @@ def request_media(self, document, corpus_name):
        '''
        return {'media': None, 'info': None}

    def es_mapping(self):
        '''
        Create the ElasticSearch mapping for the fields of this corpus. May be
        passed to the body of an ElasticSearch index creation request.
        '''
        return {
            'properties': {
                field.name: field.es_mapping
                for field in self.fields
                if field.es_mapping and not field.skip
            }
        }

    def sources(self, start=datetime.min, end=datetime.max):
        '''
        Obtain source files for the corpus, relevant to the given timespan.
@@ -312,26 +300,31 @@ def __init__(self):
        '''
        self.fields = []


class XMLCorpusDefinition(CorpusDefinition, XMLReader):
    '''
    An XMLCorpus is any corpus that extracts its data from XML sources.
    '''


class HTMLCorpusDefinition(CorpusDefinition, HTMLReader):
    '''
    An HTMLCorpus is any corpus that extracts its data from HTML sources.
    '''


class CSVCorpusDefinition(CorpusDefinition, CSVReader):
    '''
    A CSVCorpus is any corpus that extracts its data from CSV sources.
    '''


class XLSXCorpusDefinition(CorpusDefinition, XLSXReader):
    '''
    An XLSXCorpus is any corpus that extracts its data from an XLSX spreadsheet.
    '''


class JSONCorpusDefinition(CorpusDefinition):
    '''
    Corpus definition for JSON-encoded data.
@@ -341,14 +334,15 @@ def source2dicts(self, source, *nargs, **kwargs):
        self._reject_extractors(extract.XML, extract.CSV)

        field_dict = {
            field.name: field.extractor.apply(source, *nargs, **kwargs)
            field.name: field.extractor.apply(source, *nargs, **kwargs)
            for field in self.fields
        }

        yield field_dict

# Fields ######################################################################


class FieldDefinition(Field):
    '''
    Definition for a single field in a corpus.
@@ -443,17 +437,16 @@ def __init__(self,
        self.language = language
        self.hidden = not indexed or hidden

        self.sortable = sortable if sortable != None else \
        self.sortable = sortable if sortable is not None else \
            not hidden and indexed and \
            mapping_type in ['integer', 'float', 'date']

        # Fields are searchable if they are not hidden and if they are mapped as 'text'.
        # Keyword fields without a filter are also searchable.
        self.searchable = searchable if searchable != None else \
        self.searchable = searchable if searchable is not None else \
            not hidden and indexed and \
            ((mapping_type == 'text') or
            (mapping_type == 'keyword' and self.search_filter == None))
            (mapping_type == 'keyword' and self.search_filter is None))
        # Add back reference to field in filter
        self.downloadable = downloadable

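To illustrate how these definition classes fit together, a minimal hypothetical corpus might look like the sketch below. Only name, es_mapping, and extractor usage are visible in this diff; the other attributes (title, data directory) are assumptions, and a real definition would need more configuration.

from addcorpus.python_corpora.corpus import CSVCorpusDefinition, FieldDefinition
from ianalyzer_readers.extract import CSV

class ExampleCorpus(CSVCorpusDefinition):
    title = 'Example'                  # assumed attribute
    data_directory = '/data/example'   # hypothetical path

    fields = [
        # A date mapping makes the field sortable by default (see FieldDefinition above).
        FieldDefinition(name='date', es_mapping={'type': 'date'}, extractor=CSV('date')),
        # A text mapping makes the field searchable by default.
        FieldDefinition(name='content', es_mapping={'type': 'text'}, extractor=CSV('content')),
    ]
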
5 changes: 3 additions & 2 deletions backend/addcorpus/python_corpora/save_corpus.py
@@ -153,7 +153,8 @@ def _save_corpus_documentation(corpus_definition: CorpusDefinition, configuratio
    if pages.exists():
        pages.delete()

def _prepare_for_import(corpus):

def _prepare_for_import(corpus: Corpus):
    corpus.has_python_definition = True
    corpus.active = False
    corpus.save()
@@ -191,7 +192,7 @@ def _save_or_skip_corpus(corpus_name, corpus_definition, verbose=False, stdout=s
        _save_corpus_configuration(corpus, corpus_definition)
        _activate_if_ready(corpus)
        if verbose:
            print(f'Saved corpus: {corpus_name}', file=stdout)
            print(f'Saved corpus: {corpus_name}', file=stdout)
    except Exception as e:
        print(f'Failed saving corpus: {corpus_name}', file=stderr)
        print(f'Error: {e}', file=stderr)
8 changes: 6 additions & 2 deletions backend/addcorpus/reader.py
@@ -1,12 +1,14 @@
import glob

from addcorpus.models import Corpus, Field
from addcorpus.python_corpora.load_corpus import load_corpus_definition
from ianalyzer_readers.extract import CSV
from ianalyzer_readers.readers.core import Field as ReaderField
from ianalyzer_readers.readers.core import Reader
from ianalyzer_readers.readers.csv import CSVReader

from addcorpus.models import Corpus, Field
from addcorpus.python_corpora.load_corpus import load_corpus_definition
from addcorpus.validation.indexing import validate_has_data_directory


def make_reader_field(corpus_field: Field) -> ReaderField:
    return ReaderField(
@@ -25,6 +27,8 @@ def make_reader(corpus: Corpus) -> Reader:
    if corpus.has_python_definition:
        return load_corpus_definition(corpus.name)

    validate_has_data_directory(corpus)

    class NewReader(CSVReader):
        data_directory = corpus.configuration.data_directory
        delimiter = corpus.configuration.source_data_delimiter
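A usage sketch for make_reader, assuming the ianalyzer_readers Reader API and a hypothetical corpus name:

# Hypothetical usage; 'example' stands in for a real corpus name.
from addcorpus.models import Corpus
from addcorpus.reader import make_reader

corpus = Corpus.objects.get(name='example')
reader = make_reader(corpus)  # for database-only corpora, this first checks the data directory

# Readers yield documents as dictionaries, one per source record.
for document in reader.documents():
    print(document)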