Merge branch 'develop' into feature/netherlands-citation-page

UUDigitalHumanitieslab · May 29, 2024 · 69b5339 · 69b5339
2 parents 8d67958 + b15c100
commit 69b5339
Show file tree

Hide file tree

Showing 89 changed files with 1,869 additions and 1,012 deletions.
diff --git a/DockerfileElastic b/DockerfileElastic
@@ -0,0 +1,3 @@
+FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2
+
+RUN bin/elasticsearch-plugin install mapper-annotated-text
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ For corpora included in I-analyzer, the backend includes a definition file that
 
 ## Usage
 
-If you are interested in using I-analyzer, the most straightforward way to get started is to make an account at [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament  (not publicly accessible)](https://people-and-parliament.hum.uu.nl/).
+If you are interested in using I-analyzer, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
 
 I-analyzer does not have an "upload data" option (yet!). If you are interested in using I-analyzer as a way to publish your dataset, or to make it easier to search and analyse, you can go about this in two ways:
 

diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py
@@ -1,5 +1,9 @@
+from typing import Dict
 from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available
 
+def primary_mapping_type(es_mapping: Dict) -> str:
+    '''
+    Return the value stored under the top-level `type` key of a mapping.
+
+    Note that `None` is returned when the mapping has no `type` key,
+    despite the `str` return annotation.
+    '''
+    return es_mapping.get('type', None)
+
 def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
     '''
     Mapping for the main content field. Options:

diff --git a/backend/addcorpus/json_corpora/conftest.py b/backend/addcorpus/json_corpora/conftest.py
@@ -0,0 +1,147 @@
+import pytest
+
+@pytest.fixture()
+def content_field_json():
+    '''
+    JSON definition of a `text_content` field named `content`, with a
+    language set; searchable, shown in previews, without a filter.
+    '''
+    return {
+        'name': 'content',
+        'display_name': 'Content',
+        'description': 'Bla bla bla',
+        'type': 'text_content',
+        'language': 'en',
+        'options': {
+            'search': True,
+            'filter': 'none',
+            'preview': True,
+            'visualize': True,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'content'}
+    }
+
+@pytest.fixture()
+def keyword_field_json():
+    '''
+    JSON definition of a `text_metadata` field named `author`; searchable,
+    with a visible filter.
+    '''
+    return {
+        'name': 'author',
+        'display_name': 'Author',
+        'description': 'Author of the text',
+        'type': 'text_metadata',
+        'options': {
+            'search': True,
+            'filter': 'show',
+            'preview': True,
+            'visualize': True,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'author'}
+    }
+
+@pytest.fixture()
+def int_field_json():
+    '''
+    JSON definition of an `integer` field named `year`; sortable, with a
+    visible filter, not searchable.
+    '''
+    return {
+        'name': 'year',
+        'display_name': 'Year',
+        'description': 'Year in which the text was written',
+        'type': 'integer',
+        'options': {
+            'search': False,
+            'filter': 'show',
+            'preview': False,
+            'visualize': True,
+            'sort': True,
+            'hidden': False
+        },
+        'extract': {'column': 'year'}
+    }
+
+@pytest.fixture()
+def float_field_json():
+    '''
+    JSON definition of a `float` field named `ocr_confidence`; its filter
+    option is set to `hide` and every other option is switched off.
+    '''
+    return {
+        'name': 'ocr_confidence',
+        'display_name': 'OCR confidence',
+        'description': 'Confidence level of optical character recognition output',
+        'type': 'float',
+        'options': {
+            'search': False,
+            'filter': 'hide',
+            'preview': False,
+            'visualize': False,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'ocr'}
+    }
+
+@pytest.fixture()
+def date_field_json():
+    '''
+    JSON definition of a `date` field named `date`; sortable, shown in
+    previews, with a visible filter.
+    '''
+    return {
+        'name': 'date',
+        'display_name': 'Date',
+        'description': 'Date on which the text was written',
+        'type': 'date',
+        'options': {
+            'search': False,
+            'filter': 'show',
+            'preview': True,
+            'visualize': True,
+            'sort': True,
+            'hidden': False
+        },
+        'extract': {'column': 'date'}
+    }
+
+@pytest.fixture()
+def boolean_field_json():
+    '''
+    JSON definition of a `boolean` field named `author_known`, with a
+    visible filter.
+    '''
+    return {
+        'name': 'author_known',
+        'display_name': 'Author known',
+        'description': 'Whether the author of the text is known',
+        'type': 'boolean',
+        'options': {
+            'search': False,
+            'filter': 'show',
+            'preview': False,
+            'visualize': True,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'author_known'}
+    }
+
+@pytest.fixture()
+def geo_field_json():
+    '''
+    JSON definition of a `geo_point` field named `location`; no filter and
+    every other option switched off.
+    '''
+    return {
+        'name': 'location',
+        'display_name': 'Location',
+        'description': 'Location where the text was published',
+        'type': 'geo_point',
+        'options': {
+            'search': False,
+            'filter': 'none',
+            'preview': False,
+            'visualize': False,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'location'}
+    }
+
+@pytest.fixture(
+    params=['content', 'keyword', 'int', 'float', 'date', 'boolean', 'geo']
+)
+def any_field_json(
+    request, content_field_json, keyword_field_json, int_field_json, float_field_json,
+    date_field_json, boolean_field_json, geo_field_json
+):
+    '''
+    Parametrized fixture that yields each of the field definitions in turn.
+
+    A test requesting this fixture runs once per entry in `params`; the
+    current parameter selects which field definition is returned.
+    '''
+    # These are the already-resolved fixture values (dicts), keyed by the
+    # parameter that selects them.
+    definitions_by_param = dict(
+        content=content_field_json,
+        keyword=keyword_field_json,
+        int=int_field_json,
+        float=float_field_json,
+        date=date_field_json,
+        boolean=boolean_field_json,
+        geo=geo_field_json,
+    )
+    return definitions_by_param[request.param]
diff --git a/backend/addcorpus/json_corpora/constants.py b/backend/addcorpus/json_corpora/constants.py
@@ -0,0 +1,2 @@
+DEFAULT_CSV_DELIMITER = ','
+DATE_FORMAT = '%Y-%m-%d'
diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py
@@ -0,0 +1,95 @@
+from typing import Dict
+from datetime import date
+from addcorpus.models import Corpus, CorpusConfiguration, Field
+from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT
+from addcorpus.es_mappings import primary_mapping_type
+
+def export_json_corpus(corpus: Corpus) -> Dict:
+    '''
+    Export a corpus as a JSON-style dictionary.
+
+    The result always contains `name`, `id`, `meta`, `source_data` and
+    `fields`. The `options` key is only included when at least one corpus
+    option is set.
+    '''
+    configuration = corpus.configuration
+    exported = {
+        'name': corpus.name,
+        'id': corpus.pk,
+        'meta': export_corpus_meta(configuration),
+        'source_data': export_corpus_source_data(configuration),
+    }
+    corpus_options = export_corpus_options(configuration)
+    if corpus_options:
+        exported['options'] = corpus_options
+    exported['fields'] = list(
+        map(export_json_field, configuration.fields.all())
+    )
+    return exported
+
+def export_corpus_meta(configuration: CorpusConfiguration) -> Dict:
+    '''
+    Export the descriptive metadata of a corpus configuration: title,
+    category, description, languages, and the formatted date range.
+    '''
+    meta = dict(
+        title=configuration.title,
+        category=configuration.category,
+        description=configuration.description,
+        languages=configuration.languages,
+    )
+    meta['date_range'] = dict(
+        min=export_date(configuration.min_date),
+        max=export_date(configuration.max_date),
+    )
+    return meta
+
+def export_date(date: date) -> str:
+    '''
+    Format a date as a string according to `DATE_FORMAT`.
+
+    NOTE(review): a `None` value would raise an AttributeError here;
+    confirm that callers always pass a date.
+    '''
+    return date.strftime(DATE_FORMAT)
+
+def export_corpus_source_data(configuration: CorpusConfiguration) -> Dict:
+    '''
+    Export the source data settings of a corpus configuration.
+
+    The type is always `csv`. The delimiter is only written out (under
+    `options`) when it differs from `DEFAULT_CSV_DELIMITER`.
+    '''
+    delimiter = configuration.source_data_delimiter
+    if delimiter == DEFAULT_CSV_DELIMITER:
+        return {'type': 'csv'}
+    return {'type': 'csv', 'options': {'delimiter': delimiter}}
+
+def export_corpus_options(configuration: CorpusConfiguration) -> Dict:
+    '''
+    Export the optional corpus settings (`document_context`, `default_sort`
+    and `language_field`), leaving out any that have a falsy value.
+
+    Returns an empty dict when none of them is set.
+    '''
+    candidates = (
+        ('document_context', configuration.document_context),
+        ('default_sort', configuration.default_sort),
+        ('language_field', configuration.language_field),
+    )
+    return {key: value for key, value in candidates if value}
+
+
+def export_json_field(field: Field) -> Dict:
+    '''
+    Export a single field as a JSON-style dictionary.
+
+    The `language` key is only included when the field has a language set.
+    '''
+    exported = dict(
+        name=field.name,
+        display_name=field.display_name,
+        description=field.description,
+        type=export_field_type(field),
+        options=export_field_options(field),
+        extract=export_field_extract(field),
+    )
+    language = field.language
+    if language:
+        exported['language'] = language
+    return exported
+
+
+def export_field_type(field: Field) -> str:
+    '''
+    Return the exported type name for a field.
+
+    The display types `text` and `keyword` are both exported as
+    `text_metadata`; any other display type is exported unchanged.
+    '''
+    display_type = field.display_type
+    return 'text_metadata' if display_type in ('text', 'keyword') else display_type
+
+
+def export_field_options(field: Field) -> Dict:
+    '''
+    Export the interface options of a field: `filter`, `hidden`, `preview`,
+    `search`, `sort` and `visualize`.
+
+    `visualize` is true when the field has at least one visualization.
+    '''
+    options = {'filter': export_field_filter(field)}
+    options.update(
+        hidden=field.hidden,
+        preview=field.results_overview,
+        search=field.searchable,
+        sort=field.sortable,
+        visualize=len(field.visualizations) > 0,
+    )
+    return options
+
+
+def export_field_filter(field: Field) -> str:
+    '''
+    Determine the exported `filter` option of a field.
+
+    Returns:
+        - `'show'` if the field has a search filter configured;
+        - `'hide'` if it has none, but its mapping type could support one
+          and the field is not displayed as a url;
+        - `'none'` otherwise.
+    '''
+    if field.search_filter != {}:
+        return 'show'
+    # Elasticsearch names its integer mapping type 'integer', so the former
+    # 'int' entry could not match such a mapping. 'int' is kept so that
+    # nothing that matched before stops matching.
+    filterable_mappings = ['keyword', 'integer', 'int', 'float', 'date', 'boolean']
+    if primary_mapping_type(field.es_mapping) in filterable_mappings and field.display_type != 'url':
+        return 'hide'
+    return 'none'
+
+
+def export_field_extract(field: Field) -> Dict:
+    '''
+    Export the extraction settings of a field: its `extract_column` value,
+    wrapped under the `column` key.
+    '''
+    return {'column': field.extract_column}