Skip to content

Commit

Permalink
Merge branch 'develop' into feature/netherlands-citation-page
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed May 29, 2024
2 parents 8d67958 + b15c100 commit 69b5339
Show file tree
Hide file tree
Showing 89 changed files with 1,869 additions and 1,012 deletions.
3 changes: 3 additions & 0 deletions DockerfileElastic
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Base image: Elasticsearch 8.10.2, pulled from docker.elastic.co
FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2

# Install the mapper-annotated-text plugin into the image at build time
RUN bin/elasticsearch-plugin install mapper-annotated-text
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ For corpora included in I-analyzer, the backend includes a definition file that

## Usage

If you are interested in using I-analyzer, the most straightforward way to get started is to make an account at [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament (not publicly accessible)](https://people-and-parliament.hum.uu.nl/).
If you are interested in using I-analyzer, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).

I-analyzer does not have an "upload data" option (yet!). If you are interested in using I-analyzer as a way to publish your dataset, or to make it easier to search and analyse, you can go about this in two ways:

Expand Down
4 changes: 4 additions & 0 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from typing import Dict
from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available

def primary_mapping_type(es_mapping: Dict) -> str:
    '''
    Look up the top-level type of an Elasticsearch field mapping.

    Returns the value stored under the `'type'` key of `es_mapping`, or
    `None` when the mapping has no such key.
    '''
    mapping_type = es_mapping.get('type')
    return mapping_type

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
'''
Mapping for the main content field. Options:
Expand Down
147 changes: 147 additions & 0 deletions backend/addcorpus/json_corpora/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import pytest

@pytest.fixture()
def content_field_json():
    '''JSON definition of a `text_content` field named `content`, in English.'''
    options = {
        'search': True,
        'filter': 'none',
        'preview': True,
        'visualize': True,
        'sort': False,
        'hidden': False,
    }
    field = {
        'name': 'content',
        'display_name': 'Content',
        'description': 'Bla bla bla',
        'type': 'text_content',
        'language': 'en',
        'options': options,
        'extract': {'column': 'content'},
    }
    return field

@pytest.fixture()
def keyword_field_json():
    '''JSON definition of a `text_metadata` field named `author`.'''
    options = {
        'search': True,
        'filter': 'show',
        'preview': True,
        'visualize': True,
        'sort': False,
        'hidden': False,
    }
    field = {
        'name': 'author',
        'display_name': 'Author',
        'description': 'Author of the text',
        'type': 'text_metadata',
        'options': options,
        'extract': {'column': 'author'},
    }
    return field

@pytest.fixture()
def int_field_json():
    '''JSON definition of an `integer` field named `year`.'''
    options = {
        'search': False,
        'filter': 'show',
        'preview': False,
        'visualize': True,
        'sort': True,
        'hidden': False,
    }
    field = {
        'name': 'year',
        'display_name': 'Year',
        'description': 'Year in which the text was written',
        'type': 'integer',
        'options': options,
        'extract': {'column': 'year'},
    }
    return field

@pytest.fixture()
def float_field_json():
    '''JSON definition of a `float` field named `ocr_confidence`.'''
    options = {
        'search': False,
        'filter': 'hide',
        'preview': False,
        'visualize': False,
        'sort': False,
        'hidden': False,
    }
    field = {
        'name': 'ocr_confidence',
        'display_name': 'OCR confidence',
        'description': 'Confidence level of optical character recognition output',
        'type': 'float',
        'options': options,
        # the source column name differs from the field name here
        'extract': {'column': 'ocr'},
    }
    return field

@pytest.fixture()
def date_field_json():
    '''JSON definition of a `date` field named `date`.'''
    options = {
        'search': False,
        'filter': 'show',
        'preview': True,
        'visualize': True,
        'sort': True,
        'hidden': False,
    }
    field = {
        'name': 'date',
        'display_name': 'Date',
        'description': 'Date on which the text was written',
        'type': 'date',
        'options': options,
        'extract': {'column': 'date'},
    }
    return field

@pytest.fixture()
def boolean_field_json():
    '''JSON definition of a `boolean` field named `author_known`.'''
    options = {
        'search': False,
        'filter': 'show',
        'preview': False,
        'visualize': True,
        'sort': False,
        'hidden': False,
    }
    field = {
        'name': 'author_known',
        'display_name': 'Author known',
        'description': 'Whether the author of the text is known',
        'type': 'boolean',
        'options': options,
        'extract': {'column': 'author_known'},
    }
    return field

@pytest.fixture()
def geo_field_json():
    '''JSON definition of a `geo_point` field named `location`.'''
    options = {
        'search': False,
        'filter': 'none',
        'preview': False,
        'visualize': False,
        'sort': False,
        'hidden': False,
    }
    field = {
        'name': 'location',
        'display_name': 'Location',
        'description': 'Location where the text was published',
        'type': 'geo_point',
        'options': options,
        'extract': {'column': 'location'},
    }
    return field

@pytest.fixture(
    params=['content', 'keyword', 'int', 'float', 'date', 'boolean', 'geo']
)
def any_field_json(
    request, content_field_json, keyword_field_json, int_field_json, float_field_json,
    date_field_json, boolean_field_json, geo_field_json
):
    '''
    Parametrised fixture that provides one field definition per parameter.

    Each value in `params` selects the field JSON fixture of the matching kind,
    so a test requesting this fixture runs once for every field definition.
    '''
    # the fixture arguments are already-resolved dicts, keyed here by parameter
    definitions_by_kind = {
        'content': content_field_json,
        'keyword': keyword_field_json,
        'int': int_field_json,
        'float': float_field_json,
        'date': date_field_json,
        'boolean': boolean_field_json,
        'geo': geo_field_json,
    }
    return definitions_by_kind[request.param]
2 changes: 2 additions & 0 deletions backend/addcorpus/json_corpora/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Delimiter treated as the default for CSV source data; the JSON export only
# writes a delimiter option when the corpus uses a different one.
DEFAULT_CSV_DELIMITER = ','
# strftime pattern used to write dates in exported JSON, e.g. 2024-05-29
DATE_FORMAT = '%Y-%m-%d'
95 changes: 95 additions & 0 deletions backend/addcorpus/json_corpora/export_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from typing import Dict
from datetime import date
from addcorpus.models import Corpus, CorpusConfiguration, Field
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT
from addcorpus.es_mappings import primary_mapping_type

def export_json_corpus(corpus: Corpus) -> Dict:
    '''
    Export a corpus as a JSON-compatible dictionary.

    The result contains the corpus `name` and `id` (its primary key), the
    exported `meta` and `source_data` of its configuration, and a `fields`
    list with one entry per field of the configuration. An `options` entry is
    only included when the configuration has options to export.
    '''
    configuration = corpus.configuration
    exported = {
        'name': corpus.name,
        'id': corpus.pk,
        'meta': export_corpus_meta(configuration),
        'source_data': export_corpus_source_data(configuration),
    }

    corpus_options = export_corpus_options(configuration)
    if corpus_options:
        exported['options'] = corpus_options

    exported_fields = [
        export_json_field(corpus_field)
        for corpus_field in configuration.fields.all()
    ]
    exported['fields'] = exported_fields
    return exported

def export_corpus_meta(configuration: CorpusConfiguration) -> Dict:
    '''
    Export the metadata of a corpus configuration.

    Returns a dictionary with the title, category, description and languages
    of the configuration, plus a `date_range` with its minimum and maximum
    date formatted as strings.
    '''
    meta = {
        'title': configuration.title,
        'category': configuration.category,
        'description': configuration.description,
        'languages': configuration.languages,
    }
    earliest = export_date(configuration.min_date)
    latest = export_date(configuration.max_date)
    meta['date_range'] = {'min': earliest, 'max': latest}
    return meta

def export_date(date: date):
    '''Format `date` as a string according to `DATE_FORMAT`.'''
    # NOTE: the parameter shadows the imported `date` class inside this function
    formatted = date.strftime(DATE_FORMAT)
    return formatted

def export_corpus_source_data(configuration: CorpusConfiguration) -> Dict:
    '''
    Export the source data settings of a corpus configuration.

    The source data type is always `'csv'`. The delimiter is only written (as
    `options.delimiter`) when it differs from `DEFAULT_CSV_DELIMITER`.
    '''
    delimiter = configuration.source_data_delimiter
    if delimiter == DEFAULT_CSV_DELIMITER:
        return {'type': 'csv'}
    return {'type': 'csv', 'options': {'delimiter': delimiter}}

def export_corpus_options(configuration: CorpusConfiguration) -> Dict:
    '''
    Export the optional settings of a corpus configuration.

    Covers `document_context`, `default_sort` and `language_field`, in that
    order. A setting is only included when its value is truthy, so the result
    may be an empty dictionary.
    '''
    optional_settings = ('document_context', 'default_sort', 'language_field')
    candidates = (
        (setting, getattr(configuration, setting))
        for setting in optional_settings
    )
    return {setting: value for setting, value in candidates if value}


def export_json_field(field: Field) -> Dict:
    '''
    Export a single field as a JSON-compatible dictionary.

    The result holds the field's name, display name and description, followed
    by its exported type, options and extract settings. A `language` entry is
    appended only when the field has a truthy language value.
    '''
    exported = {
        attribute: getattr(field, attribute)
        for attribute in ('name', 'display_name', 'description')
    }
    exported['type'] = export_field_type(field)
    exported['options'] = export_field_options(field)
    exported['extract'] = export_field_extract(field)

    if field.language:
        exported['language'] = field.language
    return exported


def export_field_type(field: Field) -> str:
    '''
    Determine the JSON field type for a field.

    The display types `'text'` and `'keyword'` are both exported as
    `'text_metadata'`; any other display type is exported unchanged.
    '''
    display_type = field.display_type
    if display_type in ('text', 'keyword'):
        return 'text_metadata'
    return display_type


def export_field_options(field: Field) -> Dict:
    '''
    Export the `options` section of a field definition.

    Maps the field's attributes onto the JSON option names: `hidden`,
    `results_overview` (as `preview`), `searchable` (as `search`) and
    `sortable` (as `sort`). `filter` is derived by `export_field_filter`, and
    `visualize` is true when the field has at least one visualization.
    '''
    options = {'filter': export_field_filter(field)}
    options['hidden'] = field.hidden
    options['preview'] = field.results_overview
    options['search'] = field.searchable
    options['sort'] = field.sortable
    visualization_count = len(field.visualizations)
    options['visualize'] = visualization_count > 0
    return options


def export_field_filter(field: Field) -> str:
    '''
    Determine the `filter` option to export for a field.

    Returns:
        `'show'` when the field has a non-empty `search_filter`;
        `'hide'` when it has no search filter but its primary Elasticsearch
        mapping type is one of the filterable types and it is not displayed
        as a URL;
        `'none'` otherwise.
    '''
    if field.search_filter != {}:
        return 'show'
    # These are compared with the `type` of the Elasticsearch mapping, where
    # the integer type is called 'integer' - 'int' is not an Elasticsearch
    # field type and would never match.
    filterable_mappings = ('keyword', 'integer', 'float', 'date', 'boolean')
    has_filterable_mapping = primary_mapping_type(field.es_mapping) in filterable_mappings
    # URL fields are excluded even when their mapping type is filterable
    if has_filterable_mapping and field.display_type != 'url':
        return 'hide'
    return 'none'


def export_field_extract(field: Field) -> Dict:
    '''
    Export the `extract` section of a field definition.

    Returns a dictionary whose `column` entry is the field's `extract_column`.
    '''
    source_column = field.extract_column
    extract = {'column': source_column}
    return extract
Loading

0 comments on commit 69b5339

Please sign in to comment.